Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 16
-rw-r--r--  fs/9p/Makefile | 2
-rw-r--r--  fs/9p/acl.c | 395
-rw-r--r--  fs/9p/acl.h | 49
-rw-r--r--  fs/9p/fid.c | 1
-rw-r--r--  fs/9p/v9fs.c | 22
-rw-r--r--  fs/9p/v9fs.h | 50
-rw-r--r--  fs/9p/v9fs_vfs.h | 5
-rw-r--r--  fs/9p/vfs_addr.c | 30
-rw-r--r--  fs/9p/vfs_dentry.c | 6
-rw-r--r--  fs/9p/vfs_dir.c | 4
-rw-r--r--  fs/9p/vfs_file.c | 265
-rw-r--r--  fs/9p/vfs_inode.c | 779
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 824
-rw-r--r--  fs/9p/vfs_super.c | 44
-rw-r--r--  fs/9p/xattr.c | 54
-rw-r--r--  fs/9p/xattr.h | 6
-rw-r--r--  fs/Kconfig | 27
-rw-r--r--  fs/Kconfig.binfmt | 4
-rw-r--r--  fs/Makefile | 7
-rw-r--r--  fs/adfs/Kconfig | 1
-rw-r--r--  fs/adfs/dir.c | 12
-rw-r--r--  fs/adfs/super.c | 30
-rw-r--r--  fs/affs/affs.h | 1
-rw-r--r--  fs/affs/amigaffs.c | 4
-rw-r--r--  fs/affs/file.c | 4
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/affs/namei.c | 69
-rw-r--r--  fs/affs/super.c | 42
-rw-r--r--  fs/afs/cmservice.c | 12
-rw-r--r--  fs/afs/dir.c | 15
-rw-r--r--  fs/afs/flock.c | 5
-rw-r--r--  fs/afs/inode.c | 3
-rw-r--r--  fs/afs/internal.h | 5
-rw-r--r--  fs/afs/main.c | 13
-rw-r--r--  fs/afs/mntpt.c | 64
-rw-r--r--  fs/afs/rxrpc.c | 2
-rw-r--r--  fs/afs/security.c | 7
-rw-r--r--  fs/afs/server.c | 13
-rw-r--r--  fs/afs/super.c | 35
-rw-r--r--  fs/afs/vlocation.c | 14
-rw-r--r--  fs/afs/write.c | 19
-rw-r--r--  fs/aio.c | 45
-rw-r--r--  fs/anon_inodes.c | 39
-rw-r--r--  fs/autofs/Kconfig | 21
-rw-r--r--  fs/autofs/Makefile | 7
-rw-r--r--  fs/autofs/autofs_i.h | 165
-rw-r--r--  fs/autofs/dirhash.c | 250
-rw-r--r--  fs/autofs/init.c | 52
-rw-r--r--  fs/autofs/inode.c | 288
-rw-r--r--  fs/autofs/root.c | 643
-rw-r--r--  fs/autofs/symlink.c | 26
-rw-r--r--  fs/autofs/waitq.c | 205
-rw-r--r--  fs/autofs4/autofs_i.h | 134
-rw-r--r--  fs/autofs4/dev-ioctl.c | 3
-rw-r--r--  fs/autofs4/expire.c | 170
-rw-r--r--  fs/autofs4/init.c | 8
-rw-r--r--  fs/autofs4/inode.c | 115
-rw-r--r--  fs/autofs4/root.c | 790
-rw-r--r--  fs/autofs4/symlink.c | 3
-rw-r--r--  fs/autofs4/waitq.c | 40
-rw-r--r--  fs/bad_inode.c | 5
-rw-r--r--  fs/befs/endian.h | 16
-rw-r--r--  fs/befs/linuxvfs.c | 23
-rw-r--r--  fs/bfs/dir.c | 2
-rw-r--r--  fs/bfs/inode.c | 22
-rw-r--r--  fs/binfmt_elf.c | 25
-rw-r--r--  fs/binfmt_misc.c | 12
-rw-r--r--  fs/bio-integrity.c | 7
-rw-r--r--  fs/bio.c | 23
-rw-r--r--  fs/block_dev.c | 820
-rw-r--r--  fs/btrfs/acl.c | 21
-rw-r--r--  fs/btrfs/ctree.h | 2
-rw-r--r--  fs/btrfs/disk-io.c | 19
-rw-r--r--  fs/btrfs/export.c | 12
-rw-r--r--  fs/btrfs/extent-tree.c | 3
-rw-r--r--  fs/btrfs/extent_io.c | 1
-rw-r--r--  fs/btrfs/file.c | 113
-rw-r--r--  fs/btrfs/inode.c | 128
-rw-r--r--  fs/btrfs/super.c | 17
-rw-r--r--  fs/btrfs/volumes.c | 32
-rw-r--r--  fs/btrfs/volumes.h | 3
-rw-r--r--  fs/buffer.c | 73
-rw-r--r--  fs/cachefiles/daemon.c | 1
-rw-r--r--  fs/ceph/Kconfig | 14
-rw-r--r--  fs/ceph/Makefile | 34
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 80
-rw-r--r--  fs/ceph/armor.c | 103
-rw-r--r--  fs/ceph/auth.c | 259
-rw-r--r--  fs/ceph/auth.h | 92
-rw-r--r--  fs/ceph/auth_none.c | 131
-rw-r--r--  fs/ceph/auth_none.h | 30
-rw-r--r--  fs/ceph/auth_x.c | 687
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 65
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 110
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 3
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 72
-rw-r--r--  fs/ceph/ceph_fs.h | 728
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 609
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 412
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 415
-rw-r--r--  fs/ceph/decode.h | 196
-rw-r--r--  fs/ceph/dir.c | 166
-rw-r--r--  fs/ceph/export.c | 7
-rw-r--r--  fs/ceph/file.c | 264
-rw-r--r--  fs/ceph/inode.c | 119
-rw-r--r--  fs/ceph/ioctl.c | 77
-rw-r--r--  fs/ceph/ioctl.h | 2
-rw-r--r--  fs/ceph/locks.c | 117
-rw-r--r--  fs/ceph/mds_client.c | 236
-rw-r--r--  fs/ceph/mds_client.h | 55
-rw-r--r--  fs/ceph/mdsmap.c | 11
-rw-r--r--  fs/ceph/mdsmap.h | 62
-rw-r--r--  fs/ceph/messenger.c | 2277
-rw-r--r--  fs/ceph/messenger.h | 253
-rw-r--r--  fs/ceph/mon_client.c | 1018
-rw-r--r--  fs/ceph/mon_client.h | 121
-rw-r--r--  fs/ceph/msgpool.c | 64
-rw-r--r--  fs/ceph/msgpool.h | 25
-rw-r--r--  fs/ceph/msgr.h | 175
-rw-r--r--  fs/ceph/osd_client.c | 1539
-rw-r--r--  fs/ceph/osd_client.h | 167
-rw-r--r--  fs/ceph/osdmap.c | 1110
-rw-r--r--  fs/ceph/osdmap.h | 128
-rw-r--r--  fs/ceph/pagelist.c | 63
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 405
-rw-r--r--  fs/ceph/snap.c | 10
-rw-r--r--  fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c) | 82
-rw-r--r--  fs/ceph/super.c | 1197
-rw-r--r--  fs/ceph/super.h | 408
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 21
-rw-r--r--  fs/char_dev.c | 16
-rw-r--r--  fs/cifs/Kconfig | 12
-rw-r--r--  fs/cifs/Makefile | 6
-rw-r--r--  fs/cifs/README | 19
-rw-r--r--  fs/cifs/TODO | 2
-rw-r--r--  fs/cifs/cache.c | 16
-rw-r--r--  fs/cifs/cifs_debug.c | 44
-rw-r--r--  fs/cifs/cifs_debug.h | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 138
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 15
-rw-r--r--  fs/cifs/cifs_spnego.c | 10
-rw-r--r--  fs/cifs/cifs_unicode.c | 127
-rw-r--r--  fs/cifs/cifsacl.c | 106
-rw-r--r--  fs/cifs/cifsacl.h | 4
-rw-r--r--  fs/cifs/cifsencrypt.c | 583
-rw-r--r--  fs/cifs/cifsencrypt.h | 33
-rw-r--r--  fs/cifs/cifsfs.c | 223
-rw-r--r--  fs/cifs/cifsfs.h | 33
-rw-r--r--  fs/cifs/cifsglob.h | 255
-rw-r--r--  fs/cifs/cifspdu.h | 76
-rw-r--r--  fs/cifs/cifsproto.h | 60
-rw-r--r--  fs/cifs/cifssmb.c | 339
-rw-r--r--  fs/cifs/cn_cifs.h | 37
-rw-r--r--  fs/cifs/connect.c | 1274
-rw-r--r--  fs/cifs/dir.c | 306
-rw-r--r--  fs/cifs/dns_resolve.c | 2
-rw-r--r--  fs/cifs/file.c | 1495
-rw-r--r--  fs/cifs/fscache.c | 21
-rw-r--r--  fs/cifs/inode.c | 344
-rw-r--r--  fs/cifs/ioctl.c | 27
-rw-r--r--  fs/cifs/link.c | 417
-rw-r--r--  fs/cifs/md4.c | 205
-rw-r--r--  fs/cifs/md5.c | 366
-rw-r--r--  fs/cifs/md5.h | 38
-rw-r--r--  fs/cifs/misc.c | 246
-rw-r--r--  fs/cifs/netmisc.c | 8
-rw-r--r--  fs/cifs/ntlmssp.h | 15
-rw-r--r--  fs/cifs/readdir.c | 105
-rw-r--r--  fs/cifs/sess.c | 343
-rw-r--r--  fs/cifs/smbdes.c | 1
-rw-r--r--  fs/cifs/smbencrypt.c | 92
-rw-r--r--  fs/cifs/transport.c | 486
-rw-r--r--  fs/cifs/xattr.c | 115
-rw-r--r--  fs/coda/cache.c | 26
-rw-r--r--  fs/coda/cnode.c | 22
-rw-r--r--  fs/coda/coda_cache.h | 22
-rw-r--r--  fs/coda/coda_fs_i.h | 58
-rw-r--r--  fs/coda/coda_linux.c | 3
-rw-r--r--  fs/coda/coda_linux.h | 101
-rw-r--r--  fs/coda/dir.c | 184
-rw-r--r--  fs/coda/file.c | 34
-rw-r--r--  fs/coda/inode.c | 82
-rw-r--r--  fs/coda/pioctl.c | 31
-rw-r--r--  fs/coda/psdev.c | 46
-rw-r--r--  fs/coda/symlink.c | 7
-rw-r--r--  fs/coda/upcall.c | 94
-rw-r--r--  fs/compat.c | 84
-rw-r--r--  fs/compat_ioctl.c | 97
-rw-r--r--  fs/configfs/Kconfig | 4
-rw-r--r--  fs/configfs/configfs_internal.h | 5
-rw-r--r--  fs/configfs/dir.c | 22
-rw-r--r--  fs/configfs/inode.c | 9
-rw-r--r--  fs/configfs/mount.c | 9
-rw-r--r--  fs/cramfs/inode.c | 119
-rw-r--r--  fs/dcache.c | 1530
-rw-r--r--  fs/debugfs/file.c | 3
-rw-r--r--  fs/debugfs/inode.c | 9
-rw-r--r--  fs/devpts/inode.c | 32
-rw-r--r--  fs/direct-io.c | 12
-rw-r--r--  fs/dlm/Kconfig | 3
-rw-r--r--  fs/dlm/debug_fs.c | 3
-rw-r--r--  fs/dlm/lock.c | 3
-rw-r--r--  fs/dlm/lowcomms.c | 63
-rw-r--r--  fs/dlm/plock.c | 3
-rw-r--r--  fs/dlm/user.c | 3
-rw-r--r--  fs/ecryptfs/crypto.c | 30
-rw-r--r--  fs/ecryptfs/dentry.c | 9
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/file.c | 32
-rw-r--r--  fs/ecryptfs/inode.c | 54
-rw-r--r--  fs/ecryptfs/keystore.c | 71
-rw-r--r--  fs/ecryptfs/main.c | 185
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 35
-rw-r--r--  fs/ecryptfs/super.c | 15
-rw-r--r--  fs/efs/super.c | 17
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 66
-rw-r--r--  fs/exec.c | 218
-rw-r--r--  fs/exofs/dir.c | 4
-rw-r--r--  fs/exofs/file.c | 6
-rw-r--r--  fs/exofs/inode.c | 76
-rw-r--r--  fs/exofs/ios.c | 10
-rw-r--r--  fs/exofs/namei.c | 2
-rw-r--r--  fs/exofs/super.c | 19
-rw-r--r--  fs/exportfs/expfs.c | 31
-rw-r--r--  fs/ext2/acl.c | 11
-rw-r--r--  fs/ext2/acl.h | 2
-rw-r--r--  fs/ext2/balloc.c | 3
-rw-r--r--  fs/ext2/dir.c | 21
-rw-r--r--  fs/ext2/ext2.h | 1
-rw-r--r--  fs/ext2/inode.c | 15
-rw-r--r--  fs/ext2/namei.c | 4
-rw-r--r--  fs/ext2/super.c | 52
-rw-r--r--  fs/ext2/xattr.c | 12
-rw-r--r--  fs/ext3/acl.c | 11
-rw-r--r--  fs/ext3/acl.h | 2
-rw-r--r--  fs/ext3/balloc.c | 283
-rw-r--r--  fs/ext3/dir.c | 15
-rw-r--r--  fs/ext3/fsync.c | 3
-rw-r--r--  fs/ext3/ialloc.c | 11
-rw-r--r--  fs/ext3/inode.c | 30
-rw-r--r--  fs/ext3/ioctl.c | 22
-rw-r--r--  fs/ext3/namei.c | 140
-rw-r--r--  fs/ext3/resize.c | 78
-rw-r--r--  fs/ext3/super.c | 173
-rw-r--r--  fs/ext3/xattr.c | 2
-rw-r--r--  fs/ext4/Makefile | 2
-rw-r--r--  fs/ext4/acl.c | 11
-rw-r--r--  fs/ext4/acl.h | 2
-rw-r--r--  fs/ext4/balloc.c | 8
-rw-r--r--  fs/ext4/block_validity.c | 7
-rw-r--r--  fs/ext4/dir.c | 58
-rw-r--r--  fs/ext4/ext4.h | 208
-rw-r--r--  fs/ext4/ext4_extents.h | 73
-rw-r--r--  fs/ext4/ext4_jbd2.h | 2
-rw-r--r--  fs/ext4/extents.c | 455
-rw-r--r--  fs/ext4/file.c | 68
-rw-r--r--  fs/ext4/fsync.c | 90
-rw-r--r--  fs/ext4/ialloc.c | 137
-rw-r--r--  fs/ext4/inode.c | 683
-rw-r--r--  fs/ext4/ioctl.c | 24
-rw-r--r--  fs/ext4/mballoc.c | 598
-rw-r--r--  fs/ext4/migrate.c | 4
-rw-r--r--  fs/ext4/move_extent.c | 22
-rw-r--r--  fs/ext4/namei.c | 134
-rw-r--r--  fs/ext4/page-io.c | 428
-rw-r--r--  fs/ext4/resize.c | 119
-rw-r--r--  fs/ext4/super.c | 971
-rw-r--r--  fs/ext4/xattr.c | 32
-rw-r--r--  fs/ext4/xattr.h | 10
-rw-r--r--  fs/fat/fat.h | 3
-rw-r--r--  fs/fat/fatent.c | 3
-rw-r--r--  fs/fat/inode.c | 27
-rw-r--r--  fs/fat/misc.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 49
-rw-r--r--  fs/fat/namei_vfat.c | 79
-rw-r--r--  fs/fcntl.c | 64
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/file_table.c | 21
-rw-r--r--  fs/filesystems.c | 3
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 10
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 14
-rw-r--r--  fs/freevxfs/vxfs_super.c | 16
-rw-r--r--  fs/fs-writeback.c | 197
-rw-r--r--  fs/fs_struct.c | 49
-rw-r--r--  fs/fscache/operation.c | 2
-rw-r--r--  fs/fuse/control.c | 15
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/dev.c | 175
-rw-r--r--  fs/fuse/dir.c | 70
-rw-r--r--  fs/fuse/file.c | 140
-rw-r--r--  fs/fuse/fuse_i.h | 27
-rw-r--r--  fs/fuse/inode.c | 66
-rw-r--r--  fs/generic_acl.c | 20
-rw-r--r--  fs/gfs2/Kconfig | 2
-rw-r--r--  fs/gfs2/acl.c | 5
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 27
-rw-r--r--  fs/gfs2/bmap.c | 266
-rw-r--r--  fs/gfs2/bmap.h | 20
-rw-r--r--  fs/gfs2/dentry.c | 24
-rw-r--r--  fs/gfs2/dir.c | 31
-rw-r--r--  fs/gfs2/dir.h | 34
-rw-r--r--  fs/gfs2/export.c | 66
-rw-r--r--  fs/gfs2/file.c | 270
-rw-r--r--  fs/gfs2/glock.c | 109
-rw-r--r--  fs/gfs2/glock.h | 30
-rw-r--r--  fs/gfs2/glops.c | 7
-rw-r--r--  fs/gfs2/incore.h | 21
-rw-r--r--  fs/gfs2/inode.c | 234
-rw-r--r--  fs/gfs2/inode.h | 22
-rw-r--r--  fs/gfs2/lock_dlm.c | 19
-rw-r--r--  fs/gfs2/log.c | 19
-rw-r--r--  fs/gfs2/main.c | 6
-rw-r--r--  fs/gfs2/meta_io.c | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 139
-rw-r--r--  fs/gfs2/ops_inode.c | 112
-rw-r--r--  fs/gfs2/quota.c | 44
-rw-r--r--  fs/gfs2/recovery.c | 15
-rw-r--r--  fs/gfs2/rgrp.c | 184
-rw-r--r--  fs/gfs2/rgrp.h | 9
-rw-r--r--  fs/gfs2/super.c | 37
-rw-r--r--  fs/gfs2/sys.c | 22
-rw-r--r--  fs/gfs2/trace_gfs2.h | 3
-rw-r--r--  fs/gfs2/trans.h | 9
-rw-r--r--  fs/gfs2/xattr.c | 25
-rw-r--r--  fs/hfs/bfind.c | 4
-rw-r--r--  fs/hfs/btree.c | 2
-rw-r--r--  fs/hfs/btree.h | 2
-rw-r--r--  fs/hfs/dir.c | 2
-rw-r--r--  fs/hfs/hfs_fs.h | 21
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfs/mdb.c | 4
-rw-r--r--  fs/hfs/string.c | 17
-rw-r--r--  fs/hfs/super.c | 28
-rw-r--r--  fs/hfs/sysdep.c | 7
-rw-r--r--  fs/hfsplus/bfind.c | 23
-rw-r--r--  fs/hfsplus/bitmap.c | 23
-rw-r--r--  fs/hfsplus/bnode.c | 70
-rw-r--r--  fs/hfsplus/brec.c | 57
-rw-r--r--  fs/hfsplus/btree.c | 100
-rw-r--r--  fs/hfsplus/catalog.c | 127
-rw-r--r--  fs/hfsplus/dir.c | 237
-rw-r--r--  fs/hfsplus/extents.c | 275
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 207
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 6
-rw-r--r--  fs/hfsplus/inode.c | 272
-rw-r--r--  fs/hfsplus/ioctl.c | 157
-rw-r--r--  fs/hfsplus/options.c | 54
-rw-r--r--  fs/hfsplus/part_tbl.c | 130
-rw-r--r--  fs/hfsplus/super.c | 508
-rw-r--r--  fs/hfsplus/unicode.c | 72
-rw-r--r--  fs/hfsplus/wrapper.c | 196
-rw-r--r--  fs/hostfs/hostfs.h | 10
-rw-r--r--  fs/hostfs/hostfs_kern.c | 54
-rw-r--r--  fs/hostfs/hostfs_user.c | 14
-rw-r--r--  fs/hpfs/Kconfig | 1
-rw-r--r--  fs/hpfs/buffer.c | 4
-rw-r--r--  fs/hpfs/dentry.c | 32
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/hpfs_fn.h | 4
-rw-r--r--  fs/hpfs/inode.c | 2
-rw-r--r--  fs/hpfs/namei.c | 2
-rw-r--r--  fs/hpfs/super.c | 30
-rw-r--r--  fs/hppfs/hppfs.c | 18
-rw-r--r--  fs/hugetlbfs/inode.c | 37
-rw-r--r--  fs/inode.c | 543
-rw-r--r--  fs/internal.h | 12
-rw-r--r--  fs/ioctl.c | 18
-rw-r--r--  fs/ioprio.c | 13
-rw-r--r--  fs/isofs/dir.c | 6
-rw-r--r--  fs/isofs/inode.c | 217
-rw-r--r--  fs/isofs/isofs.h | 1
-rw-r--r--  fs/isofs/namei.c | 13
-rw-r--r--  fs/isofs/rock.c | 10
-rw-r--r--  fs/jbd/checkpoint.c | 4
-rw-r--r--  fs/jbd/commit.c | 40
-rw-r--r--  fs/jbd/journal.c | 44
-rw-r--r--  fs/jbd/recovery.c | 2
-rw-r--r--  fs/jbd/transaction.c | 8
-rw-r--r--  fs/jbd2/checkpoint.c | 13
-rw-r--r--  fs/jbd2/commit.c | 88
-rw-r--r--  fs/jbd2/journal.c | 68
-rw-r--r--  fs/jbd2/recovery.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 9
-rw-r--r--  fs/jffs2/acl.c | 5
-rw-r--r--  fs/jffs2/acl.h | 2
-rw-r--r--  fs/jffs2/build.c | 7
-rw-r--r--  fs/jffs2/compr.c | 6
-rw-r--r--  fs/jffs2/compr.h | 4
-rw-r--r--  fs/jffs2/compr_lzo.c | 4
-rw-r--r--  fs/jffs2/compr_rtime.c | 6
-rw-r--r--  fs/jffs2/compr_rubin.c | 11
-rw-r--r--  fs/jffs2/compr_zlib.c | 6
-rw-r--r--  fs/jffs2/dir.c | 7
-rw-r--r--  fs/jffs2/erase.c | 2
-rw-r--r--  fs/jffs2/fs.c | 26
-rw-r--r--  fs/jffs2/gc.c | 7
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 3
-rw-r--r--  fs/jffs2/nodelist.c | 8
-rw-r--r--  fs/jffs2/nodelist.h | 3
-rw-r--r--  fs/jffs2/scan.c | 12
-rw-r--r--  fs/jffs2/super.c | 27
-rw-r--r--  fs/jffs2/xattr.c | 12
-rw-r--r--  fs/jfs/acl.c | 8
-rw-r--r--  fs/jfs/jfs_acl.h | 2
-rw-r--r--  fs/jfs/jfs_imap.c | 2
-rw-r--r--  fs/jfs/jfs_logmgr.c | 23
-rw-r--r--  fs/jfs/jfs_mount.c | 4
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/namei.c | 71
-rw-r--r--  fs/jfs/super.c | 48
-rw-r--r--  fs/libfs.c | 116
-rw-r--r--  fs/lockd/Makefile | 6
-rw-r--r--  fs/lockd/clnt4xdr.c | 605
-rw-r--r--  fs/lockd/clntlock.c | 20
-rw-r--r--  fs/lockd/clntproc.c | 32
-rw-r--r--  fs/lockd/clntxdr.c | 627
-rw-r--r--  fs/lockd/host.c | 418
-rw-r--r--  fs/lockd/mon.c | 111
-rw-r--r--  fs/lockd/svc.c | 13
-rw-r--r--  fs/lockd/svc4proc.c | 23
-rw-r--r--  fs/lockd/svclock.c | 72
-rw-r--r--  fs/lockd/svcproc.c | 31
-rw-r--r--  fs/lockd/svcsubs.c | 9
-rw-r--r--  fs/lockd/xdr.c | 287
-rw-r--r--  fs/lockd/xdr4.c | 255
-rw-r--r--  fs/locks.c | 261
-rw-r--r--  fs/logfs/dev_bdev.c | 20
-rw-r--r--  fs/logfs/dev_mtd.c | 18
-rw-r--r--  fs/logfs/dir.c | 9
-rw-r--r--  fs/logfs/inode.c | 9
-rw-r--r--  fs/logfs/journal.c | 2
-rw-r--r--  fs/logfs/logfs.h | 22
-rw-r--r--  fs/logfs/readwrite.c | 3
-rw-r--r--  fs/logfs/super.c | 77
-rw-r--r--  fs/mbcache.c | 12
-rw-r--r--  fs/minix/inode.c | 18
-rw-r--r--  fs/minix/namei.c | 4
-rw-r--r--  fs/mpage.c | 49
-rw-r--r--  fs/namei.c | 1158
-rw-r--r--  fs/namespace.c | 367
-rw-r--r--  fs/ncpfs/dir.c | 253
-rw-r--r--  fs/ncpfs/file.c | 29
-rw-r--r--  fs/ncpfs/inode.c | 87
-rw-r--r--  fs/ncpfs/ioctl.c | 475
-rw-r--r--  fs/ncpfs/mmap.c | 4
-rw-r--r--  fs/ncpfs/ncp_fs.h | 98
-rw-r--r--  fs/ncpfs/ncp_fs_i.h | 29
-rw-r--r--  fs/ncpfs/ncp_fs_sb.h | 176
-rw-r--r--  fs/ncpfs/ncplib_kernel.c | 103
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 33
-rw-r--r--  fs/ncpfs/ncpsign_kernel.c | 11
-rw-r--r--  fs/ncpfs/ncpsign_kernel.h | 2
-rw-r--r--  fs/ncpfs/sock.c | 3
-rw-r--r--  fs/ncpfs/symlink.c | 4
-rw-r--r--  fs/nfs/Kconfig | 19
-rw-r--r--  fs/nfs/Makefile | 4
-rw-r--r--  fs/nfs/callback.c | 97
-rw-r--r--  fs/nfs/callback.h | 61
-rw-r--r--  fs/nfs/callback_proc.c | 332
-rw-r--r--  fs/nfs/callback_xdr.c | 142
-rw-r--r--  fs/nfs/client.c | 325
-rw-r--r--  fs/nfs/delegation.c | 379
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 1117
-rw-r--r--  fs/nfs/direct.c | 38
-rw-r--r--  fs/nfs/dns_resolve.c | 6
-rw-r--r--  fs/nfs/file.c | 89
-rw-r--r--  fs/nfs/getroot.c | 15
-rw-r--r--  fs/nfs/idmap.c | 211
-rw-r--r--  fs/nfs/inode.c | 80
-rw-r--r--  fs/nfs/internal.h | 34
-rw-r--r--  fs/nfs/mount_clnt.c | 91
-rw-r--r--  fs/nfs/namespace.c | 94
-rw-r--r--  fs/nfs/nfs2xdr.c | 1339
-rw-r--r--  fs/nfs/nfs3acl.c | 4
-rw-r--r--  fs/nfs/nfs3proc.c | 62
-rw-r--r--  fs/nfs/nfs3xdr.c | 2870
-rw-r--r--  fs/nfs/nfs4_fs.h | 15
-rw-r--r--  fs/nfs/nfs4filelayout.c | 280
-rw-r--r--  fs/nfs/nfs4filelayout.h | 94
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 453
-rw-r--r--  fs/nfs/nfs4proc.c | 704
-rw-r--r--  fs/nfs/nfs4renewd.c | 11
-rw-r--r--  fs/nfs/nfs4state.c | 335
-rw-r--r--  fs/nfs/nfs4xdr.c | 2033
-rw-r--r--  fs/nfs/nfsroot.c | 568
-rw-r--r--  fs/nfs/pagelist.c | 19
-rw-r--r--  fs/nfs/pnfs.c | 965
-rw-r--r--  fs/nfs/pnfs.h | 235
-rw-r--r--  fs/nfs/proc.c | 40
-rw-r--r--  fs/nfs/read.c | 5
-rw-r--r--  fs/nfs/super.c | 200
-rw-r--r--  fs/nfs/sysctl.c | 2
-rw-r--r--  fs/nfs/unlink.c | 259
-rw-r--r--  fs/nfs/write.c | 27
-rw-r--r--  fs/nfs_common/nfsacl.c | 54
-rw-r--r--  fs/nfsd/Kconfig | 12
-rw-r--r--  fs/nfsd/acl.h | 59
-rw-r--r--  fs/nfsd/export.c | 77
-rw-r--r--  fs/nfsd/idmap.h | 62
-rw-r--r--  fs/nfsd/nfs3proc.c | 8
-rw-r--r--  fs/nfsd/nfs3xdr.c | 6
-rw-r--r--  fs/nfsd/nfs4acl.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 1036
-rw-r--r--  fs/nfsd/nfs4idmap.c | 120
-rw-r--r--  fs/nfsd/nfs4proc.c | 66
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 776
-rw-r--r--  fs/nfsd/nfs4xdr.c | 133
-rw-r--r--  fs/nfsd/nfsctl.c | 39
-rw-r--r--  fs/nfsd/nfsd.h | 3
-rw-r--r--  fs/nfsd/nfsproc.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 7
-rw-r--r--  fs/nfsd/state.h | 68
-rw-r--r--  fs/nfsd/vfs.c | 112
-rw-r--r--  fs/nfsd/xdr4.h | 30
-rw-r--r--  fs/nilfs2/Makefile | 2
-rw-r--r--  fs/nilfs2/bmap.c | 69
-rw-r--r--  fs/nilfs2/bmap.h | 10
-rw-r--r--  fs/nilfs2/btnode.c | 20
-rw-r--r--  fs/nilfs2/cpfile.c | 72
-rw-r--r--  fs/nilfs2/cpfile.h | 4
-rw-r--r--  fs/nilfs2/dat.c | 92
-rw-r--r--  fs/nilfs2/dat.h | 4
-rw-r--r--  fs/nilfs2/dir.c | 3
-rw-r--r--  fs/nilfs2/export.h | 17
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/gcdat.c | 87
-rw-r--r--  fs/nilfs2/gcinode.c | 127
-rw-r--r--  fs/nilfs2/ifile.c | 62
-rw-r--r--  fs/nilfs2/ifile.h | 4
-rw-r--r--  fs/nilfs2/inode.c | 351
-rw-r--r--  fs/nilfs2/ioctl.c | 52
-rw-r--r--  fs/nilfs2/mdt.c | 317
-rw-r--r--  fs/nilfs2/mdt.h | 32
-rw-r--r--  fs/nilfs2/namei.c | 142
-rw-r--r--  fs/nilfs2/nilfs.h | 49
-rw-r--r--  fs/nilfs2/page.c | 139
-rw-r--r--  fs/nilfs2/page.h | 9
-rw-r--r--  fs/nilfs2/recovery.c | 21
-rw-r--r--  fs/nilfs2/sb.h | 18
-rw-r--r--  fs/nilfs2/segbuf.c | 3
-rw-r--r--  fs/nilfs2/segment.c | 147
-rw-r--r--  fs/nilfs2/segment.h | 10
-rw-r--r--  fs/nilfs2/sufile.c | 77
-rw-r--r--  fs/nilfs2/sufile.h | 6
-rw-r--r--  fs/nilfs2/super.c | 680
-rw-r--r--  fs/nilfs2/the_nilfs.c | 352
-rw-r--r--  fs/nilfs2/the_nilfs.h | 104
-rw-r--r--  fs/no-block.c | 1
-rw-r--r--  fs/notify/Kconfig | 2
-rw-r--r--  fs/notify/fanotify/Kconfig | 2
-rw-r--r--  fs/notify/fanotify/fanotify.c | 33
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 176
-rw-r--r--  fs/notify/fsnotify.c | 76
-rw-r--r--  fs/notify/inode_mark.c | 11
-rw-r--r--  fs/notify/inotify/inotify_user.c | 4
-rw-r--r--  fs/notify/vfsmount_mark.c | 6
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/file.c | 35
-rw-r--r--  fs/ntfs/inode.c | 9
-rw-r--r--  fs/ntfs/mft.c | 11
-rw-r--r--  fs/ntfs/super.c | 58
-rw-r--r--  fs/ocfs2/Kconfig | 5
-rw-r--r--  fs/ocfs2/acl.c | 8
-rw-r--r--  fs/ocfs2/acl.h | 2
-rw-r--r--  fs/ocfs2/alloc.c | 77
-rw-r--r--  fs/ocfs2/alloc.h | 4
-rw-r--r--  fs/ocfs2/aops.c | 94
-rw-r--r--  fs/ocfs2/aops.h | 29
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 787
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 14
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 286
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 5
-rw-r--r--  fs/ocfs2/cluster/ocfs2_nodemanager.h | 6
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 4
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 150
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 35
-rw-r--r--  fs/ocfs2/dcache.c | 52
-rw-r--r--  fs/ocfs2/dcache.h | 1
-rw-r--r--  fs/ocfs2/dir.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 76
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 109
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 212
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 410
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 40
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 132
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 20
-rw-r--r--  fs/ocfs2/dlmglue.c | 8
-rw-r--r--  fs/ocfs2/export.c | 6
-rw-r--r--  fs/ocfs2/file.c | 125
-rw-r--r--  fs/ocfs2/file.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 3
-rw-r--r--  fs/ocfs2/inode.h | 12
-rw-r--r--  fs/ocfs2/ioctl.c | 356
-rw-r--r--  fs/ocfs2/journal.c | 9
-rw-r--r--  fs/ocfs2/journal.h | 3
-rw-r--r--  fs/ocfs2/mmap.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 15
-rw-r--r--  fs/ocfs2/ocfs2.h | 68
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 46
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h | 95
-rw-r--r--  fs/ocfs2/refcounttree.c | 43
-rw-r--r--  fs/ocfs2/refcounttree.h | 7
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 6
-rw-r--r--  fs/ocfs2/suballoc.c | 18
-rw-r--r--  fs/ocfs2/super.c | 197
-rw-r--r--  fs/ocfs2/sysfile.c | 60
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/omfs/inode.c | 9
-rw-r--r--  fs/open.c | 17
-rw-r--r--  fs/openpromfs/inode.c | 17
-rw-r--r--  fs/partitions/check.c | 130
-rw-r--r--  fs/partitions/check.h | 3
-rw-r--r--  fs/partitions/efi.c | 25
-rw-r--r--  fs/partitions/ldm.c | 2
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/pipe.c | 45
-rw-r--r--  fs/pnode.c | 4
-rw-r--r--  fs/posix_acl.c | 17
-rw-r--r--  fs/proc/Kconfig | 10
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/array.c | 28
-rw-r--r--  fs/proc/base.c | 302
-rw-r--r--  fs/proc/consoles.c | 114
-rw-r--r--  fs/proc/devices.c | 4
-rw-r--r--  fs/proc/generic.c | 21
-rw-r--r--  fs/proc/inode.c | 17
-rw-r--r--  fs/proc/internal.h | 5
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/meminfo.c | 14
-rw-r--r--  fs/proc/page.c | 16
-rw-r--r--  fs/proc/proc_sysctl.c | 34
-rw-r--r--  fs/proc/proc_tty.c | 26
-rw-r--r--  fs/proc/root.c | 17
-rw-r--r--  fs/proc/softirqs.c | 8
-rw-r--r--  fs/proc/stat.c | 16
-rw-r--r--  fs/proc/task_mmu.c | 22
-rw-r--r--  fs/proc/task_nommu.c | 7
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/dir.c | 4
-rw-r--r--  fs/qnx4/inode.c | 24
-rw-r--r--  fs/qnx4/namei.c | 4
-rw-r--r--  fs/quota/Kconfig | 4
-rw-r--r--  fs/quota/dquot.c | 66
-rw-r--r--  fs/quota/quota.c | 41
-rw-r--r--  fs/quota/quota_tree.c | 9
-rw-r--r--  fs/ramfs/inode.c | 18
-rw-r--r--  fs/read_write.c | 91
-rw-r--r--  fs/reiserfs/Kconfig | 6
-rw-r--r--  fs/reiserfs/README | 2
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/reiserfs/inode.c | 27
-rw-r--r--  fs/reiserfs/ioctl.c | 14
-rw-r--r--  fs/reiserfs/journal.c | 128
-rw-r--r--  fs/reiserfs/namei.c | 2
-rw-r--r--  fs/reiserfs/prints.c | 4
-rw-r--r--  fs/reiserfs/super.c | 36
-rw-r--r--  fs/reiserfs/xattr.c | 25
-rw-r--r--  fs/reiserfs/xattr_acl.c | 6
-rw-r--r--  fs/romfs/super.c | 27
-rw-r--r--  fs/select.c | 8
-rw-r--r--  fs/seq_file.c | 8
-rw-r--r--  fs/signalfd.c | 11
-rw-r--r--  fs/smbfs/Kconfig | 55
-rw-r--r--  fs/smbfs/Makefile | 18
-rw-r--r--  fs/smbfs/cache.c | 208
-rw-r--r--  fs/smbfs/dir.c | 702
-rw-r--r--  fs/smbfs/file.c | 454
-rw-r--r--  fs/smbfs/getopt.c | 64
-rw-r--r--  fs/smbfs/getopt.h | 14
-rw-r--r--  fs/smbfs/inode.c | 839
-rw-r--r--  fs/smbfs/ioctl.c | 69
-rw-r--r--  fs/smbfs/proc.c | 3507
-rw-r--r--  fs/smbfs/proto.h | 87
-rw-r--r--  fs/smbfs/request.c | 818
-rw-r--r--  fs/smbfs/request.h | 70
-rw-r--r--  fs/smbfs/smb_debug.h | 34
-rw-r--r--  fs/smbfs/smbiod.c | 344
-rw-r--r--  fs/smbfs/sock.c | 386
-rw-r--r--  fs/smbfs/symlink.c | 68
-rw-r--r--  fs/splice.c | 67
-rw-r--r--  fs/squashfs/Kconfig | 18
-rw-r--r--  fs/squashfs/Makefile | 1
-rw-r--r--  fs/squashfs/block.c | 9
-rw-r--r--  fs/squashfs/cache.c | 1
-rw-r--r--  fs/squashfs/decompressor.c | 16
-rw-r--r--  fs/squashfs/decompressor.h | 9
-rw-r--r--  fs/squashfs/dir.c | 3
-rw-r--r--  fs/squashfs/fragment.c | 1
-rw-r--r--  fs/squashfs/id.c | 1
-rw-r--r--  fs/squashfs/lzo_wrapper.c | 1
-rw-r--r--  fs/squashfs/squashfs.h | 8
-rw-r--r--  fs/squashfs/squashfs_fs.h | 1
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 6
-rw-r--r--  fs/squashfs/super.c | 24
-rw-r--r--  fs/squashfs/xattr.c | 9
-rw-r--r--  fs/squashfs/xattr.h | 4
-rw-r--r--  fs/squashfs/xattr_id.c | 2
-rw-r--r--  fs/squashfs/xz_wrapper.c | 147
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 21
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/super.c | 141
-rw-r--r--  fs/sysfs/Kconfig | 2
-rw-r--r--  fs/sysfs/bin.c | 68
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/sysfs/group.c | 53
-rw-r--r--  fs/sysfs/inode.c | 12
-rw-r--r--  fs/sysfs/mount.c | 32
-rw-r--r--  fs/sysfs/sysfs.h | 3
-rw-r--r--  fs/sysv/inode.c | 9
-rw-r--r--  fs/sysv/namei.c | 6
-rw-r--r--  fs/sysv/super.c | 25
-rw-r--r--  fs/timerfd.c | 1
-rw-r--r--  fs/ubifs/commit.c | 4
-rw-r--r--  fs/ubifs/debug.c | 157
-rw-r--r--  fs/ubifs/debug.h | 4
-rw-r--r--  fs/ubifs/dir.c | 2
-rw-r--r--  fs/ubifs/file.c | 7
-rw-r--r--  fs/ubifs/gc.c | 82
-rw-r--r--  fs/ubifs/io.c | 20
-rw-r--r--  fs/ubifs/journal.c | 3
-rw-r--r--  fs/ubifs/key.h | 14
-rw-r--r--  fs/ubifs/log.c | 6
-rw-r--r--  fs/ubifs/lpt.c | 7
-rw-r--r--  fs/ubifs/lpt_commit.c | 3
-rw-r--r--  fs/ubifs/master.c | 3
-rw-r--r--  fs/ubifs/misc.h | 9
-rw-r--r--  fs/ubifs/recovery.c | 11
-rw-r--r--  fs/ubifs/replay.c | 20
-rw-r--r--  fs/ubifs/sb.c | 9
-rw-r--r--  fs/ubifs/scan.c | 6
-rw-r--r--  fs/ubifs/shrinker.c | 2
-rw-r--r--  fs/ubifs/super.c | 103
-rw-r--r--  fs/ubifs/tnc.c | 5
-rw-r--r--  fs/ubifs/ubifs.h | 23
-rw-r--r--  fs/udf/balloc.c | 3
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/file.c | 11
-rw-r--r--  fs/udf/ialloc.c | 21
-rw-r--r--  fs/udf/inode.c | 51
-rw-r--r--  fs/udf/namei.c | 109
-rw-r--r--  fs/udf/partition.c | 27
-rw-r--r--  fs/udf/super.c | 77
-rw-r--r--  fs/udf/symlink.c | 12
-rw-r--r--  fs/udf/udf_i.h | 13
-rw-r--r--  fs/udf/udf_sb.h | 22
-rw-r--r--  fs/udf/udfdecl.h | 4
-rw-r--r--  fs/ufs/Kconfig | 1
-rw-r--r--  fs/ufs/namei.c | 2
-rw-r--r--  fs/ufs/super.c | 22
-rw-r--r--  fs/xfs/Kconfig | 1
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/sv.h | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 527
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 481
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 110
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h | 28
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 191
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 587
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 44
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 100
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 75
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 477
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 97
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h | 29
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 165
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 267
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 16
-rw-r--r--  fs/xfs/support/debug.c | 112
-rw-r--r--  fs/xfs/support/debug.h | 25
-rw-r--r--  fs/xfs/xfs_acl.h | 2
-rw-r--r--  fs/xfs/xfs_ag.h | 11
-rw-r--r--  fs/xfs/xfs_alloc.c | 365
-rw-r--r--  fs/xfs/xfs_alloc.h | 41
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_attr.c | 37
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_bmap.c | 190
-rw-r--r--  fs/xfs/xfs_bmap.h | 14
-rw-r--r--  fs/xfs/xfs_btree.c | 65
-rw-r--r--  fs/xfs/xfs_btree.h | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 194
-rw-r--r--  fs/xfs/xfs_buf_item.h | 11
-rw-r--r--  fs/xfs/xfs_da_btree.c | 2
-rw-r--r--  fs/xfs/xfs_dfrag.c | 13
-rw-r--r--  fs/xfs/xfs_dinode.h | 5
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_error.c | 34
-rw-r--r--  fs/xfs/xfs_error.h | 23
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 96
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 11
-rw-r--r--  fs/xfs/xfs_filestream.c | 8
-rw-r--r--  fs/xfs/xfs_fs.h | 7
-rw-r--r--  fs/xfs/xfs_fsops.c | 25
-rw-r--r--  fs/xfs/xfs_fsops.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_iget.c | 92
-rw-r--r--  fs/xfs/xfs_inode.c | 71
-rw-r--r--  fs/xfs/xfs_inode.h | 47
-rw-r--r--  fs/xfs/xfs_inode_item.c | 130
-rw-r--r--  fs/xfs/xfs_iomap.c | 238
-rw-r--r--  fs/xfs/xfs_iomap.h | 27
-rw-r--r--  fs/xfs/xfs_itable.c | 3
-rw-r--r--  fs/xfs/xfs_log.c | 759
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 264
-rw-r--r--  fs/xfs/xfs_log_priv.h | 127
-rw-r--r--  fs/xfs/xfs_log_recover.c | 647
-rw-r--r--  fs/xfs/xfs_mount.c | 330
-rw-r--r--  fs/xfs/xfs_mount.h | 23
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_quota.h | 20
-rw-r--r--  fs/xfs/xfs_refcache.h | 52
-rw-r--r--  fs/xfs/xfs_rename.c | 15
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 29
-rw-r--r--  fs/xfs/xfs_sb.h | 10
-rw-r--r--  fs/xfs/xfs_trans.c | 211
-rw-r--r--  fs/xfs/xfs_trans.h | 5
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 232
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 2
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 30
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 35
-rw-r--r--  fs/xfs/xfs_types.h | 2
-rw-r--r--  fs/xfs/xfs_utils.c | 9
-rw-r--r--  fs/xfs/xfs_utils.h | 3
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 126
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 6
870 files changed, 45428 insertions(+), 49732 deletions(-)
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4e..814ac4e213a8 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
 
 	  If unsure, say N.
 
+if 9P_FS
+
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
@@ -17,3 +19,17 @@ config 9P_FSCACHE
 	  Choose Y here to enable persistent, read-only local
 	  caching support for 9p clients using FS-Cache
 
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 91fba025fcbe..ab8c12780634 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
 	vfs_super.o \
 	vfs_inode.o \
+	vfs_inode_dotl.o \
 	vfs_addr.o \
 	vfs_file.o \
 	vfs_dir.o \
@@ -13,3 +14,4 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 000000000000..02a2cf616318
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs_vfs.h"
+#include "v9fs.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+		posix_acl_release(dacl);
+		posix_acl_release(pacl);
+	} else
+		retval = -EIO;
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p always caches the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
+{
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		/*
+		 * Not in access = client mode; the server
+		 * does the acl checks for us
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+	/* Send a setxattr request to the server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+	if (dpacl)
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+	if (pacl)
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+	posix_acl_release(dpacl);
+	posix_acl_release(pacl);
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = acl;
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		retval = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+	}
+	*modep = mode;
+	return 0;
+cleanup:
+	posix_acl_release(acl);
+	return retval;
+
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			       const void *value, size_t size,
+			       int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	int retval;
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * Set the attribute on the remote, without even looking at the
+	 * xattr value. We leave it to the server to validate it.
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_set_acl(dentry, name,
+					   value, size, flags, type);
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+	if (value) {
+		/* update the cached acl value */
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			retval = posix_acl_valid(acl);
+			if (retval)
+				goto err_out;
+		}
+	} else
+		acl = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			retval = posix_acl_equiv_mode(acl, &mode);
+			if (retval < 0)
+				goto err_out;
+			else {
+				struct iattr iattr;
+				if (retval == 0) {
+					/*
+					 * The ACL can be represented
+					 * by the mode bits, so don't
+					 * update the ACL.
+					 */
+					acl = NULL;
+					value = NULL;
+					size = 0;
+				}
+				/* Update the mode bits */
+				iattr.ia_mode = ((mode & S_IALLUGO) |
+						 (inode->i_mode & ~S_IALLUGO));
+				iattr.ia_valid = ATTR_MODE;
+				/* FIXME: should we update ctime?
+				 * What if the following setxattr updates
+				 * the mode?
+				 */
+				v9fs_vfs_setattr_dotl(dentry, &iattr);
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode)) {
+			retval = acl ? -EINVAL : 0;
+			goto err_out;
+		}
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	if (!retval)
+		set_cached_acl(inode, type, acl);
+err_out:
+	posix_acl_release(acl);
+	return retval;
+}
+
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
+
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
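
An aside on the two-step fetch in __v9fs_get_acl() above: asking for the
xattr with a NULL buffer first makes the reply carry only the value's size,
so the client can size its allocation before fetching for real. The same
protocol is visible from userspace. A minimal sketch of the equivalent
getxattr(2) round trip against the "system.posix_acl_access" name that
POSIX_ACL_XATTR_ACCESS expands to (illustration only, not part of this
diff; error handling kept minimal):

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	/* step 1: a NULL buffer asks only for the value's size */
	ssize_t size = getxattr(path, "system.posix_acl_access", NULL, 0);

	if (size < 0) {
		perror("getxattr");	/* ENODATA: no ACL beyond mode bits */
		return 1;
	}
	void *value = malloc(size);
	if (!value)
		return 1;
	/* step 2: fetch the value into a buffer of the reported size */
	size = getxattr(path, "system.posix_acl_access", value, size);
	printf("ACL xattr is %zd bytes\n", size);
	free(value);
	return 0;
}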
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 000000000000..7ef3ac9f6d95
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
+extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+			       struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+			 struct posix_acl **dpacl, struct posix_acl **pacl);
+#else
+#define v9fs_check_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+	return 0;
+}
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+				      struct posix_acl *dpacl,
+				      struct posix_acl *pacl)
+{
+	return 0;
+}
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+				struct posix_acl **dpacl,
+				struct posix_acl **pacl)
+{
+	return 0;
+}
+
+#endif
+#endif /* FS_9P_ACL_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6406f896bf95..b00223c99d70 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	switch (access) {
 	case V9FS_ACCESS_SINGLE:
 	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
 		uid = current_fsuid();
 		any = 0;
 		break;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 38dc0e067599..2f77cd33ba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->flags |= V9FS_ACCESS_USER;
 			else if (strcmp(s, "any") == 0)
 				v9ses->flags |= V9FS_ACCESS_ANY;
-			else {
+			else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+				v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "access=client option not supported\n");
+				kfree(s);
+				ret = -EINVAL;
+				goto free_and_return;
+#endif
+			} else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
 				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0')
@@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
+	if (!v9fs_proto_dotl(v9ses) &&
+	    ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACCESS_CLIENT only for dotl.
+		 * Fall back to ACCESS_USER
+		 */
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_USER;
+	}
+	/*FIXME !! */
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
 	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 	    ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
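
An aside: the access=client string parsed above arrives via the opaque data
argument of mount(2), like any other 9p option, and per the fallback just
added it is silently downgraded to access=user if the session did not
negotiate 9P2000.L. A minimal sketch of a mount call that would exercise
this path (illustration only, not part of the diff; the server address and
mount point are made up):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* hypothetical 9P2000.L server reachable over TCP */
	if (mount("10.0.0.1", "/mnt/9p", "9p", 0,
		  "trans=tcp,version=9p2000.L,access=client") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}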
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4c963c9fc41f..c4b5d8864f0d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
  *
  * Session flags reflect options selected by users at mount time
  */
+#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+			 V9FS_ACCESS_USER |   \
+			 V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
 	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_ANY		= 0x0C,
-	V9FS_ACCESS_MASK	= 0x0C,
+	V9FS_ACCESS_CLIENT	= 0x10
 };
 
 /* possible values of ->cache */
@@ -109,11 +113,27 @@ struct v9fs_session_info {
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 				 char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+				struct p9_fid *fid,
+				struct super_block *sb);
 
-#define V9FS_MAGIC 0x01021997
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+				     struct p9_fid *fid,
+				     struct super_block *sb);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -136,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
 	return v9ses->flags & V9FS_PROTO_2000L;
 }
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 88418c419ea7..b789f8e597ec 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,8 +59,11 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
-void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b3..b7f2a8e3863e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
 	return 0;
 }
 
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of v9fs_direct_IO() in the address space ops vector
+ * allows open() with the O_DIRECT flag, which would have failed otherwise.
+ *
+ * In the non-cached mode, we shunt off direct read and write requests before
+ * the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode. Hence, when
+ * this routine is called through generic_file_aio_read(), the read/write
+ * fails with an error.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+		       loff_t pos, unsigned long nr_segs)
+{
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+		   "off/no(%lld/%lu) EINVAL\n",
+		   iocb->ki_filp->f_path.dentry->d_name.name,
+		   (long long)pos, nr_segs);
+
+	return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
 	.readpage = v9fs_vfs_readpage,
 	.readpages = v9fs_vfs_readpages,
 	.releasepage = v9fs_release_page,
 	.invalidatepage = v9fs_invalidate_page,
 	.launder_page = v9fs_launder_page,
+	.direct_IO = v9fs_direct_IO,
 };
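
An aside: the effect of the v9fs_direct_IO() stub is observable from
userspace. On a cached v9fs mount, open() with O_DIRECT now succeeds, and
the subsequent read is expected to fail with EINVAL when it reaches the
stub through generic_file_aio_read(). A minimal probe (illustration only,
not part of the diff; the path is made up):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char *buf;
	int fd = open("/mnt/9p/file", O_RDONLY | O_DIRECT);

	if (fd < 0) {		/* would fail outright without .direct_IO */
		perror("open");
		return 1;
	}
	if (posix_memalign((void **)&buf, 4096, 4096))
		return 1;
	if (read(fd, buf, 4096) < 0)
		perror("read");	/* expected: EINVAL in cached mode */
	free(buf);
	close(fd);
	return 0;
}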
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f3933..233b7d4ffe5e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
  *
  */
 
-static int v9fs_dentry_delete(struct dentry *dentry)
+static int v9fs_dentry_delete(const struct dentry *dentry)
 {
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
 		   dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
  *
  */
 
-static int v9fs_cached_dentry_delete(struct dentry *dentry)
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry)
  *
  */
 
-void v9fs_dentry_release(struct dentry *dentry)
+static void v9fs_dentry_release(struct dentry *dentry)
 {
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 899f168fd19c..b84ebe8cefed 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	while (rdir->head < rdir->tail) {
 
 		err = p9dirent_read(rdir->buf + rdir->head,
-				    buflen - rdir->head, &curdirent,
+				    rdir->tail - rdir->head,
+				    &curdirent,
 				    fid->clnt->proto_version);
 		if (err < 0) {
 			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -314,4 +315,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
 	.readdir = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
+	.fsync = v9fs_file_fsync_dotl,
 };
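
An aside on the one-line fix above: it bounds p9dirent_read() by the bytes
actually buffered (rdir->tail - rdir->head) rather than the remaining buffer
capacity (buflen - rdir->head), which could let the parser run past the
valid data. The invariant, in a self-contained sketch (illustration only;
parse_one() is a hypothetical stand-in for p9dirent_read()):

#include <stddef.h>

struct ring {
	char buf[8192];		/* capacity: the old, wrong bound */
	size_t head;		/* next unparsed byte */
	size_t tail;		/* one past the last valid byte */
};

/* records are a 1-byte length plus payload; never read past len */
static int parse_one(const char *p, size_t len)
{
	if (len < 1 || (size_t)(unsigned char)p[0] + 1 > len)
		return -1;	/* short or invalid record */
	return (unsigned char)p[0] + 1;	/* bytes consumed */
}

static int drain(struct ring *r)
{
	while (r->head < r->tail) {
		/* right: bound by valid data, not by sizeof(r->buf) */
		int n = parse_one(r->buf + r->head, r->tail - r->head);
		if (n < 0)
			return -1;
		r->head += n;
	}
	return 0;
}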
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e97c92bd6f16..240c30674396 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -44,6 +45,7 @@
 #include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -92,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if (file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+		else if (file->f_op == &v9fs_file_operations_dotl)
+			file->f_op = &v9fs_cached_file_operations_dotl;
 
 #ifdef CONFIG_9P_FSCACHE
 		v9fs_cache_inode_set_cookie(inode, file);
@@ -130,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
130 return res; 134 return res;
131} 135}
132 136
137static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
138{
139 struct p9_flock flock;
140 struct p9_fid *fid;
141 uint8_t status;
142 int res = 0;
143 unsigned char fl_type;
144
145 fid = filp->private_data;
146 BUG_ON(fid == NULL);
147
148 if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
149 BUG();
150
151 res = posix_lock_file_wait(filp, fl);
152 if (res < 0)
153 goto out;
154
155 /* convert posix lock to p9 tlock args */
156 memset(&flock, 0, sizeof(flock));
157 flock.type = fl->fl_type;
158 flock.start = fl->fl_start;
159 if (fl->fl_end == OFFSET_MAX)
160 flock.length = 0;
161 else
162 flock.length = fl->fl_end - fl->fl_start + 1;
163 flock.proc_id = fl->fl_pid;
164 flock.client_id = utsname()->nodename;
165 if (IS_SETLKW(cmd))
166 flock.flags = P9_LOCK_FLAGS_BLOCK;
167
168 /*
169 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
170 * for lock request, keep on trying
171 */
172 for (;;) {
173 res = p9_client_lock_dotl(fid, &flock, &status);
174 if (res < 0)
175 break;
176
177 if (status != P9_LOCK_BLOCKED)
178 break;
179 if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
180 break;
181 schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
182 }
183
184 /* map 9p status to VFS status */
185 switch (status) {
186 case P9_LOCK_SUCCESS:
187 res = 0;
188 break;
189 case P9_LOCK_BLOCKED:
190 res = -EAGAIN;
191 break;
192 case P9_LOCK_ERROR:
193 case P9_LOCK_GRACE:
194 res = -ENOLCK;
195 break;
196 default:
197 BUG();
198 }
199
200 /*
201 * incase server returned error for lock request, revert
202 * it locally
203 */
204 if (res < 0 && fl->fl_type != F_UNLCK) {
205 fl_type = fl->fl_type;
206 fl->fl_type = F_UNLCK;
207 res = posix_lock_file_wait(filp, fl);
208 fl->fl_type = fl_type;
209 }
210out:
211 return res;
212}
213
214static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
215{
216 struct p9_getlock glock;
217 struct p9_fid *fid;
218 int res = 0;
219
220 fid = filp->private_data;
221 BUG_ON(fid == NULL);
222
223 posix_test_lock(filp, fl);
224 /*
225 * if we have a conflicting lock locally, no need to validate
226 * with server
227 */
228 if (fl->fl_type != F_UNLCK)
229 return res;
230
231 /* convert posix lock to p9 tgetlock args */
232 memset(&glock, 0, sizeof(glock));
233 glock.type = fl->fl_type;
234 glock.start = fl->fl_start;
235 if (fl->fl_end == OFFSET_MAX)
236 glock.length = 0;
237 else
238 glock.length = fl->fl_end - fl->fl_start + 1;
239 glock.proc_id = fl->fl_pid;
240 glock.client_id = utsname()->nodename;
241
242 res = p9_client_getlock_dotl(fid, &glock);
243 if (res < 0)
244 return res;
245 if (glock.type != F_UNLCK) {
246 fl->fl_type = glock.type;
247 fl->fl_start = glock.start;
248 if (glock.length == 0)
249 fl->fl_end = OFFSET_MAX;
250 else
251 fl->fl_end = glock.start + glock.length - 1;
252 fl->fl_pid = glock.proc_id;
253 } else
254 fl->fl_type = F_UNLCK;
255
256 return res;
257}
258
259/**
260 * v9fs_file_lock_dotl - lock a file (or directory)
261 * @filp: file to be locked
262 * @cmd: lock command
263 * @fl: file lock structure
264 *
265 */
266
267static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
268{
269 struct inode *inode = filp->f_path.dentry->d_inode;
270 int ret = -ENOLCK;
271
272 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
273 cmd, fl, filp->f_path.dentry->d_name.name);
274
275 /* No mandatory locks */
276 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
277 goto out_err;
278
279 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
280 filemap_write_and_wait(inode->i_mapping);
281 invalidate_mapping_pages(&inode->i_data, 0, -1);
282 }
283
284 if (IS_SETLK(cmd) || IS_SETLKW(cmd))
285 ret = v9fs_file_do_lock(filp, cmd, fl);
286 else if (IS_GETLK(cmd))
287 ret = v9fs_file_getlock(filp, fl);
288 else
289 ret = -EINVAL;
290out_err:
291 return ret;
292}
293
294/**
295 * v9fs_file_flock_dotl - lock a file
296 * @filp: file to be locked
297 * @cmd: lock command
298 * @fl: file lock structure
299 *
300 */
301
302static int v9fs_file_flock_dotl(struct file *filp, int cmd,
303 struct file_lock *fl)
304{
305 struct inode *inode = filp->f_path.dentry->d_inode;
306 int ret = -ENOLCK;
307
308 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
309 cmd, fl, filp->f_path.dentry->d_name.name);
310
311 /* No mandatory locks */
312 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
313 goto out_err;
314
315 if (!(fl->fl_flags & FL_FLOCK))
316 goto out_err;
317
318 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
319 filemap_write_and_wait(inode->i_mapping);
320 invalidate_mapping_pages(&inode->i_data, 0, -1);
321 }
322 /* Convert flock to posix lock */
323 fl->fl_owner = (fl_owner_t)filp;
324 fl->fl_start = 0;
325 fl->fl_end = OFFSET_MAX;
326 fl->fl_flags |= FL_POSIX;
327 fl->fl_flags ^= FL_FLOCK;
328
329	if (IS_SETLK(cmd) || IS_SETLKW(cmd))
330 ret = v9fs_file_do_lock(filp, cmd, fl);
331 else
332 ret = -EINVAL;
333out_err:
334 return ret;
335}
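
Again purely as an illustration: flock(2) requests arrive here with FL_FLOCK set and are rewritten above into whole-file POSIX locks before going out over 9P. A sketch, path hypothetical:

	#include <sys/file.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/9p/data", O_RDWR);

		if (fd < 0)
			return 1;
		/* becomes fl_start = 0, fl_end = OFFSET_MAX, owner = filp */
		if (flock(fd, LOCK_EX) == 0)
			flock(fd, LOCK_UN);
		close(fd);
		return 0;
	}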
336
133/** 337/**
134 * v9fs_file_readn - read from a file 338 * v9fs_file_readn - read from a file
135 * @filp: file pointer to read 339 * @filp: file pointer to read
@@ -219,7 +423,9 @@ static ssize_t
219v9fs_file_write(struct file *filp, const char __user * data, 423v9fs_file_write(struct file *filp, const char __user * data,
220 size_t count, loff_t * offset) 424 size_t count, loff_t * offset)
221{ 425{
222 int n, rsize, total = 0; 426 ssize_t retval;
427 size_t total = 0;
428 int n;
223 struct p9_fid *fid; 429 struct p9_fid *fid;
224 struct p9_client *clnt; 430 struct p9_client *clnt;
225 struct inode *inode = filp->f_path.dentry->d_inode; 431 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -232,14 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
232 fid = filp->private_data; 438 fid = filp->private_data;
233 clnt = fid->clnt; 439 clnt = fid->clnt;
234 440
235 rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ; 441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
236 444
237 do { 445 retval = -EINVAL;
238 if (count < rsize) 446 if ((ssize_t) count < 0)
239 rsize = count; 447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
240 451
241 n = p9_client_write(fid, NULL, data+total, origin+total, 452 do {
242 rsize); 453 n = p9_client_write(fid, NULL, data+total, origin+total, count);
243 if (n <= 0) 454 if (n <= 0)
244 break; 455 break;
245 count -= n; 456 count -= n;
@@ -258,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
258 } 469 }
259 470
260 if (n < 0) 471 if (n < 0)
261 return n; 472 retval = n;
262 473 else
263 return total; 474 retval = total;
475out:
476 return retval;
264} 477}
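
The rewritten v9fs_file_write loops on short writes from p9_client_write, accumulating progress in total and reporting a partial count when possible. The same pattern in user-space form, purely illustrative:

	#include <unistd.h>

	static ssize_t write_all(int fd, const char *buf, size_t count, off_t off)
	{
		size_t total = 0;

		while (total < count) {
			ssize_t n = pwrite(fd, buf + total, count - total, off + total);

			if (n <= 0)	/* error or no progress: report what we have */
				return total ? (ssize_t)total : n;
			total += n;
		}
		return total;
	}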
265 478
266static int v9fs_file_fsync(struct file *filp, int datasync) 479static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -278,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
278 return retval; 491 return retval;
279} 492}
280 493
494int v9fs_file_fsync_dotl(struct file *filp, int datasync)
495{
496 struct p9_fid *fid;
497 int retval;
498
499 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
500 filp, datasync);
501
502 fid = filp->private_data;
503
504 retval = p9_client_fsync(fid, datasync);
505 return retval;
506}
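
v9fs_file_fsync_dotl forwards the datasync flag straight to the server via p9_client_fsync; from user space the distinction is fsync(2) versus fdatasync(2). A sketch, path hypothetical:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/9p/log", O_WRONLY | O_APPEND);

		if (fd < 0)
			return 1;
		write(fd, "x", 1);
		fdatasync(fd);	/* datasync != 0: data only, metadata may be deferred */
		fsync(fd);	/* datasync == 0: data and metadata */
		close(fd);
		return 0;
	}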
507
281static const struct file_operations v9fs_cached_file_operations = { 508static const struct file_operations v9fs_cached_file_operations = {
282 .llseek = generic_file_llseek, 509 .llseek = generic_file_llseek,
283 .read = do_sync_read, 510 .read = do_sync_read,
@@ -290,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
290 .fsync = v9fs_file_fsync, 517 .fsync = v9fs_file_fsync,
291}; 518};
292 519
520static const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek,
522 .read = do_sync_read,
523 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write,
525 .open = v9fs_file_open,
526 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap,
530 .fsync = v9fs_file_fsync_dotl,
531};
532
293const struct file_operations v9fs_file_operations = { 533const struct file_operations v9fs_file_operations = {
294 .llseek = generic_file_llseek, 534 .llseek = generic_file_llseek,
295 .read = v9fs_file_read, 535 .read = v9fs_file_read,
@@ -307,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
307 .write = v9fs_file_write, 547 .write = v9fs_file_write,
308 .open = v9fs_file_open, 548 .open = v9fs_file_open,
309 .release = v9fs_dir_release, 549 .release = v9fs_dir_release,
310 .lock = v9fs_file_lock, 550 .lock = v9fs_file_lock_dotl,
551 .flock = v9fs_file_flock_dotl,
311 .mmap = generic_file_readonly_mmap, 552 .mmap = generic_file_readonly_mmap,
312 .fsync = v9fs_file_fsync, 553 .fsync = v9fs_file_fsync_dotl,
313}; 554};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..b76a40bdf4c2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -36,6 +36,7 @@
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/xattr.h> 38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
39#include <net/9p/9p.h> 40#include <net/9p/9p.h>
40#include <net/9p/client.h> 41#include <net/9p/client.h>
41 42
@@ -44,14 +45,12 @@
44#include "fid.h" 45#include "fid.h"
45#include "cache.h" 46#include "cache.h"
46#include "xattr.h" 47#include "xattr.h"
48#include "acl.h"
47 49
48static const struct inode_operations v9fs_dir_inode_operations; 50static const struct inode_operations v9fs_dir_inode_operations;
49static const struct inode_operations v9fs_dir_inode_operations_dotu; 51static const struct inode_operations v9fs_dir_inode_operations_dotu;
50static const struct inode_operations v9fs_dir_inode_operations_dotl;
51static const struct inode_operations v9fs_file_inode_operations; 52static const struct inode_operations v9fs_file_inode_operations;
52static const struct inode_operations v9fs_file_inode_operations_dotl;
53static const struct inode_operations v9fs_symlink_inode_operations; 53static const struct inode_operations v9fs_symlink_inode_operations;
54static const struct inode_operations v9fs_symlink_inode_operations_dotl;
55 54
56/** 55/**
57 * unixmode2p9mode - convert unix mode bits to plan 9 56 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -231,46 +230,18 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
231 * 230 *
232 */ 231 */
233 232
234void v9fs_destroy_inode(struct inode *inode) 233static void v9fs_i_callback(struct rcu_head *head)
235{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry);
236 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
237} 238}
238#endif
239
240/**
241 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
242 * new file system object. This checks the S_ISGID to determine the owning
243 * group of the new file system object.
244 */
245
246static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
247{
248 BUG_ON(dir_inode == NULL);
249
250 if (dir_inode->i_mode & S_ISGID) {
251 /* set_gid bit is set.*/
252 return dir_inode->i_gid;
253 }
254 return current_fsgid();
255}
256
257/**
258 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
259 * dir inode.
260 *
261 */
262 239
263static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) 240void v9fs_destroy_inode(struct inode *inode)
264{ 241{
265 struct dentry *dentry; 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
266
267 spin_lock(&dcache_lock);
268 /* Directory should have only one entry. */
269 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
270 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
271 spin_unlock(&dcache_lock);
272 return dentry;
273} 243}
244#endif
274 245
275/** 246/**
276 * v9fs_get_inode - helper function to setup an inode 247 * v9fs_get_inode - helper function to setup an inode
@@ -441,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
441#endif 412#endif
442} 413}
443 414
444static struct inode * 415struct inode *
445v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
446 struct super_block *sb) 417 struct super_block *sb)
447{ 418{
@@ -476,55 +447,6 @@ error:
476 return ERR_PTR(err); 447 return ERR_PTR(err);
477} 448}
478 449
479static struct inode *
480v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
481 struct super_block *sb)
482{
483 struct inode *ret = NULL;
484 int err;
485 struct p9_stat_dotl *st;
486
487 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
488 if (IS_ERR(st))
489 return ERR_CAST(st);
490
491 ret = v9fs_get_inode(sb, st->st_mode);
492 if (IS_ERR(ret)) {
493 err = PTR_ERR(ret);
494 goto error;
495 }
496
497 v9fs_stat2inode_dotl(st, ret);
498 ret->i_ino = v9fs_qid2ino(&st->qid);
499#ifdef CONFIG_9P_FSCACHE
500 v9fs_vcookie_set_qid(ret, &st->qid);
501 v9fs_cache_inode_get_cookie(ret);
502#endif
503 kfree(st);
504 return ret;
505error:
506 kfree(st);
507 return ERR_PTR(err);
508}
509
510/**
511 * v9fs_inode_from_fid - Helper routine to populate an inode by
512 * issuing a attribute request
513 * @v9ses: session information
514 * @fid: fid to issue attribute request for
515 * @sb: superblock on which to create inode
516 *
517 */
518static inline struct inode *
519v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
520 struct super_block *sb)
521{
522 if (v9fs_proto_dotl(v9ses))
523 return v9fs_inode_dotl(v9ses, fid, sb);
524 else
525 return v9fs_inode(v9ses, fid, sb);
526}
527
528/** 450/**
529 * v9fs_remove - helper function to remove files and directories 451 * v9fs_remove - helper function to remove files and directories
530 * @dir: directory inode that is being deleted 452 * @dir: directory inode that is being deleted
@@ -553,13 +475,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
553 return retval; 475 return retval;
554} 476}
555 477
556static int
557v9fs_open_created(struct inode *inode, struct file *file)
558{
559 return 0;
560}
561
562
563/** 478/**
564 * v9fs_create - Create a file 479 * v9fs_create - Create a file
565 * @v9ses: session information 480 * @v9ses: session information
@@ -622,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
622 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
623 goto error; 538 goto error;
624 } 539 }
625
626 if (v9ses->cache)
627 dentry->d_op = &v9fs_cached_dentry_operations;
628 else
629 dentry->d_op = &v9fs_dentry_operations;
630
631 d_instantiate(dentry, inode); 540 d_instantiate(dentry, inode);
632 err = v9fs_fid_add(dentry, fid); 541 err = v9fs_fid_add(dentry, fid);
633 if (err < 0) 542 if (err < 0)
@@ -646,121 +555,6 @@ error:
646} 555}
647 556
648/** 557/**
649 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
650 * @dir: directory inode that is being created
651 * @dentry: dentry that is being deleted
652 * @mode: create permissions
653 * @nd: path information
654 *
655 */
656
657static int
658v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
659 struct nameidata *nd)
660{
661 int err = 0;
662 char *name = NULL;
663 gid_t gid;
664 int flags;
665 struct v9fs_session_info *v9ses;
666 struct p9_fid *fid = NULL;
667 struct p9_fid *dfid, *ofid;
668 struct file *filp;
669 struct p9_qid qid;
670 struct inode *inode;
671
672 v9ses = v9fs_inode2v9ses(dir);
673 if (nd && nd->flags & LOOKUP_OPEN)
674 flags = nd->intent.open.flags - 1;
675 else
676 flags = O_RDWR;
677
678 name = (char *) dentry->d_name.name;
679 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
680 "mode:0x%x\n", name, flags, mode);
681
682 dfid = v9fs_fid_lookup(dentry->d_parent);
683 if (IS_ERR(dfid)) {
684 err = PTR_ERR(dfid);
685 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
686 return err;
687 }
688
689 /* clone a fid to use for creation */
690 ofid = p9_client_walk(dfid, 0, NULL, 1);
691 if (IS_ERR(ofid)) {
692 err = PTR_ERR(ofid);
693 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
694 return err;
695 }
696
697 gid = v9fs_get_fsgid_for_create(dir);
698 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
699 if (err < 0) {
700 P9_DPRINTK(P9_DEBUG_VFS,
701 "p9_client_open_dotl failed in creat %d\n",
702 err);
703 goto error;
704 }
705
706 /* No need to populate the inode if we are not opening the file AND
707 * not in cached mode.
708 */
709 if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
710 /* Not in cached mode. No need to populate inode with stat */
711 dentry->d_op = &v9fs_dentry_operations;
712 p9_client_clunk(ofid);
713 d_instantiate(dentry, NULL);
714 return 0;
715 }
716
717 /* Now walk from the parent so we can get an unopened fid. */
718 fid = p9_client_walk(dfid, 1, &name, 1);
719 if (IS_ERR(fid)) {
720 err = PTR_ERR(fid);
721 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
722 fid = NULL;
723 goto error;
724 }
725
726 /* instantiate inode and assign the unopened fid to dentry */
727 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode);
730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
731 goto error;
732 }
733 if (v9ses->cache)
734 dentry->d_op = &v9fs_cached_dentry_operations;
735 else
736 dentry->d_op = &v9fs_dentry_operations;
737 d_instantiate(dentry, inode);
738 err = v9fs_fid_add(dentry, fid);
739 if (err < 0)
740 goto error;
741
742 /* if we are opening a file, assign the open fid to the file */
743 if (nd && nd->flags & LOOKUP_OPEN) {
744 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
745 if (IS_ERR(filp)) {
746 p9_client_clunk(ofid);
747 return PTR_ERR(filp);
748 }
749 filp->private_data = ofid;
750 } else
751 p9_client_clunk(ofid);
752
753 return 0;
754
755error:
756 if (ofid)
757 p9_client_clunk(ofid);
758 if (fid)
759 p9_client_clunk(fid);
760 return err;
761}
762
763/**
764 * v9fs_vfs_create - VFS hook to create files 558 * v9fs_vfs_create - VFS hook to create files
765 * @dir: directory inode that is being created 559 * @dir: directory inode that is being created
766 * @dentry: dentry that is being deleted 560 * @dentry: dentry that is being deleted
@@ -800,7 +594,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
800 594
801 /* if we are opening a file, assign the open fid to the file */ 595 /* if we are opening a file, assign the open fid to the file */
802 if (nd && nd->flags & LOOKUP_OPEN) { 596 if (nd && nd->flags & LOOKUP_OPEN) {
803 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); 597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
804 if (IS_ERR(filp)) { 598 if (IS_ERR(filp)) {
805 err = PTR_ERR(filp); 599 err = PTR_ERR(filp);
806 goto error; 600 goto error;
@@ -850,83 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
850 return err; 644 return err;
851} 645}
852 646
853
854/**
855 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
856 * @dir: inode that is being unlinked
857 * @dentry: dentry that is being unlinked
858 * @mode: mode for new directory
859 *
860 */
861
862static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
863 int mode)
864{
865 int err;
866 struct v9fs_session_info *v9ses;
867 struct p9_fid *fid = NULL, *dfid = NULL;
868 gid_t gid;
869 char *name;
870 struct inode *inode;
871 struct p9_qid qid;
872 struct dentry *dir_dentry;
873
874 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
875 err = 0;
876 v9ses = v9fs_inode2v9ses(dir);
877
878 mode |= S_IFDIR;
879 dir_dentry = v9fs_dentry_from_dir_inode(dir);
880 dfid = v9fs_fid_lookup(dir_dentry);
881 if (IS_ERR(dfid)) {
882 err = PTR_ERR(dfid);
883 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
884 dfid = NULL;
885 goto error;
886 }
887
888 gid = v9fs_get_fsgid_for_create(dir);
889 if (gid < 0) {
890 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
891 goto error;
892 }
893
894 name = (char *) dentry->d_name.name;
895 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
896 if (err < 0)
897 goto error;
898
899 /* instantiate inode and assign the unopened fid to the dentry */
900 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
901 fid = p9_client_walk(dfid, 1, &name, 1);
902 if (IS_ERR(fid)) {
903 err = PTR_ERR(fid);
904 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
905 err);
906 fid = NULL;
907 goto error;
908 }
909
910 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
911 if (IS_ERR(inode)) {
912 err = PTR_ERR(inode);
913 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
914 err);
915 goto error;
916 }
917 dentry->d_op = &v9fs_cached_dentry_operations;
918 d_instantiate(dentry, inode);
919 err = v9fs_fid_add(dentry, fid);
920 if (err < 0)
921 goto error;
922 fid = NULL;
923 }
924error:
925 if (fid)
926 p9_client_clunk(fid);
927 return err;
928}
929
930/** 647/**
931 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode 648 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
932 * @dir: inode that is being walked from 649 * @dir: inode that is being walked from
@@ -935,7 +652,7 @@ error:
935 * 652 *
936 */ 653 */
937 654
938static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, 655struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
939 struct nameidata *nameidata) 656 struct nameidata *nameidata)
940{ 657{
941 struct super_block *sb; 658 struct super_block *sb;
@@ -979,17 +696,14 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
979 696
980 result = v9fs_fid_add(dentry, fid); 697 result = v9fs_fid_add(dentry, fid);
981 if (result < 0) 698 if (result < 0)
982 goto error; 699 goto error_iput;
983 700
984inst_out: 701inst_out:
985 if (v9ses->cache)
986 dentry->d_op = &v9fs_cached_dentry_operations;
987 else
988 dentry->d_op = &v9fs_dentry_operations;
989
990 d_add(dentry, inode); 702 d_add(dentry, inode);
991 return NULL; 703 return NULL;
992 704
705error_iput:
706 iput(inode);
993error: 707error:
994 p9_client_clunk(fid); 708 p9_client_clunk(fid);
995 709
@@ -1003,7 +717,7 @@ error:
1003 * 717 *
1004 */ 718 */
1005 719
1006static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) 720int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
1007{ 721{
1008 return v9fs_remove(i, d, 0); 722 return v9fs_remove(i, d, 0);
1009} 723}
@@ -1015,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
1015 * 729 *
1016 */ 730 */
1017 731
1018static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) 732int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
1019{ 733{
1020 return v9fs_remove(i, d, 1); 734 return v9fs_remove(i, d, 1);
1021} 735}
@@ -1029,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
1029 * 743 *
1030 */ 744 */
1031 745
1032static int 746int
1033v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1034 struct inode *new_dir, struct dentry *new_dentry) 748 struct inode *new_dir, struct dentry *new_dentry)
1035{ 749{
@@ -1136,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1136 return 0; 850 return 0;
1137} 851}
1138 852
1139static int
1140v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
1141 struct kstat *stat)
1142{
1143 int err;
1144 struct v9fs_session_info *v9ses;
1145 struct p9_fid *fid;
1146 struct p9_stat_dotl *st;
1147
1148 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1149 err = -EPERM;
1150 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1151 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
1152 return simple_getattr(mnt, dentry, stat);
1153
1154 fid = v9fs_fid_lookup(dentry);
1155 if (IS_ERR(fid))
1156 return PTR_ERR(fid);
1157
1158 /* Ask for all the fields in stat structure. Server will return
1159 * whatever it supports
1160 */
1161
1162 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
1163 if (IS_ERR(st))
1164 return PTR_ERR(st);
1165
1166 v9fs_stat2inode_dotl(st, dentry->d_inode);
1167 generic_fillattr(dentry->d_inode, stat);
1168 /* Change block size to what the server returned */
1169 stat->blksize = st->st_blksize;
1170
1171 kfree(st);
1172 return 0;
1173}
1174
1175/** 853/**
1176 * v9fs_vfs_setattr - set file metadata 854 * v9fs_vfs_setattr - set file metadata
1177 * @dentry: file whose metadata to set 855 * @dentry: file whose metadata to set
@@ -1231,58 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
1231} 909}
1232 910
1233/** 911/**
1234 * v9fs_vfs_setattr_dotl - set file metadata
1235 * @dentry: file whose metadata to set
1236 * @iattr: metadata assignment structure
1237 *
1238 */
1239
1240static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1241{
1242 int retval;
1243 struct v9fs_session_info *v9ses;
1244 struct p9_fid *fid;
1245 struct p9_iattr_dotl p9attr;
1246
1247 P9_DPRINTK(P9_DEBUG_VFS, "\n");
1248
1249 retval = inode_change_ok(dentry->d_inode, iattr);
1250 if (retval)
1251 return retval;
1252
1253 p9attr.valid = iattr->ia_valid;
1254 p9attr.mode = iattr->ia_mode;
1255 p9attr.uid = iattr->ia_uid;
1256 p9attr.gid = iattr->ia_gid;
1257 p9attr.size = iattr->ia_size;
1258 p9attr.atime_sec = iattr->ia_atime.tv_sec;
1259 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
1260 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
1261 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
1262
1263 retval = -EPERM;
1264 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1265 fid = v9fs_fid_lookup(dentry);
1266 if (IS_ERR(fid))
1267 return PTR_ERR(fid);
1268
1269 retval = p9_client_setattr(fid, &p9attr);
1270 if (retval < 0)
1271 return retval;
1272
1273 if ((iattr->ia_valid & ATTR_SIZE) &&
1274 iattr->ia_size != i_size_read(dentry->d_inode)) {
1275 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1276 if (retval)
1277 return retval;
1278 }
1279
1280 setattr_copy(dentry->d_inode, iattr);
1281 mark_inode_dirty(dentry->d_inode);
1282 return 0;
1283}
1284
1285/**
1286 * v9fs_stat2inode - populate an inode structure with mistat info 912 * v9fs_stat2inode - populate an inode structure with mistat info
1287 * @stat: Plan 9 metadata (mistat) structure 913 * @stat: Plan 9 metadata (mistat) structure
1288 * @inode: inode to populate 914 * @inode: inode to populate
@@ -1360,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1360} 986}
1361 987
1362/** 988/**
1363 * v9fs_stat2inode_dotl - populate an inode structure with stat info
1364 * @stat: stat structure
1365 * @inode: inode to populate
1366 * @sb: superblock of filesystem
1367 *
1368 */
1369
1370void
1371v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
1372{
1373
1374 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
1375 inode->i_atime.tv_sec = stat->st_atime_sec;
1376 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1377 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1378 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1379 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1380 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1381 inode->i_uid = stat->st_uid;
1382 inode->i_gid = stat->st_gid;
1383 inode->i_nlink = stat->st_nlink;
1384 inode->i_mode = stat->st_mode;
1385 inode->i_rdev = new_decode_dev(stat->st_rdev);
1386
1387 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
1388 init_special_inode(inode, inode->i_mode, inode->i_rdev);
1389
1390 i_size_write(inode, stat->st_size);
1391 inode->i_blocks = stat->st_blocks;
1392 } else {
1393 if (stat->st_result_mask & P9_STATS_ATIME) {
1394 inode->i_atime.tv_sec = stat->st_atime_sec;
1395 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1396 }
1397 if (stat->st_result_mask & P9_STATS_MTIME) {
1398 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1399 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1400 }
1401 if (stat->st_result_mask & P9_STATS_CTIME) {
1402 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1403 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1404 }
1405 if (stat->st_result_mask & P9_STATS_UID)
1406 inode->i_uid = stat->st_uid;
1407 if (stat->st_result_mask & P9_STATS_GID)
1408 inode->i_gid = stat->st_gid;
1409 if (stat->st_result_mask & P9_STATS_NLINK)
1410 inode->i_nlink = stat->st_nlink;
1411 if (stat->st_result_mask & P9_STATS_MODE) {
1412 inode->i_mode = stat->st_mode;
1413 if ((S_ISBLK(inode->i_mode)) ||
1414 (S_ISCHR(inode->i_mode)))
1415 init_special_inode(inode, inode->i_mode,
1416 inode->i_rdev);
1417 }
1418 if (stat->st_result_mask & P9_STATS_RDEV)
1419 inode->i_rdev = new_decode_dev(stat->st_rdev);
1420 if (stat->st_result_mask & P9_STATS_SIZE)
1421 i_size_write(inode, stat->st_size);
1422 if (stat->st_result_mask & P9_STATS_BLOCKS)
1423 inode->i_blocks = stat->st_blocks;
1424 }
1425 if (stat->st_result_mask & P9_STATS_GEN)
1426 inode->i_generation = stat->st_gen;
1427
1428 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
1429 * because the inode structure does not have fields for them.
1430 */
1431}
1432
1433/**
1434 * v9fs_qid2ino - convert qid into inode number 989 * v9fs_qid2ino - convert qid into inode number
1435 * @qid: qid to hash 990 * @qid: qid to hash
1436 * 991 *
@@ -1473,7 +1028,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1473 if (IS_ERR(fid)) 1028 if (IS_ERR(fid))
1474 return PTR_ERR(fid); 1029 return PTR_ERR(fid);
1475 1030
1476 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) 1031 if (!v9fs_proto_dotu(v9ses))
1477 return -EBADF; 1032 return -EBADF;
1478 1033
1479 st = p9_client_stat(fid); 1034 st = p9_client_stat(fid);
@@ -1536,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
1536 * 1091 *
1537 */ 1092 */
1538 1093
1539static void 1094void
1540v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) 1095v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1541{ 1096{
1542 char *s = nd_get_link(nd); 1097 char *s = nd_get_link(nd);
@@ -1580,99 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1580} 1135}
1581 1136
1582/** 1137/**
1583 * v9fs_vfs_symlink_dotl - helper function to create symlinks
1584 * @dir: directory inode containing symlink
1585 * @dentry: dentry for symlink
1586 * @symname: symlink data
1587 *
1588 * See Also: 9P2000.L RFC for more information
1589 *
1590 */
1591
1592static int
1593v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
1594 const char *symname)
1595{
1596 struct v9fs_session_info *v9ses;
1597 struct p9_fid *dfid;
1598 struct p9_fid *fid = NULL;
1599 struct inode *inode;
1600 struct p9_qid qid;
1601 char *name;
1602 int err;
1603 gid_t gid;
1604
1605 name = (char *) dentry->d_name.name;
1606 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
1607 dir->i_ino, name, symname);
1608 v9ses = v9fs_inode2v9ses(dir);
1609
1610 dfid = v9fs_fid_lookup(dentry->d_parent);
1611 if (IS_ERR(dfid)) {
1612 err = PTR_ERR(dfid);
1613 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1614 return err;
1615 }
1616
1617 gid = v9fs_get_fsgid_for_create(dir);
1618
1619 if (gid < 0) {
1620 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
1621 goto error;
1622 }
1623
1624 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
1625 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
1626
1627 if (err < 0) {
1628 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
1629 goto error;
1630 }
1631
1632 if (v9ses->cache) {
1633 /* Now walk from the parent so we can get an unopened fid. */
1634 fid = p9_client_walk(dfid, 1, &name, 1);
1635 if (IS_ERR(fid)) {
1636 err = PTR_ERR(fid);
1637 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1638 err);
1639 fid = NULL;
1640 goto error;
1641 }
1642
1643 /* instantiate inode and assign the unopened fid to dentry */
1644 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1645 if (IS_ERR(inode)) {
1646 err = PTR_ERR(inode);
1647 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1648 err);
1649 goto error;
1650 }
1651 dentry->d_op = &v9fs_cached_dentry_operations;
1652 d_instantiate(dentry, inode);
1653 err = v9fs_fid_add(dentry, fid);
1654 if (err < 0)
1655 goto error;
1656 fid = NULL;
1657 } else {
1658 /* Not in cached mode. No need to populate inode with stat */
1659 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
1660 if (IS_ERR(inode)) {
1661 err = PTR_ERR(inode);
1662 goto error;
1663 }
1664 dentry->d_op = &v9fs_dentry_operations;
1665 d_instantiate(dentry, inode);
1666 }
1667
1668error:
1669 if (fid)
1670 p9_client_clunk(fid);
1671
1672 return err;
1673}
1674
1675/**
1676 * v9fs_vfs_symlink - helper function to create symlinks 1138 * v9fs_vfs_symlink - helper function to create symlinks
1677 * @dir: directory inode containing symlink 1139 * @dir: directory inode containing symlink
1678 * @dentry: dentry for symlink 1140 * @dentry: dentry for symlink
@@ -1731,76 +1193,6 @@ clunk_fid:
1731} 1193}
1732 1194
1733/** 1195/**
1734 * v9fs_vfs_link_dotl - create a hardlink for dotl
1735 * @old_dentry: dentry for file to link to
1736 * @dir: inode destination for new link
1737 * @dentry: dentry for link
1738 *
1739 */
1740
1741static int
1742v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
1743 struct dentry *dentry)
1744{
1745 int err;
1746 struct p9_fid *dfid, *oldfid;
1747 char *name;
1748 struct v9fs_session_info *v9ses;
1749 struct dentry *dir_dentry;
1750
1751 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
1752 dir->i_ino, old_dentry->d_name.name,
1753 dentry->d_name.name);
1754
1755 v9ses = v9fs_inode2v9ses(dir);
1756 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1757 dfid = v9fs_fid_lookup(dir_dentry);
1758 if (IS_ERR(dfid))
1759 return PTR_ERR(dfid);
1760
1761 oldfid = v9fs_fid_lookup(old_dentry);
1762 if (IS_ERR(oldfid))
1763 return PTR_ERR(oldfid);
1764
1765 name = (char *) dentry->d_name.name;
1766
1767 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
1768
1769 if (err < 0) {
1770 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
1771 return err;
1772 }
1773
1774 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1775 /* Get the latest stat info from server. */
1776 struct p9_fid *fid;
1777 struct p9_stat_dotl *st;
1778
1779 fid = v9fs_fid_lookup(old_dentry);
1780 if (IS_ERR(fid))
1781 return PTR_ERR(fid);
1782
1783 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
1784 if (IS_ERR(st))
1785 return PTR_ERR(st);
1786
1787 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
1788
1789 kfree(st);
1790 } else {
1791 /* Caching disabled. No need to get upto date stat info.
1792 * This dentry will be released immediately. So, just i_count++
1793 */
1794 atomic_inc(&old_dentry->d_inode->i_count);
1795 }
1796
1797 dentry->d_op = old_dentry->d_op;
1798 d_instantiate(dentry, old_dentry->d_inode);
1799
1800 return err;
1801}
1802
1803/**
1804 * v9fs_vfs_mknod - create a special file 1196 * v9fs_vfs_mknod - create a special file
1805 * @dir: inode destination for new link 1197 * @dir: inode destination for new link
1806 * @dentry: dentry for file 1198 * @dentry: dentry for file
@@ -1845,100 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1845 return retval; 1237 return retval;
1846} 1238}
1847 1239
1848/**
1849 * v9fs_vfs_mknod_dotl - create a special file
1850 * @dir: inode destination for new link
1851 * @dentry: dentry for file
1852 * @mode: mode for creation
1853 * @rdev: device associated with special file
1854 *
1855 */
1856static int
1857v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
1858 dev_t rdev)
1859{
1860 int err;
1861 char *name;
1862 struct v9fs_session_info *v9ses;
1863 struct p9_fid *fid = NULL, *dfid = NULL;
1864 struct inode *inode;
1865 gid_t gid;
1866 struct p9_qid qid;
1867 struct dentry *dir_dentry;
1868
1869 P9_DPRINTK(P9_DEBUG_VFS,
1870 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1871 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
1872
1873 if (!new_valid_dev(rdev))
1874 return -EINVAL;
1875
1876 v9ses = v9fs_inode2v9ses(dir);
1877 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1878 dfid = v9fs_fid_lookup(dir_dentry);
1879 if (IS_ERR(dfid)) {
1880 err = PTR_ERR(dfid);
1881 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1882 dfid = NULL;
1883 goto error;
1884 }
1885
1886 gid = v9fs_get_fsgid_for_create(dir);
1887 if (gid < 0) {
1888 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
1889 goto error;
1890 }
1891
1892 name = (char *) dentry->d_name.name;
1893
1894 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
1895 if (err < 0)
1896 goto error;
1897
1898 /* instantiate inode and assign the unopened fid to the dentry */
1899 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1900 fid = p9_client_walk(dfid, 1, &name, 1);
1901 if (IS_ERR(fid)) {
1902 err = PTR_ERR(fid);
1903 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1904 err);
1905 fid = NULL;
1906 goto error;
1907 }
1908
1909 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1910 if (IS_ERR(inode)) {
1911 err = PTR_ERR(inode);
1912 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1913 err);
1914 goto error;
1915 }
1916 dentry->d_op = &v9fs_cached_dentry_operations;
1917 d_instantiate(dentry, inode);
1918 err = v9fs_fid_add(dentry, fid);
1919 if (err < 0)
1920 goto error;
1921 fid = NULL;
1922 } else {
1923 /*
1924 * Not in cached mode. No need to populate inode with stat.
1925 * socket syscall returns a fd, so we need instantiate
1926 */
1927 inode = v9fs_get_inode(dir->i_sb, mode);
1928 if (IS_ERR(inode)) {
1929 err = PTR_ERR(inode);
1930 goto error;
1931 }
1932 dentry->d_op = &v9fs_dentry_operations;
1933 d_instantiate(dentry, inode);
1934 }
1935
1936error:
1937 if (fid)
1938 p9_client_clunk(fid);
1939 return err;
1940}
1941
1942static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1240static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1943 .create = v9fs_vfs_create, 1241 .create = v9fs_vfs_create,
1944 .lookup = v9fs_vfs_lookup, 1242 .lookup = v9fs_vfs_lookup,
@@ -1953,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1953 .setattr = v9fs_vfs_setattr, 1251 .setattr = v9fs_vfs_setattr,
1954}; 1252};
1955 1253
1956static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1957 .create = v9fs_vfs_create_dotl,
1958 .lookup = v9fs_vfs_lookup,
1959 .link = v9fs_vfs_link_dotl,
1960 .symlink = v9fs_vfs_symlink_dotl,
1961 .unlink = v9fs_vfs_unlink,
1962 .mkdir = v9fs_vfs_mkdir_dotl,
1963 .rmdir = v9fs_vfs_rmdir,
1964 .mknod = v9fs_vfs_mknod_dotl,
1965 .rename = v9fs_vfs_rename,
1966 .getattr = v9fs_vfs_getattr_dotl,
1967 .setattr = v9fs_vfs_setattr_dotl,
1968 .setxattr = generic_setxattr,
1969 .getxattr = generic_getxattr,
1970 .removexattr = generic_removexattr,
1971 .listxattr = v9fs_listxattr,
1972
1973};
1974
1975static const struct inode_operations v9fs_dir_inode_operations = { 1254static const struct inode_operations v9fs_dir_inode_operations = {
1976 .create = v9fs_vfs_create, 1255 .create = v9fs_vfs_create,
1977 .lookup = v9fs_vfs_lookup, 1256 .lookup = v9fs_vfs_lookup,
@@ -1989,15 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
1989 .setattr = v9fs_vfs_setattr, 1268 .setattr = v9fs_vfs_setattr,
1990}; 1269};
1991 1270
1992static const struct inode_operations v9fs_file_inode_operations_dotl = {
1993 .getattr = v9fs_vfs_getattr_dotl,
1994 .setattr = v9fs_vfs_setattr_dotl,
1995 .setxattr = generic_setxattr,
1996 .getxattr = generic_getxattr,
1997 .removexattr = generic_removexattr,
1998 .listxattr = v9fs_listxattr,
1999};
2000
2001static const struct inode_operations v9fs_symlink_inode_operations = { 1271static const struct inode_operations v9fs_symlink_inode_operations = {
2002 .readlink = generic_readlink, 1272 .readlink = generic_readlink,
2003 .follow_link = v9fs_vfs_follow_link, 1273 .follow_link = v9fs_vfs_follow_link,
@@ -2006,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
2006 .setattr = v9fs_vfs_setattr, 1276 .setattr = v9fs_vfs_setattr,
2007}; 1277};
2008 1278
2009static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
2010 .readlink = generic_readlink,
2011 .follow_link = v9fs_vfs_follow_link,
2012 .put_link = v9fs_vfs_put_link,
2013 .getattr = v9fs_vfs_getattr_dotl,
2014 .setattr = v9fs_vfs_setattr_dotl,
2015 .setxattr = generic_setxattr,
2016 .getxattr = generic_getxattr,
2017 .removexattr = generic_removexattr,
2018 .listxattr = v9fs_listxattr,
2019};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 000000000000..fe3ffa9aace4
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
1/*
2 * linux/fs/9p/vfs_inode_dotl.c
3 *
4 * This file contains vfs inode ops for the 9P2000.L protocol.
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <linux/pagemap.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/inet.h>
34#include <linux/namei.h>
35#include <linux/idr.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
42
43#include "v9fs.h"
44#include "v9fs_vfs.h"
45#include "fid.h"
46#include "cache.h"
47#include "xattr.h"
48#include "acl.h"
49
50static int
51v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
52 dev_t rdev);
53
54/**
55 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
56 * new file system object. This checks S_ISGID on the parent directory to
57 * determine the owning group of the new file system object.
58 */
59
60static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
61{
62 BUG_ON(dir_inode == NULL);
63
64 if (dir_inode->i_mode & S_ISGID) {
65		/* set-gid bit is set. */
66 return dir_inode->i_gid;
67 }
68 return current_fsgid();
69}
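
The helper mirrors the usual BSD group-inheritance rule: if the parent directory is setgid, the new object takes the directory's gid, otherwise the caller's fsgid. A user-space sketch of the observable effect (paths hypothetical):

	#include <sys/stat.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat dir_st, file_st;
		int fd;

		if (stat("/mnt/9p/shared", &dir_st) < 0)
			return 1;
		fd = open("/mnt/9p/shared/new", O_CREAT | O_WRONLY, 0644);
		if (fd < 0 || fstat(fd, &file_st) < 0)
			return 1;
		close(fd);
		/* on a setgid dir, expect the file to inherit the dir's gid */
		if (dir_st.st_mode & S_ISGID)
			return file_st.st_gid == dir_st.st_gid ? 0 : 1;
		return 0;
	}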
70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb)
92{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st;
96
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st))
99 return ERR_CAST(st);
100
101 ret = v9fs_get_inode(sb, st->st_mode);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123}
124
125/**
126 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
127 * @dir: directory inode in which the new file is created
128 * @dentry: dentry of the file being created
129 * @omode: create permissions
130 * @nd: path information
131 *
132 */
133
134static int
135v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd)
137{
138 int err = 0;
139 char *name = NULL;
140 gid_t gid;
141 int flags;
142 mode_t mode;
143 struct v9fs_session_info *v9ses;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp;
147 struct p9_qid qid;
148 struct inode *inode;
149 struct posix_acl *pacl = NULL, *dacl = NULL;
150
151 v9ses = v9fs_inode2v9ses(dir);
152 if (nd && nd->flags & LOOKUP_OPEN)
153 flags = nd->intent.open.flags - 1;
154 else {
155 /*
156		 * A create call without LOOKUP_OPEN comes from
157		 * mknod of regular files, so use the mknod
158		 * operation.
159 */
160 return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
161 }
162
163 name = (char *) dentry->d_name.name;
164 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
165 "mode:0x%x\n", name, flags, omode);
166
167 dfid = v9fs_fid_lookup(dentry->d_parent);
168 if (IS_ERR(dfid)) {
169 err = PTR_ERR(dfid);
170 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
171 return err;
172 }
173
174 /* clone a fid to use for creation */
175 ofid = p9_client_walk(dfid, 0, NULL, 1);
176 if (IS_ERR(ofid)) {
177 err = PTR_ERR(ofid);
178 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
179 return err;
180 }
181
182 gid = v9fs_get_fsgid_for_create(dir);
183
184 mode = omode;
185 /* Update mode based on ACL value */
186 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
187 if (err) {
188 P9_DPRINTK(P9_DEBUG_VFS,
189 "Failed to get acl values in creat %d\n", err);
190 goto error;
191 }
192 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
193 if (err < 0) {
194 P9_DPRINTK(P9_DEBUG_VFS,
195 "p9_client_open_dotl failed in creat %d\n",
196 err);
197 goto error;
198 }
199
200 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1);
202 if (IS_ERR(fid)) {
203 err = PTR_ERR(fid);
204 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
205 fid = NULL;
206 goto error;
207 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
212 goto error;
213 }
214 d_instantiate(dentry, inode);
215 err = v9fs_fid_add(dentry, fid);
216 if (err < 0)
217 goto error;
218
219 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl);
221
222 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) {
225 p9_client_clunk(ofid);
226 return PTR_ERR(filp);
227 }
228 filp->private_data = ofid;
229 return 0;
230
231error:
232 if (ofid)
233 p9_client_clunk(ofid);
234 if (fid)
235 p9_client_clunk(fid);
236 return err;
237}
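
For reference, the flags that drive this hook: an open(2) with O_CREAT arrives with LOOKUP_OPEN set, so create and open happen atomically over one TLCREATE and the open fid lands in filp->private_data. A sketch, path hypothetical:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* O_CREAT via open(2): LOOKUP_OPEN set, one round trip
		 * creates and opens the file */
		int fd = open("/mnt/9p/newfile", O_CREAT | O_EXCL | O_RDWR, 0640);

		if (fd < 0)
			return 1;
		close(fd);
		return 0;
	}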
238
239/**
240 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
241 * @dir: inode of the parent directory
242 * @dentry: dentry of the new directory
243 * @omode: mode for new directory
244 *
245 */
246
247static int v9fs_vfs_mkdir_dotl(struct inode *dir,
248 struct dentry *dentry, int omode)
249{
250 int err;
251 struct v9fs_session_info *v9ses;
252 struct p9_fid *fid = NULL, *dfid = NULL;
253 gid_t gid;
254 char *name;
255 mode_t mode;
256 struct inode *inode;
257 struct p9_qid qid;
258 struct dentry *dir_dentry;
259 struct posix_acl *dacl = NULL, *pacl = NULL;
260
261 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
262 err = 0;
263 v9ses = v9fs_inode2v9ses(dir);
264
265 omode |= S_IFDIR;
266 if (dir->i_mode & S_ISGID)
267 omode |= S_ISGID;
268
269 dir_dentry = v9fs_dentry_from_dir_inode(dir);
270 dfid = v9fs_fid_lookup(dir_dentry);
271 if (IS_ERR(dfid)) {
272 err = PTR_ERR(dfid);
273 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
274 dfid = NULL;
275 goto error;
276 }
277
278 gid = v9fs_get_fsgid_for_create(dir);
279 mode = omode;
280 /* Update mode based on ACL value */
281 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
282 if (err) {
283 P9_DPRINTK(P9_DEBUG_VFS,
284 "Failed to get acl values in mkdir %d\n", err);
285 goto error;
286 }
287 name = (char *) dentry->d_name.name;
288 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
289 if (err < 0)
290 goto error;
291
292 /* instantiate inode and assign the unopened fid to the dentry */
293 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
294 fid = p9_client_walk(dfid, 1, &name, 1);
295 if (IS_ERR(fid)) {
296 err = PTR_ERR(fid);
297 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
298 err);
299 fid = NULL;
300 goto error;
301 }
302
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
307 err);
308 goto error;
309 }
310 d_instantiate(dentry, inode);
311 err = v9fs_fid_add(dentry, fid);
312 if (err < 0)
313 goto error;
314 fid = NULL;
315 } else {
316 /*
317		 * Not in cached mode. No need to populate the
318		 * inode with stat; we still need an inode so
319		 * that we can set the ACL on the dentry.
320 */
321 inode = v9fs_get_inode(dir->i_sb, mode);
322 if (IS_ERR(inode)) {
323 err = PTR_ERR(inode);
324 goto error;
325 }
326 d_instantiate(dentry, inode);
327 }
328 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl);
330
331error:
332 if (fid)
333 p9_client_clunk(fid);
334 return err;
335}
336
337static int
338v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
339 struct kstat *stat)
340{
341 int err;
342 struct v9fs_session_info *v9ses;
343 struct p9_fid *fid;
344 struct p9_stat_dotl *st;
345
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
350 return simple_getattr(mnt, dentry, stat);
351
352 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid))
354 return PTR_ERR(fid);
355
356	/* Ask for all the fields in the stat structure. The server will
357	 * return whatever it supports.
358 */
359
360 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
361 if (IS_ERR(st))
362 return PTR_ERR(st);
363
364 v9fs_stat2inode_dotl(st, dentry->d_inode);
365 generic_fillattr(dentry->d_inode, stat);
366 /* Change block size to what the server returned */
367 stat->blksize = st->st_blksize;
368
369 kfree(st);
370 return 0;
371}
372
373/**
374 * v9fs_vfs_setattr_dotl - set file metadata
375 * @dentry: file whose metadata to set
376 * @iattr: metadata assignment structure
377 *
378 */
379
380int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
381{
382 int retval;
383 struct v9fs_session_info *v9ses;
384 struct p9_fid *fid;
385 struct p9_iattr_dotl p9attr;
386
387 P9_DPRINTK(P9_DEBUG_VFS, "\n");
388
389 retval = inode_change_ok(dentry->d_inode, iattr);
390 if (retval)
391 return retval;
392
393 p9attr.valid = iattr->ia_valid;
394 p9attr.mode = iattr->ia_mode;
395 p9attr.uid = iattr->ia_uid;
396 p9attr.gid = iattr->ia_gid;
397 p9attr.size = iattr->ia_size;
398 p9attr.atime_sec = iattr->ia_atime.tv_sec;
399 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
400 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402
403 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode);
405 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid))
407 return PTR_ERR(fid);
408
409 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0)
411 return retval;
412
413 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) {
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419
420 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) {
423 /* We also want to update ACL when we update mode bits */
424 retval = v9fs_acl_chmod(dentry);
425 if (retval < 0)
426 return retval;
427 }
428 return 0;
429}
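
Each of these user-space calls lands in v9fs_vfs_setattr_dotl as one TSETATTR; a mode change additionally refreshes the ACL via v9fs_acl_chmod. Illustrative only, path hypothetical:

	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		truncate("/mnt/9p/file", 0);	/* ATTR_SIZE: setattr, then vmtruncate */
		chmod("/mnt/9p/file", 0600);	/* ATTR_MODE: setattr, then v9fs_acl_chmod */
		return 0;
	}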
430
431/**
432 * v9fs_stat2inode_dotl - populate an inode structure with stat info
433 * @stat: stat structure
434 * @inode: inode to populate
436 *
437 */
438
439void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{
442
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec;
445 inode->i_atime.tv_nsec = stat->st_atime_nsec;
446 inode->i_mtime.tv_sec = stat->st_mtime_sec;
447 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
448 inode->i_ctime.tv_sec = stat->st_ctime_sec;
449 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
450 inode->i_uid = stat->st_uid;
451 inode->i_gid = stat->st_gid;
452 inode->i_nlink = stat->st_nlink;
453 inode->i_mode = stat->st_mode;
454 inode->i_rdev = new_decode_dev(stat->st_rdev);
455
456 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
457 init_special_inode(inode, inode->i_mode, inode->i_rdev);
458
459 i_size_write(inode, stat->st_size);
460 inode->i_blocks = stat->st_blocks;
461 } else {
462 if (stat->st_result_mask & P9_STATS_ATIME) {
463 inode->i_atime.tv_sec = stat->st_atime_sec;
464 inode->i_atime.tv_nsec = stat->st_atime_nsec;
465 }
466 if (stat->st_result_mask & P9_STATS_MTIME) {
467 inode->i_mtime.tv_sec = stat->st_mtime_sec;
468 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
469 }
470 if (stat->st_result_mask & P9_STATS_CTIME) {
471 inode->i_ctime.tv_sec = stat->st_ctime_sec;
472 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
473 }
474 if (stat->st_result_mask & P9_STATS_UID)
475 inode->i_uid = stat->st_uid;
476 if (stat->st_result_mask & P9_STATS_GID)
477 inode->i_gid = stat->st_gid;
478 if (stat->st_result_mask & P9_STATS_NLINK)
479 inode->i_nlink = stat->st_nlink;
480 if (stat->st_result_mask & P9_STATS_MODE) {
481 inode->i_mode = stat->st_mode;
482 if ((S_ISBLK(inode->i_mode)) ||
483 (S_ISCHR(inode->i_mode)))
484 init_special_inode(inode, inode->i_mode,
485 inode->i_rdev);
486 }
487 if (stat->st_result_mask & P9_STATS_RDEV)
488 inode->i_rdev = new_decode_dev(stat->st_rdev);
489 if (stat->st_result_mask & P9_STATS_SIZE)
490 i_size_write(inode, stat->st_size);
491 if (stat->st_result_mask & P9_STATS_BLOCKS)
492 inode->i_blocks = stat->st_blocks;
493 }
494 if (stat->st_result_mask & P9_STATS_GEN)
495 inode->i_generation = stat->st_gen;
496
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them.
499 */
500}
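
As a hedged sketch of a caller (assumed, not taken from this patch; fid and inode are presumed in scope), the usual pattern is to request only the fields needed and let this helper apply whatever subset the server acknowledges in st_result_mask:

	struct p9_stat_dotl *st;

	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
	if (IS_ERR(st))
		return PTR_ERR(st);
	v9fs_stat2inode_dotl(st, inode);	/* applies only reported fields */
	kfree(st);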
501
502static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname)
505{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err;
513 gid_t gid;
514
515 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
517 dir->i_ino, name, symname);
518 v9ses = v9fs_inode2v9ses(dir);
519
520 dfid = v9fs_fid_lookup(dentry->d_parent);
521 if (IS_ERR(dfid)) {
522 err = PTR_ERR(dfid);
523 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
524 return err;
525 }
526
527 gid = v9fs_get_fsgid_for_create(dir);
528
529 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
530 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
531
532 if (err < 0) {
533 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
534 goto error;
535 }
536
537 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1);
540 if (IS_ERR(fid)) {
541 err = PTR_ERR(fid);
542 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
543 err);
544 fid = NULL;
545 goto error;
546 }
547
548 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
553 err);
554 goto error;
555 }
556 d_instantiate(dentry, inode);
557 err = v9fs_fid_add(dentry, fid);
558 if (err < 0)
559 goto error;
560 fid = NULL;
561 } else {
562 /* Not in cached mode. No need to populate inode with stat */
563 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
564 if (IS_ERR(inode)) {
565 err = PTR_ERR(inode);
566 goto error;
567 }
568 d_instantiate(dentry, inode);
569 }
570
571error:
572 if (fid)
573 p9_client_clunk(fid);
574
575 return err;
576}
577
578/**
579 * v9fs_vfs_link_dotl - create a hardlink for dotl
580 * @old_dentry: dentry for file to link to
581 * @dir: inode destination for new link
582 * @dentry: dentry for link
583 *
584 */
585
586static int
587v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry)
589{
590 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry;
595
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name,
598 dentry->d_name.name);
599
600 v9ses = v9fs_inode2v9ses(dir);
601 dir_dentry = v9fs_dentry_from_dir_inode(dir);
602 dfid = v9fs_fid_lookup(dir_dentry);
603 if (IS_ERR(dfid))
604 return PTR_ERR(dfid);
605
606 oldfid = v9fs_fid_lookup(old_dentry);
607 if (IS_ERR(oldfid))
608 return PTR_ERR(oldfid);
609
610 name = (char *) dentry->d_name.name;
611
612 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
613
614 if (err < 0) {
615 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
616 return err;
617 }
618
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */
621 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid))
626 return PTR_ERR(fid);
627
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636		/* Caching disabled. No need to get up to date stat info.
637		 * This dentry will be released immediately, so just hold
638		 * the inode.
639 */
640 ihold(old_dentry->d_inode);
641 }
642 d_instantiate(dentry, old_dentry->d_inode);
643
644 return err;
645}
646
647/**
648 * v9fs_vfs_mknod_dotl - create a special file
649 * @dir: inode of the parent directory
650 * @dentry: dentry for the new special file
651 * @omode: mode for creation
652 * @rdev: device associated with special file
653 *
654 */
655static int
656v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev)
658{
659 int err;
660 char *name;
661 mode_t mode;
662 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid;
667 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL;
669
670 P9_DPRINTK(P9_DEBUG_VFS,
671 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
672 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
673
674 if (!new_valid_dev(rdev))
675 return -EINVAL;
676
677 v9ses = v9fs_inode2v9ses(dir);
678 dir_dentry = v9fs_dentry_from_dir_inode(dir);
679 dfid = v9fs_fid_lookup(dir_dentry);
680 if (IS_ERR(dfid)) {
681 err = PTR_ERR(dfid);
682 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
683 dfid = NULL;
684 goto error;
685 }
686
687 gid = v9fs_get_fsgid_for_create(dir);
688 mode = omode;
689 /* Update mode based on ACL value */
690 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
691 if (err) {
692 P9_DPRINTK(P9_DEBUG_VFS,
693 "Failed to get acl values in mknod %d\n", err);
694 goto error;
695 }
696 name = (char *) dentry->d_name.name;
697
698 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
699 if (err < 0)
700 goto error;
701
702 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1);
705 if (IS_ERR(fid)) {
706 err = PTR_ERR(fid);
707 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
708 err);
709 fid = NULL;
710 goto error;
711 }
712
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
717 err);
718 goto error;
719 }
720 d_instantiate(dentry, inode);
721 err = v9fs_fid_add(dentry, fid);
722 if (err < 0)
723 goto error;
724 fid = NULL;
725 } else {
726 /*
727		 * Not in cached mode; no need to populate the inode with stat.
728		 * The socket(2) syscall returns an fd, so we must instantiate.
729 */
730 inode = v9fs_get_inode(dir->i_sb, mode);
731 if (IS_ERR(inode)) {
732 err = PTR_ERR(inode);
733 goto error;
734 }
735 d_instantiate(dentry, inode);
736 }
737 /* Now set the ACL based on the default value */
738 v9fs_set_create_acl(dentry, dacl, pacl);
739error:
740 if (fid)
741 p9_client_clunk(fid);
742 return err;
743}
744
745/**
746 * v9fs_vfs_follow_link_dotl - follow a symlink path
747 * @dentry: dentry for symlink
748 * @nd: nameidata
749 *
750 */
751
752static void *
753v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
754{
755 int retval;
756 struct p9_fid *fid;
757 char *link = __getname();
758 char *target;
759
760 P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
761
762 if (!link) {
763 link = ERR_PTR(-ENOMEM);
764 goto ndset;
765 }
766 fid = v9fs_fid_lookup(dentry);
767 if (IS_ERR(fid)) {
768 __putname(link);
769		link = ERR_CAST(fid);
770 goto ndset;
771 }
772 retval = p9_client_readlink(fid, &target);
773 if (!retval) {
774		strlcpy(link, target, PATH_MAX);
775 kfree(target);
776 goto ndset;
777 }
778 __putname(link);
779 link = ERR_PTR(retval);
780ndset:
781 nd_set_link(nd, link);
782 return NULL;
783}
784
785const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup,
788 .link = v9fs_vfs_link_dotl,
789 .symlink = v9fs_vfs_symlink_dotl,
790 .unlink = v9fs_vfs_unlink,
791 .mkdir = v9fs_vfs_mkdir_dotl,
792 .rmdir = v9fs_vfs_rmdir,
793 .mknod = v9fs_vfs_mknod_dotl,
794 .rename = v9fs_vfs_rename,
795 .getattr = v9fs_vfs_getattr_dotl,
796 .setattr = v9fs_vfs_setattr_dotl,
797 .setxattr = generic_setxattr,
798 .getxattr = generic_getxattr,
799 .removexattr = generic_removexattr,
800 .listxattr = v9fs_listxattr,
801 .check_acl = v9fs_check_acl,
802};
803
804const struct inode_operations v9fs_file_inode_operations_dotl = {
805 .getattr = v9fs_vfs_getattr_dotl,
806 .setattr = v9fs_vfs_setattr_dotl,
807 .setxattr = generic_setxattr,
808 .getxattr = generic_getxattr,
809 .removexattr = generic_removexattr,
810 .listxattr = v9fs_listxattr,
811 .check_acl = v9fs_check_acl,
812};
813
814const struct inode_operations v9fs_symlink_inode_operations_dotl = {
815 .readlink = generic_readlink,
816 .follow_link = v9fs_vfs_follow_link_dotl,
817 .put_link = v9fs_vfs_put_link,
818 .getattr = v9fs_vfs_getattr_dotl,
819 .setattr = v9fs_vfs_setattr_dotl,
820 .setxattr = generic_setxattr,
821 .getxattr = generic_getxattr,
822 .removexattr = generic_removexattr,
823 .listxattr = v9fs_listxattr,
824};
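
For readers who have not met the pre-3.x symlink API that v9fs_vfs_follow_link_dotl implements above, here is a minimal sketch of the ->follow_link/->put_link contract (v9fs pairs the function above with v9fs_vfs_put_link in the ops table). The example_* names are hypothetical; nd_set_link(), nd_get_link(), __getname() and __putname() are the real interfaces of this era:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/string.h>

static void *example_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *buf = __getname();	/* PATH_MAX-sized buffer */

	if (!buf) {
		/* an ERR_PTR stored here aborts the walk with that error */
		nd_set_link(nd, ERR_PTR(-ENOMEM));
		return NULL;
	}
	strlcpy(buf, "/some/target", PATH_MAX);
	nd_set_link(nd, buf);		/* the VFS walks this string next */
	return NULL;			/* cookie handed back to ->put_link */
}

static void example_put_link(struct dentry *dentry, struct nameidata *nd,
			     void *cookie)
{
	char *buf = nd_get_link(nd);

	if (!IS_ERR(buf))
		__putname(buf);		/* free what ->follow_link allocated */
}
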
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d12ba0ed3db..dbaabe3b8131 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,6 +39,7 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h> 41#include <linux/statfs.h>
42#include <linux/magic.h>
42#include <net/9p/9p.h> 43#include <net/9p/9p.h>
43#include <net/9p/client.h> 44#include <net/9p/client.h>
44 45
@@ -46,6 +47,7 @@
46#include "v9fs_vfs.h" 47#include "v9fs_vfs.h"
47#include "fid.h" 48#include "fid.h"
48#include "xattr.h" 49#include "xattr.h"
50#include "acl.h"
49 51
50static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; 52static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
51 53
@@ -66,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
66 * v9fs_fill_super - populate superblock with info 68 * v9fs_fill_super - populate superblock with info
67 * @sb: superblock 69 * @sb: superblock
68 * @v9ses: session information 70 * @v9ses: session information
69 * @flags: flags propagated from v9fs_get_sb() 71 * @flags: flags propagated from v9fs_mount()
70 * 72 *
71 */ 73 */
72 74
@@ -88,22 +90,25 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
88 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
89 MS_NOATIME; 91 MS_NOATIME;
90 92
93#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
95 sb->s_flags |= MS_POSIXACL;
96#endif
97
91 save_mount_options(sb, data); 98 save_mount_options(sb, data);
92} 99}
93 100
94/** 101/**
95 * v9fs_get_sb - mount a superblock 102 * v9fs_mount - mount a superblock
96 * @fs_type: file system type 103 * @fs_type: file system type
97 * @flags: mount flags 104 * @flags: mount flags
98 * @dev_name: device name that was mounted 105 * @dev_name: device name that was mounted
99 * @data: mount options 106 * @data: mount options
100 * @mnt: mountpoint record to be instantiated
101 * 107 *
102 */ 108 */
103 109
104static int v9fs_get_sb(struct file_system_type *fs_type, int flags, 110static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
105 const char *dev_name, void *data, 111 const char *dev_name, void *data)
106 struct vfsmount *mnt)
107{ 112{
108 struct super_block *sb = NULL; 113 struct super_block *sb = NULL;
109 struct inode *inode = NULL; 114 struct inode *inode = NULL;
@@ -117,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
117 122
118 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 123 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
119 if (!v9ses) 124 if (!v9ses)
120 return -ENOMEM; 125 return ERR_PTR(-ENOMEM);
121 126
122 fid = v9fs_session_init(v9ses, dev_name, data); 127 fid = v9fs_session_init(v9ses, dev_name, data);
123 if (IS_ERR(fid)) { 128 if (IS_ERR(fid)) {
@@ -136,6 +141,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
136 } 141 }
137 v9fs_fill_super(sb, v9ses, flags, data); 142 v9fs_fill_super(sb, v9ses, flags, data);
138 143
144 if (v9ses->cache)
145 sb->s_d_op = &v9fs_cached_dentry_operations;
146 else
147 sb->s_d_op = &v9fs_dentry_operations;
148
139 inode = v9fs_get_inode(sb, S_IFDIR | mode); 149 inode = v9fs_get_inode(sb, S_IFDIR | mode);
140 if (IS_ERR(inode)) { 150 if (IS_ERR(inode)) {
141 retval = PTR_ERR(inode); 151 retval = PTR_ERR(inode);
@@ -149,7 +159,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
149 goto release_sb; 159 goto release_sb;
150 } 160 }
151 sb->s_root = root; 161 sb->s_root = root;
152
153 if (v9fs_proto_dotl(v9ses)) { 162 if (v9fs_proto_dotl(v9ses)) {
154 struct p9_stat_dotl *st = NULL; 163 struct p9_stat_dotl *st = NULL;
155 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 164 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
@@ -174,19 +183,21 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
174 p9stat_free(st); 183 p9stat_free(st);
175 kfree(st); 184 kfree(st);
176 } 185 }
177 186 retval = v9fs_get_acl(inode, fid);
187 if (retval)
188 goto release_sb;
178 v9fs_fid_add(root, fid); 189 v9fs_fid_add(root, fid);
179 190
180 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 191 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
181 simple_set_mnt(mnt, sb); 192 return dget(sb->s_root);
182 return 0;
183 193
184clunk_fid: 194clunk_fid:
185 p9_client_clunk(fid); 195 p9_client_clunk(fid);
186close_session: 196close_session:
187 v9fs_session_close(v9ses); 197 v9fs_session_close(v9ses);
188 kfree(v9ses); 198 kfree(v9ses);
189 return retval; 199 return ERR_PTR(retval);
200
190release_sb: 201release_sb:
191 /* 202 /*
192 * we will do the session_close and root dentry release 203 * we will do the session_close and root dentry release
@@ -196,7 +207,7 @@ release_sb:
196 */ 207 */
197 p9_client_clunk(fid); 208 p9_client_clunk(fid);
198 deactivate_locked_super(sb); 209 deactivate_locked_super(sb);
199 return retval; 210 return ERR_PTR(retval);
200} 211}
201 212
202/** 213/**
@@ -211,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
211 222
212 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
213 224
214 if (s->s_root)
215 v9fs_dentry_release(s->s_root); /* clunk root */
216
217 kill_anon_super(s); 225 kill_anon_super(s);
218 226
219 v9fs_session_cancel(v9ses); 227 v9fs_session_cancel(v9ses);
@@ -249,7 +257,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
249 if (v9fs_proto_dotl(v9ses)) { 257 if (v9fs_proto_dotl(v9ses)) {
250 res = p9_client_statfs(fid, &rs); 258 res = p9_client_statfs(fid, &rs);
251 if (res == 0) { 259 if (res == 0) {
252 buf->f_type = rs.type; 260 buf->f_type = V9FS_MAGIC;
253 buf->f_bsize = rs.bsize; 261 buf->f_bsize = rs.bsize;
254 buf->f_blocks = rs.blocks; 262 buf->f_blocks = rs.blocks;
255 buf->f_bfree = rs.bfree; 263 buf->f_bfree = rs.bfree;
@@ -292,7 +300,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
292 300
293struct file_system_type v9fs_fs_type = { 301struct file_system_type v9fs_fs_type = {
294 .name = "9p", 302 .name = "9p",
295 .get_sb = v9fs_get_sb, 303 .mount = v9fs_mount,
296 .kill_sb = v9fs_kill_super, 304 .kill_sb = v9fs_kill_super,
297 .owner = THIS_MODULE, 305 .owner = THIS_MODULE,
298 .fs_flags = FS_RENAME_DOES_D_MOVE, 306 .fs_flags = FS_RENAME_DOES_D_MOVE,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f88e5c2dc873..d288773871b3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -21,30 +21,13 @@
21#include "fid.h" 21#include "fid.h"
22#include "xattr.h" 22#include "xattr.h"
23 23
24/* 24ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
25 * v9fs_xattr_get() 25 void *buffer, size_t buffer_size)
26 *
27 * Copy an extended attribute into the buffer
28 * provided, or compute the buffer size required.
29 * Buffer is NULL to compute the size of the buffer required.
30 *
31 * Returns a negative error number on failure, or the number of bytes
32 * used / required on success.
33 */
34ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t buffer_size)
36{ 26{
37 ssize_t retval; 27 ssize_t retval;
38 int msize, read_count; 28 int msize, read_count;
39 u64 offset = 0, attr_size; 29 u64 offset = 0, attr_size;
40 struct p9_fid *fid, *attr_fid; 30 struct p9_fid *attr_fid;
41
42 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
43 __func__, name, buffer_size);
44
45 fid = v9fs_fid_lookup(dentry);
46 if (IS_ERR(fid))
47 return PTR_ERR(fid);
48 31
49 attr_fid = p9_client_xattrwalk(fid, name, &attr_size); 32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
50 if (IS_ERR(attr_fid)) { 33 if (IS_ERR(attr_fid)) {
@@ -88,6 +71,31 @@ error:
88 71
89} 72}
90 73
74
75/*
76 * v9fs_xattr_get()
77 *
78 * Copy an extended attribute into the buffer
79 * provided, or compute the buffer size required.
80 * Buffer is NULL to compute the size of the buffer required.
81 *
82 * Returns a negative error number on failure, or the number of bytes
83 * used / required on success.
84 */
85ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
86 void *buffer, size_t buffer_size)
87{
88 struct p9_fid *fid;
89
90 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
91 __func__, name, buffer_size);
92 fid = v9fs_fid_lookup(dentry);
93 if (IS_ERR(fid))
94 return PTR_ERR(fid);
95
96 return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
97}
98
91/* 99/*
92 * v9fs_xattr_set() 100 * v9fs_xattr_set()
93 * 101 *
@@ -125,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
125 "p9_client_xattrcreate failed %d\n", retval); 133 "p9_client_xattrcreate failed %d\n", retval);
126 goto error; 134 goto error;
127 } 135 }
128 msize = fid->clnt->msize;; 136 msize = fid->clnt->msize;
129 while (value_len) { 137 while (value_len) {
130 if (value_len > (msize - P9_IOHDRSZ)) 138 if (value_len > (msize - P9_IOHDRSZ))
131 write_count = msize - P9_IOHDRSZ; 139 write_count = msize - P9_IOHDRSZ;
@@ -156,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
156 164
157const struct xattr_handler *v9fs_xattr_handlers[] = { 165const struct xattr_handler *v9fs_xattr_handlers[] = {
158 &v9fs_xattr_user_handler, 166 &v9fs_xattr_user_handler,
167#ifdef CONFIG_9P_FS_POSIX_ACL
168 &v9fs_xattr_acl_access_handler,
169 &v9fs_xattr_acl_default_handler,
170#endif
159 NULL 171 NULL
160}; 172};
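
Splitting v9fs_fid_xattr_get() out of v9fs_xattr_get() above lets callers that hold only a fid (notably the new POSIX ACL code) size and fetch an attribute without going through a dentry. A hedged sketch of the usual probe-then-fetch idiom; example_fetch_xattr is hypothetical, while v9fs_fid_xattr_get() is the interface added by this patch:

#include <linux/err.h>
#include <linux/slab.h>
#include "xattr.h"

static void *example_fetch_xattr(struct p9_fid *fid, const char *name)
{
	ssize_t size;
	void *value;

	size = v9fs_fid_xattr_get(fid, name, NULL, 0);	/* probe the size */
	if (size <= 0)
		return ERR_PTR(size ? size : -ENODATA);

	value = kzalloc(size, GFP_NOFS);
	if (!value)
		return ERR_PTR(-ENOMEM);

	size = v9fs_fid_xattr_get(fid, name, value, size);
	if (size < 0) {
		kfree(value);
		return ERR_PTR(size);
	}
	return value;
}
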
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 9ddf672ae5c4..eaa837c53bd5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -15,10 +15,16 @@
15#define FS_9P_XATTR_H 15#define FS_9P_XATTR_H
16 16
17#include <linux/xattr.h> 17#include <linux/xattr.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
18 20
19extern const struct xattr_handler *v9fs_xattr_handlers[]; 21extern const struct xattr_handler *v9fs_xattr_handlers[];
20extern struct xattr_handler v9fs_xattr_user_handler; 22extern struct xattr_handler v9fs_xattr_user_handler;
23extern const struct xattr_handler v9fs_xattr_acl_access_handler;
24extern const struct xattr_handler v9fs_xattr_acl_default_handler;
21 25
26extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
27 void *, size_t);
22extern ssize_t v9fs_xattr_get(struct dentry *, const char *, 28extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
23 void *, size_t); 29 void *, size_t);
24extern int v9fs_xattr_set(struct dentry *, const char *, 30extern int v9fs_xattr_set(struct dentry *, const char *,
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
30source "fs/reiserfs/Kconfig" 30source "fs/reiserfs/Kconfig"
31source "fs/jfs/Kconfig" 31source "fs/jfs/Kconfig"
32 32
33config FS_POSIX_ACL
34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
35#
36# NOTE: you can implement Posix ACLs without these helpers (XFS does).
37# Never use this symbol for ifdefs.
38#
39 bool
40 default n
41
42source "fs/xfs/Kconfig" 33source "fs/xfs/Kconfig"
43source "fs/gfs2/Kconfig" 34source "fs/gfs2/Kconfig"
44source "fs/ocfs2/Kconfig" 35source "fs/ocfs2/Kconfig"
@@ -47,8 +38,19 @@ source "fs/nilfs2/Kconfig"
47 38
48endif # BLOCK 39endif # BLOCK
49 40
41# Posix ACL utility routines
42#
43# Note: Posix ACLs can be implemented without these helpers. Never use
44# this symbol for ifdefs in core code.
45#
46config FS_POSIX_ACL
47 def_bool n
48
49config EXPORTFS
50 tristate
51
50config FILE_LOCKING 52config FILE_LOCKING
51 bool "Enable POSIX file locking API" if EMBEDDED 53 bool "Enable POSIX file locking API" if EXPERT
52 default y 54 default y
53 help 55 help
54 This option enables standard file locking support, required 56 This option enables standard file locking support, required
@@ -59,7 +61,6 @@ source "fs/notify/Kconfig"
59 61
60source "fs/quota/Kconfig" 62source "fs/quota/Kconfig"
61 63
62source "fs/autofs/Kconfig"
63source "fs/autofs4/Kconfig" 64source "fs/autofs4/Kconfig"
64source "fs/fuse/Kconfig" 65source "fs/fuse/Kconfig"
65 66
@@ -221,9 +222,6 @@ config LOCKD_V4
221 depends on FILE_LOCKING 222 depends on FILE_LOCKING
222 default y 223 default y
223 224
224config EXPORTFS
225 tristate
226
227config NFS_ACL_SUPPORT 225config NFS_ACL_SUPPORT
228 tristate 226 tristate
229 select FS_POSIX_ACL 227 select FS_POSIX_ACL
@@ -234,7 +232,6 @@ config NFS_COMMON
234 default y 232 default y
235 233
236source "net/sunrpc/Kconfig" 234source "net/sunrpc/Kconfig"
237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig" 235source "fs/ceph/Kconfig"
239source "fs/cifs/Kconfig" 236source "fs/cifs/Kconfig"
240source "fs/ncpfs/Kconfig" 237source "fs/ncpfs/Kconfig"
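
With FS_POSIX_ACL now a plain def_bool n, each filesystem that wants the helpers selects it from its own Kconfig entry. Roughly what the 9p entry added by this series looks like (the prompt and help text here are paraphrased, not quoted):

config 9P_FS_POSIX_ACL
	bool "9P POSIX Access Control Lists"
	depends on 9P_FS
	select FS_POSIX_ACL
	help
	  POSIX Access Control Lists (ACLs) support permissions for users
	  and groups beyond the owner/group/world scheme.

	  If you don't know what Access Control Lists are, say N.
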
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
42 42
43config CORE_DUMP_DEFAULT_ELF_HEADERS 43config CORE_DUMP_DEFAULT_ELF_HEADERS
44 bool "Write ELF core dumps with partial segments" 44 bool "Write ELF core dumps with partial segments"
45 default n 45 default y
46 depends on BINFMT_ELF && ELF_CORE 46 depends on BINFMT_ELF && ELF_CORE
47 help 47 help
48 ELF core dump files describe each memory mapping of the crashed 48 ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
60 inherited. See Documentation/filesystems/proc.txt for details. 60 inherited. See Documentation/filesystems/proc.txt for details.
61 61
62 This config option changes the default setting of coredump_filter 62 This config option changes the default setting of coredump_filter
63 seen at boot time. If unsure, say N. 63 seen at boot time. If unsure, say Y.
64 64
65config BINFMT_FLAT 65config BINFMT_FLAT
66 bool "Kernel support for flat binaries" 66 bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..a7f7cef0c0c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
29obj-$(CONFIG_AIO) += aio.o 29obj-$(CONFIG_AIO) += aio.o
30obj-$(CONFIG_FILE_LOCKING) += locks.o 30obj-$(CONFIG_FILE_LOCKING) += locks.o
31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
32 32obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
33nfsd-$(CONFIG_NFSD) := nfsctl.o
34obj-y += $(nfsd-y) $(nfsd-m)
35
36obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 33obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
37obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o 34obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
38obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o 35obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD) += nfsd/
91obj-$(CONFIG_LOCKD) += lockd/ 88obj-$(CONFIG_LOCKD) += lockd/
92obj-$(CONFIG_NLS) += nls/ 89obj-$(CONFIG_NLS) += nls/
93obj-$(CONFIG_SYSV_FS) += sysv/ 90obj-$(CONFIG_SYSV_FS) += sysv/
94obj-$(CONFIG_SMB_FS) += smbfs/
95obj-$(CONFIG_CIFS) += cifs/ 91obj-$(CONFIG_CIFS) += cifs/
96obj-$(CONFIG_NCP_FS) += ncpfs/ 92obj-$(CONFIG_NCP_FS) += ncpfs/
97obj-$(CONFIG_HPFS_FS) += hpfs/ 93obj-$(CONFIG_HPFS_FS) += hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/
104obj-$(CONFIG_AFFS_FS) += affs/ 100obj-$(CONFIG_AFFS_FS) += affs/
105obj-$(CONFIG_ROMFS_FS) += romfs/ 101obj-$(CONFIG_ROMFS_FS) += romfs/
106obj-$(CONFIG_QNX4FS_FS) += qnx4/ 102obj-$(CONFIG_QNX4FS_FS) += qnx4/
107obj-$(CONFIG_AUTOFS_FS) += autofs/
108obj-$(CONFIG_AUTOFS4_FS) += autofs4/ 103obj-$(CONFIG_AUTOFS4_FS) += autofs4/
109obj-$(CONFIG_ADFS_FS) += adfs/ 104obj-$(CONFIG_ADFS_FS) += adfs/
110obj-$(CONFIG_FUSE_FS) += fuse/ 105obj-$(CONFIG_FUSE_FS) += fuse/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
4 help 5 help
5 The Acorn Disc Filing System is the standard file system of the 6 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC 7 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de744..3b4a764ed780 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
201}; 201};
202 202
203static int 203static int
204adfs_hash(struct dentry *parent, struct qstr *qstr) 204adfs_hash(const struct dentry *parent, const struct inode *inode,
205 struct qstr *qstr)
205{ 206{
206 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; 207 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
207 const unsigned char *name; 208 const unsigned char *name;
@@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
237 * requirements of the underlying filesystem. 238 * requirements of the underlying filesystem.
238 */ 239 */
239static int 240static int
240adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) 241adfs_compare(const struct dentry *parent, const struct inode *pinode,
242 const struct dentry *dentry, const struct inode *inode,
243 unsigned int len, const char *str, const struct qstr *name)
241{ 244{
242 int i; 245 int i;
243 246
244 if (entry->len != name->len) 247 if (len != name->len)
245 return 1; 248 return 1;
246 249
247 for (i = 0; i < name->len; i++) { 250 for (i = 0; i < name->len; i++) {
248 char a, b; 251 char a, b;
249 252
250 a = entry->name[i]; 253 a = str[i];
251 b = name->name[i]; 254 b = name->name[i];
252 255
253 if (a >= 'A' && a <= 'Z') 256 if (a >= 'A' && a <= 'Z')
@@ -273,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
273 struct object_info obj; 276 struct object_info obj;
274 int error; 277 int error;
275 278
276 dentry->d_op = &adfs_dentry_operations;
277 lock_kernel(); 279 lock_kernel();
278 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
279 if (error == 0) { 281 if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1d..2d7954049fbe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
240 return &ei->vfs_inode; 240 return &ei->vfs_inode;
241} 241}
242 242
243static void adfs_destroy_inode(struct inode *inode) 243static void adfs_i_callback(struct rcu_head *head)
244{ 244{
245 struct inode *inode = container_of(head, struct inode, i_rcu);
246 INIT_LIST_HEAD(&inode->i_dentry);
245 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); 247 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
246} 248}
247 249
250static void adfs_destroy_inode(struct inode *inode)
251{
252 call_rcu(&inode->i_rcu, adfs_i_callback);
253}
254
248static void init_once(void *foo) 255static void init_once(void *foo)
249{ 256{
250 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; 257 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -352,11 +359,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
352 struct adfs_sb_info *asb; 359 struct adfs_sb_info *asb;
353 struct inode *root; 360 struct inode *root;
354 361
362 lock_kernel();
363
355 sb->s_flags |= MS_NODIRATIME; 364 sb->s_flags |= MS_NODIRATIME;
356 365
357 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 366 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
358 if (!asb) 367 if (!asb) {
368 unlock_kernel();
359 return -ENOMEM; 369 return -ENOMEM;
370 }
360 sb->s_fs_info = asb; 371 sb->s_fs_info = asb;
361 372
362 /* set default options */ 373 /* set default options */
@@ -462,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
462 asb->s_namelen = ADFS_F_NAME_LEN; 473 asb->s_namelen = ADFS_F_NAME_LEN;
463 } 474 }
464 475
476 sb->s_d_op = &adfs_dentry_operations;
465 root = adfs_iget(sb, &root_obj); 477 root = adfs_iget(sb, &root_obj);
466 sb->s_root = d_alloc_root(root); 478 sb->s_root = d_alloc_root(root);
467 if (!sb->s_root) { 479 if (!sb->s_root) {
@@ -472,8 +484,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
472 kfree(asb->s_map); 484 kfree(asb->s_map);
473 adfs_error(sb, "get root inode failed\n"); 485 adfs_error(sb, "get root inode failed\n");
474 goto error; 486 goto error;
475 } else 487 }
476 sb->s_root->d_op = &adfs_dentry_operations; 488 unlock_kernel();
477 return 0; 489 return 0;
478 490
479error_free_bh: 491error_free_bh:
@@ -481,20 +493,20 @@ error_free_bh:
481error: 493error:
482 sb->s_fs_info = NULL; 494 sb->s_fs_info = NULL;
483 kfree(asb); 495 kfree(asb);
496 unlock_kernel();
484 return -EINVAL; 497 return -EINVAL;
485} 498}
486 499
487static int adfs_get_sb(struct file_system_type *fs_type, 500static struct dentry *adfs_mount(struct file_system_type *fs_type,
488 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 501 int flags, const char *dev_name, void *data)
489{ 502{
490 return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, 503 return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
491 mnt);
492} 504}
493 505
494static struct file_system_type adfs_fs_type = { 506static struct file_system_type adfs_fs_type = {
495 .owner = THIS_MODULE, 507 .owner = THIS_MODULE,
496 .name = "adfs", 508 .name = "adfs",
497 .get_sb = adfs_get_sb, 509 .mount = adfs_mount,
498 .kill_sb = kill_block_super, 510 .kill_sb = kill_block_super,
499 .fs_flags = FS_REQUIRES_DEV, 511 .fs_flags = FS_REQUIRES_DEV,
500}; 512};
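
The adfs hunks above show the new per-superblock default: setting sb->s_d_op once in fill_super replaces assigning dentry->d_op in every ->lookup(), and the d_hash/d_compare methods gain const-qualified, RCU-walk-safe signatures. A minimal sketch with hypothetical example_* names:

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/string.h>

static int example_hash(const struct dentry *dentry, const struct inode *inode,
			struct qstr *qstr)
{
	return 0;	/* keep the hash the VFS already computed */
}

static int example_compare(const struct dentry *parent,
			   const struct inode *pinode,
			   const struct dentry *dentry,
			   const struct inode *inode,
			   unsigned int len, const char *str,
			   const struct qstr *name)
{
	/* 0 means match; may run under RCU-walk, so no blocking here */
	return len != name->len || memcmp(str, name->name, len);
}

static const struct dentry_operations example_dentry_ops = {
	.d_hash		= example_hash,
	.d_compare	= example_compare,
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_d_op = &example_dentry_ops;	/* inherited by all dentries */
	return 0;
}
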
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb34025..0e95f73a7023 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops;
201extern const struct address_space_operations affs_aops_ofs; 201extern const struct address_space_operations affs_aops_ofs;
202 202
203extern const struct dentry_operations affs_dentry_operations; 203extern const struct dentry_operations affs_dentry_operations;
204extern const struct dentry_operations affs_intl_dentry_operations;
204 205
205static inline void 206static inline void
206affs_set_blocksize(struct super_block *sb, int size) 207affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a3..3a4557e8325c 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
128 void *data = dentry->d_fsdata; 128 void *data = dentry->d_fsdata;
129 struct list_head *head, *next; 129 struct list_head *head, *next;
130 130
131 spin_lock(&dcache_lock); 131 spin_lock(&inode->i_lock);
132 head = &inode->i_dentry; 132 head = &inode->i_dentry;
133 next = head->next; 133 next = head->next;
134 while (next != head) { 134 while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
139 } 139 }
140 next = next->next; 140 next = next->next;
141 } 141 }
142 spin_unlock(&dcache_lock); 142 spin_unlock(&inode->i_lock);
143} 143}
144 144
145 145
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
894 if (AFFS_SB(sb)->s_flags & SF_OFS) { 894 if (AFFS_SB(sb)->s_flags & SF_OFS) {
895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); 895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
896 u32 tmp; 896 u32 tmp;
897 if (IS_ERR(ext_bh)) { 897 if (IS_ERR(bh)) {
898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", 898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
899 ext, PTR_ERR(ext_bh)); 899 ext, PTR_ERR(bh));
900 return; 900 return;
901 } 901 }
902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); 902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); 388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
389 mark_buffer_dirty_inode(inode_bh, inode); 389 mark_buffer_dirty_inode(inode_bh, inode);
390 inode->i_nlink = 2; 390 inode->i_nlink = 2;
391 atomic_inc(&inode->i_count); 391 ihold(inode);
392 } 392 }
393 affs_fix_checksum(sb, bh); 393 affs_fix_checksum(sb, bh);
394 mark_buffer_dirty_inode(bh, inode); 394 mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07a..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,26 @@
13typedef int (*toupper_t)(int); 13typedef int (*toupper_t)(int);
14 14
15static int affs_toupper(int ch); 15static int affs_toupper(int ch);
16static int affs_hash_dentry(struct dentry *, struct qstr *); 16static int affs_hash_dentry(const struct dentry *,
17static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 17 const struct inode *, struct qstr *);
18static int affs_compare_dentry(const struct dentry *parent,
19 const struct inode *pinode,
20 const struct dentry *dentry, const struct inode *inode,
21 unsigned int len, const char *str, const struct qstr *name);
18static int affs_intl_toupper(int ch); 22static int affs_intl_toupper(int ch);
19static int affs_intl_hash_dentry(struct dentry *, struct qstr *); 23static int affs_intl_hash_dentry(const struct dentry *,
20static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 24 const struct inode *, struct qstr *);
25static int affs_intl_compare_dentry(const struct dentry *parent,
26 const struct inode *pinode,
27 const struct dentry *dentry, const struct inode *inode,
28 unsigned int len, const char *str, const struct qstr *name);
21 29
22const struct dentry_operations affs_dentry_operations = { 30const struct dentry_operations affs_dentry_operations = {
23 .d_hash = affs_hash_dentry, 31 .d_hash = affs_hash_dentry,
24 .d_compare = affs_compare_dentry, 32 .d_compare = affs_compare_dentry,
25}; 33};
26 34
27static const struct dentry_operations affs_intl_dentry_operations = { 35const struct dentry_operations affs_intl_dentry_operations = {
28 .d_hash = affs_intl_hash_dentry, 36 .d_hash = affs_intl_hash_dentry,
29 .d_compare = affs_intl_compare_dentry, 37 .d_compare = affs_intl_compare_dentry,
30}; 38};
@@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
58 * Note: the dentry argument is the parent dentry. 66 * Note: the dentry argument is the parent dentry.
59 */ 67 */
60static inline int 68static inline int
61__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) 69__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
62{ 70{
63 const u8 *name = qstr->name; 71 const u8 *name = qstr->name;
64 unsigned long hash; 72 unsigned long hash;
65 int i; 73 int i;
66 74
67 i = affs_check_name(qstr->name,qstr->len); 75 i = affs_check_name(qstr->name, qstr->len);
68 if (i) 76 if (i)
69 return i; 77 return i;
70 78
@@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
78} 86}
79 87
80static int 88static int
81affs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 89affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
90 struct qstr *qstr)
82{ 91{
83 return __affs_hash_dentry(dentry, qstr, affs_toupper); 92 return __affs_hash_dentry(qstr, affs_toupper);
84} 93}
85static int 94static int
86affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) 95affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
96 struct qstr *qstr)
87{ 97{
88 return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); 98 return __affs_hash_dentry(qstr, affs_intl_toupper);
89} 99}
90 100
91static inline int 101static inline int __affs_compare_dentry(unsigned int len,
92__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper) 102 const char *str, const struct qstr *name, toupper_t toupper)
93{ 103{
94 const u8 *aname = a->name; 104 const u8 *aname = str;
95 const u8 *bname = b->name; 105 const u8 *bname = name->name;
96 int len;
97 106
98 /* 'a' is the qstr of an already existing dentry, so the name 107 /*
99 * must be valid. 'b' must be validated first. 108 * 'str' is the name of an already existing dentry, so the name
109 * must be valid. 'name' must be validated first.
100 */ 110 */
101 111
102 if (affs_check_name(b->name,b->len)) 112 if (affs_check_name(name->name, name->len))
103 return 1; 113 return 1;
104 114
105 /* If the names are longer than the allowed 30 chars, 115 /*
116 * If the names are longer than the allowed 30 chars,
106 * the excess is ignored, so their length may differ. 117 * the excess is ignored, so their length may differ.
107 */ 118 */
108 len = a->len;
109 if (len >= 30) { 119 if (len >= 30) {
110 if (b->len < 30) 120 if (name->len < 30)
111 return 1; 121 return 1;
112 len = 30; 122 len = 30;
113 } else if (len != b->len) 123 } else if (len != name->len)
114 return 1; 124 return 1;
115 125
116 for (; len > 0; len--) 126 for (; len > 0; len--)
@@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
121} 131}
122 132
123static int 133static int
124affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 134affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
135 const struct dentry *dentry, const struct inode *inode,
136 unsigned int len, const char *str, const struct qstr *name)
125{ 137{
126 return __affs_compare_dentry(dentry, a, b, affs_toupper); 138 return __affs_compare_dentry(len, str, name, affs_toupper);
127} 139}
128static int 140static int
129affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 141affs_intl_compare_dentry(const struct dentry *parent, const struct inode *pinode,
142 const struct dentry *dentry, const struct inode *inode,
143 unsigned int len, const char *str, const struct qstr *name)
130{ 144{
131 return __affs_compare_dentry(dentry, a, b, affs_intl_toupper); 145 return __affs_compare_dentry(len, str, name, affs_intl_toupper);
132} 146}
133 147
134/* 148/*
@@ -226,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
226 if (IS_ERR(inode)) 240 if (IS_ERR(inode))
227 return ERR_CAST(inode); 241 return ERR_CAST(inode);
228 } 242 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 243 d_add(dentry, inode);
231 return NULL; 244 return NULL;
232} 245}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef470..b31507d0f9b9 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include "affs.h" 20#include "affs.h"
22 21
@@ -46,8 +45,6 @@ affs_put_super(struct super_block *sb)
46 struct affs_sb_info *sbi = AFFS_SB(sb); 45 struct affs_sb_info *sbi = AFFS_SB(sb);
47 pr_debug("AFFS: put_super()\n"); 46 pr_debug("AFFS: put_super()\n");
48 47
49 lock_kernel();
50
51 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) 48 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
52 affs_commit_super(sb, 1, 1); 49 affs_commit_super(sb, 1, 1);
53 50
@@ -56,8 +53,6 @@ affs_put_super(struct super_block *sb)
56 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
57 kfree(sbi); 54 kfree(sbi);
58 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
59
60 unlock_kernel();
61} 56}
62 57
63static void 58static void
@@ -100,17 +95,24 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
100 return &i->vfs_inode; 95 return &i->vfs_inode;
101} 96}
102 97
103static void affs_destroy_inode(struct inode *inode) 98static void affs_i_callback(struct rcu_head *head)
104{ 99{
100 struct inode *inode = container_of(head, struct inode, i_rcu);
101 INIT_LIST_HEAD(&inode->i_dentry);
105 kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); 102 kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
106} 103}
107 104
105static void affs_destroy_inode(struct inode *inode)
106{
107 call_rcu(&inode->i_rcu, affs_i_callback);
108}
109
108static void init_once(void *foo) 110static void init_once(void *foo)
109{ 111{
110 struct affs_inode_info *ei = (struct affs_inode_info *) foo; 112 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
111 113
112 init_MUTEX(&ei->i_link_lock); 114 sema_init(&ei->i_link_lock, 1);
113 init_MUTEX(&ei->i_ext_lock); 115 sema_init(&ei->i_ext_lock, 1);
114 inode_init_once(&ei->vfs_inode); 116 inode_init_once(&ei->vfs_inode);
115} 117}
116 118
@@ -302,6 +304,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
302 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); 304 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
303 if (!sbi) 305 if (!sbi)
304 return -ENOMEM; 306 return -ENOMEM;
307
305 sb->s_fs_info = sbi; 308 sb->s_fs_info = sbi;
306 mutex_init(&sbi->s_bmlock); 309 mutex_init(&sbi->s_bmlock);
307 spin_lock_init(&sbi->symlink_lock); 310 spin_lock_init(&sbi->symlink_lock);
@@ -474,12 +477,16 @@ got_root:
474 goto out_error_noinode; 477 goto out_error_noinode;
475 } 478 }
476 479
480 if (AFFS_SB(sb)->s_flags & SF_INTL)
481 sb->s_d_op = &affs_intl_dentry_operations;
482 else
483 sb->s_d_op = &affs_dentry_operations;
484
477 sb->s_root = d_alloc_root(root_inode); 485 sb->s_root = d_alloc_root(root_inode);
478 if (!sb->s_root) { 486 if (!sb->s_root) {
479 printk(KERN_ERR "AFFS: Get root inode failed\n"); 487 printk(KERN_ERR "AFFS: Get root inode failed\n");
480 goto out_error; 488 goto out_error;
481 } 489 }
482 sb->s_root->d_op = &affs_dentry_operations;
483 490
484 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); 491 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
485 return 0; 492 return 0;
@@ -527,7 +534,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
527 kfree(new_opts); 534 kfree(new_opts);
528 return -EINVAL; 535 return -EINVAL;
529 } 536 }
530 lock_kernel(); 537
531 replace_mount_options(sb, new_opts); 538 replace_mount_options(sb, new_opts);
532 539
533 sbi->s_flags = mount_flags; 540 sbi->s_flags = mount_flags;
@@ -543,17 +550,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
543 memcpy(sbi->s_volume, volume, 32); 550 memcpy(sbi->s_volume, volume, 32);
544 spin_unlock(&sbi->symlink_lock); 551 spin_unlock(&sbi->symlink_lock);
545 552
546 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 553 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
547 unlock_kernel();
548 return 0; 554 return 0;
549 } 555
550 if (*flags & MS_RDONLY) { 556 if (*flags & MS_RDONLY) {
551 affs_write_super(sb); 557 affs_write_super(sb);
552 affs_free_bitmap(sb); 558 affs_free_bitmap(sb);
553 } else 559 } else
554 res = affs_init_bitmap(sb, flags); 560 res = affs_init_bitmap(sb, flags);
555 561
556 unlock_kernel();
557 return res; 562 return res;
558} 563}
559 564
@@ -579,17 +584,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
579 return 0; 584 return 0;
580} 585}
581 586
582static int affs_get_sb(struct file_system_type *fs_type, 587static struct dentry *affs_mount(struct file_system_type *fs_type,
583 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 588 int flags, const char *dev_name, void *data)
584{ 589{
585 return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, 590 return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
586 mnt);
587} 591}
588 592
589static struct file_system_type affs_fs_type = { 593static struct file_system_type affs_fs_type = {
590 .owner = THIS_MODULE, 594 .owner = THIS_MODULE,
591 .name = "affs", 595 .name = "affs",
592 .get_sb = affs_get_sb, 596 .mount = affs_mount,
593 .kill_sb = kill_block_super, 597 .kill_sb = kill_block_super,
594 .fs_flags = FS_REQUIRES_DEV, 598 .fs_flags = FS_REQUIRES_DEV,
595}; 599};
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
289 call->server = server; 289 call->server = server;
290 290
291 INIT_WORK(&call->work, SRXAFSCB_CallBack); 291 INIT_WORK(&call->work, SRXAFSCB_CallBack);
292 schedule_work(&call->work); 292 queue_work(afs_wq, &call->work);
293 return 0; 293 return 0;
294} 294}
295 295
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
336 call->server = server; 336 call->server = server;
337 337
338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
339 schedule_work(&call->work); 339 queue_work(afs_wq, &call->work);
340 return 0; 340 return 0;
341} 341}
342 342
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
367 call->server = server; 367 call->server = server;
368 368
369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
370 schedule_work(&call->work); 370 queue_work(afs_wq, &call->work);
371 return 0; 371 return 0;
372} 372}
373 373
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
400 call->state = AFS_CALL_REPLYING; 400 call->state = AFS_CALL_REPLYING;
401 401
402 INIT_WORK(&call->work, SRXAFSCB_Probe); 402 INIT_WORK(&call->work, SRXAFSCB_Probe);
403 schedule_work(&call->work); 403 queue_work(afs_wq, &call->work);
404 return 0; 404 return 0;
405} 405}
406 406
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
496 call->state = AFS_CALL_REPLYING; 496 call->state = AFS_CALL_REPLYING;
497 497
498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); 498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
499 schedule_work(&call->work); 499 queue_work(afs_wq, &call->work);
500 return 0; 500 return 0;
501} 501}
502 502
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
580 call->state = AFS_CALL_REPLYING; 580 call->state = AFS_CALL_REPLYING;
581 581
582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); 582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
583 schedule_work(&call->work); 583 queue_work(afs_wq, &call->work);
584 return 0; 584 return 0;
585} 585}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/namei.h>
16#include <linux/pagemap.h> 17#include <linux/pagemap.h>
17#include <linux/ctype.h> 18#include <linux/ctype.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
@@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
23static int afs_dir_open(struct inode *inode, struct file *file); 24static int afs_dir_open(struct inode *inode, struct file *file);
24static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); 25static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
25static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); 26static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
26static int afs_d_delete(struct dentry *dentry); 27static int afs_d_delete(const struct dentry *dentry);
27static void afs_d_release(struct dentry *dentry); 28static void afs_d_release(struct dentry *dentry);
28static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, 29static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
29 loff_t fpos, u64 ino, unsigned dtype); 30 loff_t fpos, u64 ino, unsigned dtype);
@@ -61,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
61 .setattr = afs_setattr, 62 .setattr = afs_setattr,
62}; 63};
63 64
64static const struct dentry_operations afs_fs_dentry_operations = { 65const struct dentry_operations afs_fs_dentry_operations = {
65 .d_revalidate = afs_d_revalidate, 66 .d_revalidate = afs_d_revalidate,
66 .d_delete = afs_d_delete, 67 .d_delete = afs_d_delete,
67 .d_release = afs_d_release, 68 .d_release = afs_d_release,
69 .d_automount = afs_d_automount,
68}; 70};
69 71
70#define AFS_DIR_HASHTBL_SIZE 128 72#define AFS_DIR_HASHTBL_SIZE 128
@@ -581,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
581 } 583 }
582 584
583success: 585success:
584 dentry->d_op = &afs_fs_dentry_operations;
585
586 d_add(dentry, inode); 586 d_add(dentry, inode);
587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", 587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
588 fid.vnode, 588 fid.vnode,
@@ -607,6 +607,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
607 void *dir_version; 607 void *dir_version;
608 int ret; 608 int ret;
609 609
610 if (nd->flags & LOOKUP_RCU)
611 return -ECHILD;
612
610 vnode = AFS_FS_I(dentry->d_inode); 613 vnode = AFS_FS_I(dentry->d_inode);
611 614
612 if (dentry->d_inode) 615 if (dentry->d_inode)
@@ -730,7 +733,7 @@ out_bad:
730 * - called from dput() when d_count is going to 0. 733 * - called from dput() when d_count is going to 0.
731 * - return 1 to request dentry be unhashed, 0 otherwise 734 * - return 1 to request dentry be unhashed, 0 otherwise
732 */ 735 */
733static int afs_d_delete(struct dentry *dentry) 736static int afs_d_delete(const struct dentry *dentry)
734{ 737{
735 _enter("%s", dentry->d_name.name); 738 _enter("%s", dentry->d_name.name);
736 739
@@ -1045,7 +1048,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
1045 if (ret < 0) 1048 if (ret < 0)
1046 goto link_error; 1049 goto link_error;
1047 1050
1048 atomic_inc(&vnode->vfs_inode.i_count); 1051 ihold(&vnode->vfs_inode);
1049 d_instantiate(dentry, &vnode->vfs_inode); 1052 d_instantiate(dentry, &vnode->vfs_inode);
1050 key_put(key); 1053 key_put(key);
1051 _leave(" = 0"); 1054 _leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325eb..757d664575dd 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/smp_lock.h>
13#include "internal.h" 12#include "internal.h"
14 13
15#define AFS_LOCK_GRANTED 0 14#define AFS_LOCK_GRANTED 0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
274 273
275 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; 274 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
276 275
277 lock_kernel(); 276 lock_flocks();
278 277
279 /* make sure we've got a callback on this file and that our view of the 278 /* make sure we've got a callback on this file and that our view of the
280 * data version is up to date */ 279 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
421 afs_vnode_fetch_status(vnode, NULL, key); 420 afs_vnode_fetch_status(vnode, NULL, key);
422 421
423error: 422error:
424 unlock_kernel(); 423 unlock_flocks();
425 _leave(" = %d", ret); 424 _leave(" = %d", ret);
426 return ret; 425 return ret;
427 426
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
184 inode->i_generation = 0; 184 inode->i_generation = 0;
185 185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); 186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME; 187 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
188 inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
188 unlock_new_inode(inode); 189 unlock_new_inode(inode);
189 _leave(" = %p", inode); 190 _leave(" = %p", inode);
190 return inode; 191 return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736fc..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
486 * dir.c 486 * dir.c
487 */ 487 */
488extern const struct inode_operations afs_dir_inode_operations; 488extern const struct inode_operations afs_dir_inode_operations;
489extern const struct dentry_operations afs_fs_dentry_operations;
489extern const struct file_operations afs_dir_file_operations; 490extern const struct file_operations afs_dir_file_operations;
490 491
491/* 492/*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
576/* 577/*
577 * main.c 578 * main.c
578 */ 579 */
580extern struct workqueue_struct *afs_wq;
579extern struct afs_uuid afs_uuid; 581extern struct afs_uuid afs_uuid;
580 582
581/* 583/*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations; 592extern const struct inode_operations afs_autocell_inode_operations;
591extern const struct file_operations afs_mntpt_file_operations; 593extern const struct file_operations afs_mntpt_file_operations;
592 594
595extern struct vfsmount *afs_d_automount(struct path *);
593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 596extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
594extern void afs_mntpt_kill_timer(void); 597extern void afs_mntpt_kill_timer(void);
595 598
@@ -624,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *);
624extern void afs_cache_permit(struct afs_vnode *, struct key *, long); 627extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
625extern void afs_zap_permits(struct rcu_head *); 628extern void afs_zap_permits(struct rcu_head *);
626extern struct key *afs_request_key(struct afs_cell *); 629extern struct key *afs_request_key(struct afs_cell *);
627extern int afs_permission(struct inode *, int); 630extern int afs_permission(struct inode *, int, unsigned int);
628 631
629/* 632/*
630 * server.c 633 * server.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
31 31
32struct afs_uuid afs_uuid; 32struct afs_uuid afs_uuid;
33struct workqueue_struct *afs_wq;
33 34
34/* 35/*
35 * get a client UUID 36 * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
87 if (ret < 0) 88 if (ret < 0)
88 return ret; 89 return ret;
89 90
91 /* create workqueue */
92 ret = -ENOMEM;
93 afs_wq = alloc_workqueue("afs", 0, 0);
94 if (!afs_wq)
95 return ret;
96
90 /* register the /proc stuff */ 97 /* register the /proc stuff */
91 ret = afs_proc_init(); 98 ret = afs_proc_init();
92 if (ret < 0) 99 if (ret < 0)
93 return ret; 100 goto error_proc;
94 101
95#ifdef CONFIG_AFS_FSCACHE 102#ifdef CONFIG_AFS_FSCACHE
96 /* we want to be able to cache */ 103 /* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
140error_cache: 147error_cache:
141#endif 148#endif
142 afs_proc_cleanup(); 149 afs_proc_cleanup();
150error_proc:
151 destroy_workqueue(afs_wq);
143 rcu_barrier(); 152 rcu_barrier();
144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 153 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
145 return ret; 154 return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
163 afs_purge_servers(); 172 afs_purge_servers();
164 afs_callback_update_kill(); 173 afs_callback_update_kill();
165 afs_vlocation_purge(); 174 afs_vlocation_purge();
166 flush_scheduled_work(); 175 destroy_workqueue(afs_wq);
167 afs_cell_purge(); 176 afs_cell_purge();
168#ifdef CONFIG_AFS_FSCACHE 177#ifdef CONFIG_AFS_FSCACHE
169 fscache_unregister_netfs(&afs_cache_netfs); 178 fscache_unregister_netfs(&afs_cache_netfs);
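
The schedule_work()/schedule_delayed_work() conversions throughout this series all follow the same recipe: give the subsystem its own workqueue so its work items no longer ride the shared kernel queue, and so destroy_workqueue() can flush them at module exit. A hedged sketch with hypothetical example_* names:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_worker(struct work_struct *work)
{
	/* deferred work goes here */
}
static DECLARE_WORK(example_work, example_worker);

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example", 0, 0);
	if (!example_wq)
		return -ENOMEM;
	queue_work(example_wq, &example_work);	/* not schedule_work() */
	return 0;
}

static void __exit example_exit(void)
{
	destroy_workqueue(example_wq);	/* waits for queued work to finish */
}
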
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6d552686c498..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,22 +24,20 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
24 struct dentry *dentry, 24 struct dentry *dentry,
25 struct nameidata *nd); 25 struct nameidata *nd);
26static int afs_mntpt_open(struct inode *inode, struct file *file); 26static int afs_mntpt_open(struct inode *inode, struct file *file);
27static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
28static void afs_mntpt_expiry_timed_out(struct work_struct *work); 27static void afs_mntpt_expiry_timed_out(struct work_struct *work);
29 28
30const struct file_operations afs_mntpt_file_operations = { 29const struct file_operations afs_mntpt_file_operations = {
31 .open = afs_mntpt_open, 30 .open = afs_mntpt_open,
31 .llseek = noop_llseek,
32}; 32};
33 33
34const struct inode_operations afs_mntpt_inode_operations = { 34const struct inode_operations afs_mntpt_inode_operations = {
35 .lookup = afs_mntpt_lookup, 35 .lookup = afs_mntpt_lookup,
36 .follow_link = afs_mntpt_follow_link,
37 .readlink = page_readlink, 36 .readlink = page_readlink,
38 .getattr = afs_getattr, 37 .getattr = afs_getattr,
39}; 38};
40 39
41const struct inode_operations afs_autocell_inode_operations = { 40const struct inode_operations afs_autocell_inode_operations = {
42 .follow_link = afs_mntpt_follow_link,
43 .getattr = afs_getattr, 41 .getattr = afs_getattr,
44}; 42};
45 43
@@ -87,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
87 _debug("symlink is a mountpoint"); 85 _debug("symlink is a mountpoint");
88 spin_lock(&vnode->lock); 86 spin_lock(&vnode->lock);
89 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); 87 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
88 vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
90 spin_unlock(&vnode->lock); 89 spin_unlock(&vnode->lock);
91 } 90 }
92 91
@@ -237,52 +236,24 @@ error_no_devname:
237} 236}
238 237
239/* 238/*
240 * follow a link from a mountpoint directory, thus causing it to be mounted 239 * handle an automount point
241 */ 240 */
242static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) 241struct vfsmount *afs_d_automount(struct path *path)
243{ 242{
244 struct vfsmount *newmnt; 243 struct vfsmount *newmnt;
245 int err;
246 244
247 _enter("%p{%s},{%s:%p{%s},}", 245 _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
248 dentry,
249 dentry->d_name.name,
250 nd->path.mnt->mnt_devname,
251 dentry,
252 nd->path.dentry->d_name.name);
253
254 dput(nd->path.dentry);
255 nd->path.dentry = dget(dentry);
256 246
257 newmnt = afs_mntpt_do_automount(nd->path.dentry); 247 newmnt = afs_mntpt_do_automount(path->dentry);
258 if (IS_ERR(newmnt)) { 248 if (IS_ERR(newmnt))
259 path_put(&nd->path); 249 return newmnt;
260 return (void *)newmnt;
261 }
262
263 mntget(newmnt);
264 err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
265 switch (err) {
266 case 0:
267 path_put(&nd->path);
268 nd->path.mnt = newmnt;
269 nd->path.dentry = dget(newmnt->mnt_root);
270 schedule_delayed_work(&afs_mntpt_expiry_timer,
271 afs_mntpt_expiry_timeout * HZ);
272 break;
273 case -EBUSY:
274 /* someone else made a mount here whilst we were busy */
275 while (d_mountpoint(nd->path.dentry) &&
276 follow_down(&nd->path))
277 ;
278 err = 0;
279 default:
280 mntput(newmnt);
281 break;
282 }
283 250
284 _leave(" = %d", err); 251 mntget(newmnt); /* prevent immediate expiration */
285 return ERR_PTR(err); 252 mnt_set_expiry(newmnt, &afs_vfsmounts);
253 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
254 afs_mntpt_expiry_timeout * HZ);
255 _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
256 return newmnt;
286} 257}
287 258
288/* 259/*
@@ -294,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
294 265
295 if (!list_empty(&afs_vfsmounts)) { 266 if (!list_empty(&afs_vfsmounts)) {
296 mark_mounts_for_expiry(&afs_vfsmounts); 267 mark_mounts_for_expiry(&afs_vfsmounts);
297 schedule_delayed_work(&afs_mntpt_expiry_timer, 268 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
298 afs_mntpt_expiry_timeout * HZ); 269 afs_mntpt_expiry_timeout * HZ);
299 } 270 }
300 271
301 _leave(""); 272 _leave("");
@@ -309,6 +280,5 @@ void afs_mntpt_kill_timer(void)
309 _enter(""); 280 _enter("");
310 281
311 ASSERT(list_empty(&afs_vfsmounts)); 282 ASSERT(list_empty(&afs_vfsmounts));
312 cancel_delayed_work(&afs_mntpt_expiry_timer); 283 cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
313 flush_scheduled_work();
314} 284}
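
The mntpt.c rewrite above moves AFS onto the new automount machinery: the inode is flagged S_AUTOMOUNT, and when a path walk trips over it the VFS calls ->d_automount and splices the returned vfsmount in itself, so the old follow_link/do_add_mount dance disappears. A rough sketch of the contract; apart from d_automount, S_AUTOMOUNT, mntget() and mnt_set_expiry(), the names are hypothetical:

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mount.h>
#include <linux/path.h>

static LIST_HEAD(example_expiry_list);

static struct vfsmount *example_build_mount(struct dentry *mountpoint)
{
	/* a real implementation would vfs_kern_mount() something here */
	return ERR_PTR(-ENOENT);
}

static struct vfsmount *example_d_automount(struct path *path)
{
	struct vfsmount *mnt = example_build_mount(path->dentry);

	if (IS_ERR(mnt))
		return mnt;
	mntget(mnt);			/* hold off immediate expiry */
	mnt_set_expiry(mnt, &example_expiry_list);
	return mnt;
}
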
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
410 if (!call) { 410 if (!call) {
411 /* its an incoming call for our callback service */ 411 /* its an incoming call for our callback service */
412 skb_queue_tail(&afs_incoming_calls, skb); 412 skb_queue_tail(&afs_incoming_calls, skb);
413 schedule_work(&afs_collect_incoming_call_work); 413 queue_work(afs_wq, &afs_collect_incoming_call_work);
414 } else { 414 } else {
415 /* route the messages directly to the appropriate call */ 415 /* route the messages directly to the appropriate call */
416 skb_queue_tail(&call->rx_queue, skb); 416 skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e4..f44b9d355377 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
 	struct key *key;
 	int ret;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	_enter("{{%x:%u},%lx},%x,",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
 
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
 	}
 
 	key_put(key);
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, flags, NULL);
 	_leave(" = %d", ret);
 	return ret;
 
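The IPERM_FLAG_RCU check added above follows the RCU path-walk contract: ->permission() may now be called under rcu_read_lock(), where AFS cannot block on keys or the network, so it must return -ECHILD to make the VFS drop out of RCU mode and retry the lookup in ref-walk mode. A sketch of the convention with a hypothetical myfs name (the four-argument generic_permission() form is exactly as used in the hunk):

static int myfs_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;	/* cannot sleep under rcu_read_lock(); retry in ref-walk */

	/* ... possibly-blocking checks (keys, server ACLs) go here ... */

	return generic_permission(inode, mask, flags, NULL);
}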
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
 	if (atomic_read(&server->usage) == 0) {
 		list_move_tail(&server->grave, &afs_server_graveyard);
 		server->time_of_death = get_seconds();
-		schedule_delayed_work(&afs_server_reaper,
-				      afs_server_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_server_reaper,
+				   afs_server_timeout * HZ);
 	}
 	spin_unlock(&afs_server_graveyard_lock);
 	_leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
 		expiry = server->time_of_death + afs_server_timeout;
 		if (expiry > now) {
 			delay = (expiry - now) * HZ;
-			if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+			if (!queue_delayed_work(afs_wq, &afs_server_reaper,
+						delay)) {
 				cancel_delayed_work(&afs_server_reaper);
-				schedule_delayed_work(&afs_server_reaper,
-						      delay);
+				queue_delayed_work(afs_wq, &afs_server_reaper,
+						   delay);
 			}
 			break;
 		}
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
 {
 	afs_server_timeout = 0;
 	cancel_delayed_work(&afs_server_reaper);
-	schedule_delayed_work(&afs_server_reaper, 0);
+	queue_delayed_work(afs_wq, &afs_server_reaper, 0);
 }
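These hunks, like the rest of the series, move AFS's deferred work off the shared kernel workqueue (schedule_delayed_work()) onto a private afs_wq, so module teardown can flush exactly its own work instead of calling flush_scheduled_work(). A sketch of the setup this assumes; afs_wq is presumably created at module init elsewhere in the series, and the wiring below is illustrative only:

static struct workqueue_struct *example_wq;	/* stands in for afs_wq */

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example", 0, 0);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	/*
	 * Pairs with cancel_delayed_work_sync() on each work item;
	 * together they replace the old cancel_delayed_work() +
	 * flush_scheduled_work() pattern.
	 */
	destroy_workqueue(example_wq);
}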
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 77e1e5a61154..fb240e8766d6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -19,7 +19,6 @@
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -30,9 +29,8 @@
 #define AFS_FS_MAGIC	0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name,
-		      void *data, struct vfsmount *mnt);
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -41,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
-	.get_sb		= afs_get_sb,
+	.mount		= afs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= 0,
 };
@@ -338,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	if (!root)
 		goto error;
 
+	sb->s_d_op = &afs_fs_dentry_operations;
 	sb->s_root = root;
 
 	_leave(" = 0");
@@ -360,11 +359,8 @@ error:
 /*
  * get an AFS superblock
  */
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags,
-		      const char *dev_name,
-		      void *options,
-		      struct vfsmount *mnt)
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *options)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -428,12 +424,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
-	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
 	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
-	return 0;
+	return dget(sb->s_root);
 
 error:
 	afs_put_volume(params.volume);
@@ -441,7 +436,7 @@ error:
 	key_put(params.key);
 	kfree(new_opts);
 	_leave(" = %d", ret);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
@@ -453,12 +448,8 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
-	lock_kernel();
-
 	afs_put_volume(as->volume);
 
-	unlock_kernel();
-
 	_leave("");
 }
 
@@ -508,6 +499,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	return &vnode->vfs_inode;
 }
 
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
 /*
  * destroy an AFS inode struct
  */
@@ -521,7 +520,7 @@ static void afs_destroy_inode(struct inode *inode)
 
 	ASSERTCMP(vnode->server, ==, NULL);
 
-	kmem_cache_free(afs_inode_cachep, vnode);
+	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);
 }
 
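The last two hunks above defer freeing of the inode until after an RCU grace period, which is required once lockless (RCU-walk) path lookup can still be examining the dentry and inode when ->destroy_inode() runs. The pattern, reduced to its essentials with hypothetical myfs_* names (MYFS_I() and myfs_inode_cachep stand in for the filesystem's own container accessor and slab cache):

static void myfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
}

static void myfs_destroy_inode(struct inode *inode)
{
	/* the actual free happens only after concurrent RCU readers finish */
	call_rcu(&inode->i_rcu, myfs_i_callback);
}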
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
507 _debug("buried"); 507 _debug("buried");
508 list_move_tail(&vl->grave, &afs_vlocation_graveyard); 508 list_move_tail(&vl->grave, &afs_vlocation_graveyard);
509 vl->time_of_death = get_seconds(); 509 vl->time_of_death = get_seconds();
510 schedule_delayed_work(&afs_vlocation_reap, 510 queue_delayed_work(afs_wq, &afs_vlocation_reap,
511 afs_vlocation_timeout * HZ); 511 afs_vlocation_timeout * HZ);
512 512
513 /* suspend updates on this record */ 513 /* suspend updates on this record */
514 if (!list_empty(&vl->update)) { 514 if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
561 if (expiry > now) { 561 if (expiry > now) {
562 delay = (expiry - now) * HZ; 562 delay = (expiry - now) * HZ;
563 _debug("delay %lu", delay); 563 _debug("delay %lu", delay);
564 if (!schedule_delayed_work(&afs_vlocation_reap, 564 if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
565 delay)) { 565 delay)) {
566 cancel_delayed_work(&afs_vlocation_reap); 566 cancel_delayed_work(&afs_vlocation_reap);
567 schedule_delayed_work(&afs_vlocation_reap, 567 queue_delayed_work(afs_wq, &afs_vlocation_reap,
568 delay); 568 delay);
569 } 569 }
570 break; 570 break;
571 } 571 }
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
620 destroy_workqueue(afs_vlocation_update_worker); 620 destroy_workqueue(afs_vlocation_update_worker);
621 621
622 cancel_delayed_work(&afs_vlocation_reap); 622 cancel_delayed_work(&afs_vlocation_reap);
623 schedule_delayed_work(&afs_vlocation_reap, 0); 623 queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
624} 624}
625 625
626/* 626/*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	int ret;
 
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	wbc->nr_to_write -= ret;
-	if (wbc->nonblocking && bdi_write_congested(bdi))
-		wbc->encountered_congestion = 1;
 
 	_leave(" = 0");
 	return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		wbc->nr_to_write -= ret;
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			break;
-		}
-
 		cond_resched();
 	} while (index < end && wbc->nr_to_write > 0);
 
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
 		   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	pgoff_t start, end, next;
 	int ret;
 
 	_enter("");
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		_leave(" = 0 [congest]");
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index;
 		end = -1;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
-		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
-		    !(wbc->nonblocking && wbc->encountered_congestion))
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
 			ret = afs_writepages_region(mapping, wbc, 0, start,
 						    &next);
 		mapping->writeback_index = next;
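These hunks track the core writeback change that retired wbc->nonblocking and wbc->encountered_congestion; with the congestion checks gone, what remains is the standard range_cyclic sweep: write from the remembered index to the end, then wrap around once. A sketch of that shape with a hypothetical myfs_writepages_region() (the hunk only shows the range_cyclic branch; the sketch mirrors it):

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	pgoff_t start = mapping->writeback_index, next = 0;
	int ret;

	/* first pass: remembered position to end of file */
	ret = myfs_writepages_region(mapping, wbc, start, (pgoff_t)-1, &next);
	if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
		/* second pass: wrap to cover pages before the start index */
		ret = myfs_writepages_region(mapping, wbc, 0, start, &next);
	mapping->writeback_index = next;	/* resume here next time */
	return ret;
}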
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
 
 	aio_wq = create_workqueue("aio");
 	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!abe_pool);
+	BUG_ON(!aio_wq || !abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
 	queue_delayed_work(aio_wq, &ctx->wq, timeout);
 }
 
-
 /*
- * aio_run_iocbs:
- * 	Process all pending retries queued on the ioctx
- * 	run list.
- * 	Assumes it is operating within the aio issuer's mm
- * 	context.
- */
-static inline void aio_run_iocbs(struct kioctx *ctx)
-{
-	int requeue;
-
-	spin_lock_irq(&ctx->ctx_lock);
-
-	requeue = __aio_run_iocbs(ctx);
-	spin_unlock_irq(&ctx->ctx_lock);
-	if (requeue)
-		aio_queue_work(ctx);
-}
-
-/*
- * just like aio_run_iocbs, but keeps running them until
- * the list stays empty
+ * aio_run_all_iocbs:
+ *	Process all pending retries queued on the ioctx
+ *	run list, and keep running them until the list
+ *	stays empty.
+ *	Assumes it is operating within the aio issuer's mm context.
  */
 static inline void aio_run_all_iocbs(struct kioctx *ctx)
 {
@@ -1543,7 +1526,19 @@ static void aio_batch_add(struct address_space *mapping,
 	}
 
 	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
+
+	/*
+	 * we should be using igrab here, but
+	 * we don't want to hammer on the global
+	 * inode spinlock just to take an extra
+	 * reference on a file that we must already
+	 * have a reference to.
+	 *
+	 * When we're called, we always have a reference
+	 * on the file, so we must always have a reference
+	 * on the inode, so ihold() is safe here.
+	 */
+	ihold(mapping->host);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
@@ -1827,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	long ret = -EINVAL;
 
 	if (likely(ioctx)) {
-		if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
+		if (likely(min_nr <= nr && min_nr >= 0))
 			ret = read_events(ioctx, min_nr, nr, events, timeout);
 		put_ioctx(ioctx);
 	}
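The comment added in the aio_batch_add() hunk captures the igrab() vs ihold() distinction: igrab() serializes on the global inode lock and can fail (returning NULL) if the inode is being evicted, while ihold() is a plain reference bump that is only legal when the caller already holds a reference keeping the inode alive. An illustrative use, not from the patch:

static void pin_inode_of_open_file(struct file *filp)
{
	struct inode *inode = filp->f_mapping->host;

	/*
	 * filp already pins the inode, so a bare ihold() cannot race
	 * with eviction; drop the extra reference with iput() later.
	 */
	ihold(inode);
}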
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,14 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
 
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
-			       const char *dev_name, void *data,
-			       struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
-			     mnt);
-}
-
 /*
  * anon_inodefs_dname() is called from d_path().
  */
@@ -43,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 			     dentry->d_name.name);
 }
 
+static const struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_dname	= anon_inodefs_dname,
+};
+
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "anon_inode:", NULL,
+			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+}
+
 static struct file_system_type anon_inode_fs_type = {
 	.name		= "anon_inodefs",
-	.get_sb		= anon_inodefs_get_sb,
+	.mount		= anon_inodefs_mount,
 	.kill_sb	= kill_anon_super,
 };
-static const struct dentry_operations anon_inodefs_dentry_operations = {
-	.d_dname	= anon_inodefs_dname,
-};
 
 /*
  * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -66,9 +66,9 @@ static const struct address_space_operations anon_aops = {
 };
 
 /**
- * anon_inode_getfd - creates a new file instance by hooking it up to an
- *		      anonymous inode, and a dentry that describe the "class"
- *		      of the file
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
+ *			anonymous inode, and a dentry that describe the "class"
+ *			of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
@@ -104,19 +104,17 @@ struct file *anon_inode_getfile(const char *name,
 	this.name = name;
 	this.len = strlen(name);
 	this.hash = 0;
-	path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
 	if (!path.dentry)
 		goto err_module;
 
 	path.mnt = mntget(anon_inode_mnt);
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
-	 * so we can avoid doing an igrab() and we can use an open-coded
-	 * atomic_inc().
+	 * so ihold() is safe.
 	 */
-	atomic_inc(&anon_inode_inode->i_count);
+	ihold(anon_inode_inode);
 
-	path.dentry->d_op = &anon_inodefs_dentry_operations;
 	d_instantiate(path.dentry, anon_inode_inode);
 
 	error = -ENFILE;
@@ -194,6 +192,7 @@ static struct inode *anon_inode_mkinode(void)
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_ino = get_next_ino();
 	inode->i_fop = &anon_inode_fops;
 
 	inode->i_mapping->a_ops = &anon_aops;
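Taken together, the anon_inodefs hunks show the new pseudo-filesystem boilerplate: ->mount returns the root dentry directly, mount_pseudo() installs the dentry_operations on the superblock so dentries from d_alloc_pseudo() inherit them (no more manual d_op assignment), and inode numbers come from get_next_ino(). A condensed sketch with hypothetical examplefs names and a made-up magic number:

static const struct dentry_operations examplefs_dops = {
	/* e.g. .d_dname for d_path() output on these dentries */
};

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* 0x6578616d ("exam") is a made-up magic number */
	return mount_pseudo(fs_type, "example:", NULL, &examplefs_dops,
			    0x6578616d);
}

static struct file_system_type examplefs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};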
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911e..000000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
1config AUTOFS_FS
2 tristate "Kernel automounter support"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from the autofs
10 package; you can find the location in <file:Documentation/Changes>.
11 You also want to answer Y to "NFS file system support", below.
12
13 If you want to use the newer version of the automounter with more
14 features, say N here and say Y to "Kernel automounter v4 support",
15 below.
16
17 To compile this support as a module, choose M here: the module will be
18 called autofs.
19
20 If you are not a part of a fairly large, distributed network, you
21 probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d05..000000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
1#
2# Makefile for the linux autofs-filesystem routines.
3#
4
5obj-$(CONFIG_AUTOFS_FS) += autofs.o
6
7autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec45..000000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * linux/fs/autofs/autofs_i.h
4 *
5 * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/* Internal header file for autofs */
14
15#include <linux/auto_fs.h>
16
17/* This is the range of ioctl() numbers we claim as ours */
18#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
19#define AUTOFS_IOC_COUNT 32
20
21#include <linux/kernel.h>
22#include <linux/slab.h>
23#include <linux/time.h>
24#include <linux/string.h>
25#include <linux/wait.h>
26#include <linux/dcache.h>
27#include <linux/namei.h>
28#include <linux/mount.h>
29#include <linux/sched.h>
30
31#include <asm/current.h>
32#include <asm/uaccess.h>
33
34#ifdef DEBUG
35#define DPRINTK(D) (printk D)
36#else
37#define DPRINTK(D) ((void)0)
38#endif
39
40/*
41 * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
42 * kernel will keep the negative response cached for up to the time given
43 * here, although the time can be shorter if the kernel throws the dcache
44 * entry away. This probably should be settable from user space.
45 */
46#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */
47
48/* Structures associated with the root directory hash table */
49
50#define AUTOFS_HASH_SIZE 67
51
52struct autofs_dir_ent {
53 int hash;
54 char *name;
55 int len;
56 ino_t ino;
57 struct dentry *dentry;
58 /* Linked list of entries */
59 struct autofs_dir_ent *next;
60 struct autofs_dir_ent **back;
61 /* The following entries are for the expiry system */
62 unsigned long last_usage;
63 struct list_head exp;
64};
65
66struct autofs_dirhash {
67 struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
68 struct list_head expiry_head;
69};
70
71struct autofs_wait_queue {
72 wait_queue_head_t queue;
73 struct autofs_wait_queue *next;
74 autofs_wqt_t wait_queue_token;
75 /* We use the following to see what we are waiting for */
76 int hash;
77 int len;
78 char *name;
79 /* This is for status reporting upon return */
80 int status;
81 int wait_ctr;
82};
83
84struct autofs_symlink {
85 char *data;
86 int len;
87 time_t mtime;
88};
89
90#define AUTOFS_MAX_SYMLINKS 256
91
92#define AUTOFS_ROOT_INO 1
93#define AUTOFS_FIRST_SYMLINK 2
94#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
95
96#define AUTOFS_SYMLINK_BITMAP_LEN \
97 ((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
98
99#define AUTOFS_SBI_MAGIC 0x6d4a556d
100
101struct autofs_sb_info {
102 u32 magic;
103 struct file *pipe;
104 struct pid *oz_pgrp;
105 int catatonic;
106 struct super_block *sb;
107 unsigned long exp_timeout;
108 ino_t next_dir_ino;
109 struct autofs_wait_queue *queues; /* Wait queue pointer */
110 struct autofs_dirhash dirhash; /* Root directory hash */
111 struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
112 unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
113};
114
115static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
116{
117 return (struct autofs_sb_info *)(sb->s_fs_info);
118}
119
120/* autofs_oz_mode(): do we see the man behind the curtain? (The
121 processes which do manipulations for us in user space sees the raw
122 filesystem without "magic".) */
123
124static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
125 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
126}
127
128/* Hash operations */
129
130void autofs_initialize_hash(struct autofs_dirhash *);
131struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
132void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
133void autofs_hash_delete(struct autofs_dir_ent *);
134struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
135void autofs_hash_dputall(struct autofs_dirhash *);
136void autofs_hash_nuke(struct autofs_sb_info *);
137
138/* Expiration-handling functions */
139
140void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
141struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
142
143/* Operations structures */
144
145extern const struct inode_operations autofs_root_inode_operations;
146extern const struct inode_operations autofs_symlink_inode_operations;
147extern const struct file_operations autofs_root_operations;
148
149/* Initializing function */
150
151int autofs_fill_super(struct super_block *, void *, int);
152void autofs_kill_sb(struct super_block *sb);
153struct inode *autofs_iget(struct super_block *, unsigned long);
154
155/* Queue management functions */
156
157int autofs_wait(struct autofs_sb_info *,struct qstr *);
158int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
159void autofs_catatonic_mode(struct autofs_sb_info *);
160
161#ifdef DEBUG
162void autofs_say(const char *name, int len);
163#else
164#define autofs_say(n,l) ((void)0)
165#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e5..000000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/dirhash.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include "autofs_i.h"
14
15/* Functions for maintenance of expiry queue */
16
17static void autofs_init_usage(struct autofs_dirhash *dh,
18 struct autofs_dir_ent *ent)
19{
20 list_add_tail(&ent->exp, &dh->expiry_head);
21 ent->last_usage = jiffies;
22}
23
24static void autofs_delete_usage(struct autofs_dir_ent *ent)
25{
26 list_del(&ent->exp);
27}
28
29void autofs_update_usage(struct autofs_dirhash *dh,
30 struct autofs_dir_ent *ent)
31{
32 autofs_delete_usage(ent); /* Unlink from current position */
33 autofs_init_usage(dh,ent); /* Relink at queue tail */
34}
35
36struct autofs_dir_ent *autofs_expire(struct super_block *sb,
37 struct autofs_sb_info *sbi,
38 struct vfsmount *mnt)
39{
40 struct autofs_dirhash *dh = &sbi->dirhash;
41 struct autofs_dir_ent *ent;
42 unsigned long timeout = sbi->exp_timeout;
43
44 while (1) {
45 struct path path;
46 int umount_ok;
47
48 if ( list_empty(&dh->expiry_head) || sbi->catatonic )
49 return NULL; /* No entries */
50 /* We keep the list sorted by last_usage and want old stuff */
51 ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
52 if (jiffies - ent->last_usage < timeout)
53 break;
54 /* Move to end of list in case expiry isn't desirable */
55 autofs_update_usage(dh, ent);
56
57 /* Check to see that entry is expirable */
58 if ( ent->ino < AUTOFS_FIRST_DIR_INO )
59 return ent; /* Symlinks are always expirable */
60
61 /* Get the dentry for the autofs subdirectory */
62 path.dentry = ent->dentry;
63
64 if (!path.dentry) {
65 /* Should only happen in catatonic mode */
66 printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
67 autofs_delete_usage(ent);
68 continue;
69 }
70
71 if (!path.dentry->d_inode) {
72 dput(path.dentry);
73 printk("autofs: negative dentry on expiry queue: %s\n",
74 ent->name);
75 autofs_delete_usage(ent);
76 continue;
77 }
78
79 /* Make sure entry is mounted and unused; note that dentry will
80 point to the mounted-on-top root. */
81 if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
82 !d_mountpoint(path.dentry)) {
83 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
84 continue;
85 }
86 path.mnt = mnt;
87 path_get(&path);
88 if (!follow_down(&path)) {
89 path_put(&path);
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue;
92 }
93 while (d_mountpoint(path.dentry) && follow_down(&path))
94 ;
95 umount_ok = may_umount(path.mnt);
96 path_put(&path);
97
98 if (umount_ok) {
99 DPRINTK(("autofs: signaling expire on %s\n", ent->name));
100 return ent; /* Expirable! */
101 }
102 DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
103 }
104 return NULL; /* No expirable entries */
105}
106
107void autofs_initialize_hash(struct autofs_dirhash *dh) {
108 memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
109 INIT_LIST_HEAD(&dh->expiry_head);
110}
111
112struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
113{
114 struct autofs_dir_ent *dhn;
115
116 DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
117 autofs_say(name->name,name->len);
118
119 for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
120 if ( name->hash == dhn->hash &&
121 name->len == dhn->len &&
122 !memcmp(name->name, dhn->name, name->len) )
123 break;
124 }
125
126 return dhn;
127}
128
129void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
130{
131 struct autofs_dir_ent **dhnp;
132
133 DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
134 autofs_say(ent->name,ent->len);
135
136 autofs_init_usage(dh,ent);
137 if (ent->dentry)
138 dget(ent->dentry);
139
140 dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
141 ent->next = *dhnp;
142 ent->back = dhnp;
143 *dhnp = ent;
144 if ( ent->next )
145 ent->next->back = &(ent->next);
146}
147
148void autofs_hash_delete(struct autofs_dir_ent *ent)
149{
150 *(ent->back) = ent->next;
151 if ( ent->next )
152 ent->next->back = ent->back;
153
154 autofs_delete_usage(ent);
155
156 if ( ent->dentry )
157 dput(ent->dentry);
158 kfree(ent->name);
159 kfree(ent);
160}
161
162/*
163 * Used by readdir(). We must validate "ptr", so we can't simply make it
164 * a pointer. Values below 0xffff are reserved; calling with any value
165 * <= 0x10000 will return the first entry found.
166 *
167 * "last" can be NULL or the value returned by the last search *if* we
168 * want the next sequential entry.
169 */
170struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
171 off_t *ptr, struct autofs_dir_ent *last)
172{
173 int bucket, ecount, i;
174 struct autofs_dir_ent *ent;
175
176 bucket = (*ptr >> 16) - 1;
177 ecount = *ptr & 0xffff;
178
179 if ( bucket < 0 ) {
180 bucket = ecount = 0;
181 }
182
183 DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
184
185 ent = last ? last->next : NULL;
186
187 if ( ent ) {
188 ecount++;
189 } else {
190 while ( bucket < AUTOFS_HASH_SIZE ) {
191 ent = dh->h[bucket];
192 for ( i = ecount ; ent && i ; i-- )
193 ent = ent->next;
194
195 if (ent) {
196 ecount++; /* Point to *next* entry */
197 break;
198 }
199
200 bucket++; ecount = 0;
201 }
202 }
203
204#ifdef DEBUG
205 if ( !ent )
206 printk("autofs_hash_enum: nothing found\n");
207 else {
208 printk("autofs_hash_enum: found hash %08x, name", ent->hash);
209 autofs_say(ent->name,ent->len);
210 }
211#endif
212
213 *ptr = ((bucket+1) << 16) + ecount;
214 return ent;
215}
216
217/* Iterate over all the ents, and remove all dentry pointers. Used on
218 entering catatonic mode, in order to make the filesystem unmountable. */
219void autofs_hash_dputall(struct autofs_dirhash *dh)
220{
221 int i;
222 struct autofs_dir_ent *ent;
223
224 for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
225 for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
226 if ( ent->dentry ) {
227 dput(ent->dentry);
228 ent->dentry = NULL;
229 }
230 }
231 }
232}
233
234/* Delete everything. This is used on filesystem destruction, so we
235 make no attempt to keep the pointers valid */
236void autofs_hash_nuke(struct autofs_sb_info *sbi)
237{
238 int i;
239 struct autofs_dir_ent *ent, *nent;
240
241 for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
242 for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
243 nent = ent->next;
244 if ( ent->dentry )
245 dput(ent->dentry);
246 kfree(ent->name);
247 kfree(ent);
248 }
249 }
250}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f37..000000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/init.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include "autofs_i.h"
16
17static int autofs_get_sb(struct file_system_type *fs_type,
18 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
19{
20 return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
21}
22
23static struct file_system_type autofs_fs_type = {
24 .owner = THIS_MODULE,
25 .name = "autofs",
26 .get_sb = autofs_get_sb,
27 .kill_sb = autofs_kill_sb,
28};
29
30static int __init init_autofs_fs(void)
31{
32 return register_filesystem(&autofs_fs_type);
33}
34
35static void __exit exit_autofs_fs(void)
36{
37 unregister_filesystem(&autofs_fs_type);
38}
39
40module_init(init_autofs_fs);
41module_exit(exit_autofs_fs);
42
43#ifdef DEBUG
44void autofs_say(const char *name, int len)
45{
46 printk("(%d: ", len);
47 while ( len-- )
48 printk("%c", *name++);
49 printk(")\n");
50}
51#endif
52MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e26..000000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/inode.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/kernel.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/file.h>
17#include <linux/parser.h>
18#include <linux/bitops.h>
19#include <linux/magic.h>
20#include "autofs_i.h"
21#include <linux/module.h>
22
23void autofs_kill_sb(struct super_block *sb)
24{
25 struct autofs_sb_info *sbi = autofs_sbi(sb);
26 unsigned int n;
27
28 /*
29 * In the event of a failure in get_sb_nodev the superblock
30 * info is not present so nothing else has been setup, so
31 * just call kill_anon_super when we are called from
32 * deactivate_super.
33 */
34 if (!sbi)
35 goto out_kill_sb;
36
37 if (!sbi->catatonic)
38 autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
39
40 put_pid(sbi->oz_pgrp);
41
42 autofs_hash_nuke(sbi);
43 for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
44 if (test_bit(n, sbi->symlink_bitmap))
45 kfree(sbi->symlink[n].data);
46 }
47
48 kfree(sb->s_fs_info);
49
50out_kill_sb:
51 DPRINTK(("autofs: shutting down\n"));
52 kill_anon_super(sb);
53}
54
55static const struct super_operations autofs_sops = {
56 .statfs = simple_statfs,
57 .show_options = generic_show_options,
58};
59
60enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
61
62static const match_table_t autofs_tokens = {
63 {Opt_fd, "fd=%u"},
64 {Opt_uid, "uid=%u"},
65 {Opt_gid, "gid=%u"},
66 {Opt_pgrp, "pgrp=%u"},
67 {Opt_minproto, "minproto=%u"},
68 {Opt_maxproto, "maxproto=%u"},
69 {Opt_err, NULL}
70};
71
72static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
73 pid_t *pgrp, int *minproto, int *maxproto)
74{
75 char *p;
76 substring_t args[MAX_OPT_ARGS];
77 int option;
78
79 *uid = current_uid();
80 *gid = current_gid();
81 *pgrp = task_pgrp_nr(current);
82
83 *minproto = *maxproto = AUTOFS_PROTO_VERSION;
84
85 *pipefd = -1;
86
87 if (!options)
88 return 1;
89
90 while ((p = strsep(&options, ",")) != NULL) {
91 int token;
92 if (!*p)
93 continue;
94
95 token = match_token(p, autofs_tokens, args);
96 switch (token) {
97 case Opt_fd:
98 if (match_int(&args[0], &option))
99 return 1;
100 *pipefd = option;
101 break;
102 case Opt_uid:
103 if (match_int(&args[0], &option))
104 return 1;
105 *uid = option;
106 break;
107 case Opt_gid:
108 if (match_int(&args[0], &option))
109 return 1;
110 *gid = option;
111 break;
112 case Opt_pgrp:
113 if (match_int(&args[0], &option))
114 return 1;
115 *pgrp = option;
116 break;
117 case Opt_minproto:
118 if (match_int(&args[0], &option))
119 return 1;
120 *minproto = option;
121 break;
122 case Opt_maxproto:
123 if (match_int(&args[0], &option))
124 return 1;
125 *maxproto = option;
126 break;
127 default:
128 return 1;
129 }
130 }
131 return (*pipefd < 0);
132}
133
134int autofs_fill_super(struct super_block *s, void *data, int silent)
135{
136 struct inode * root_inode;
137 struct dentry * root;
138 struct file * pipe;
139 int pipefd;
140 struct autofs_sb_info *sbi;
141 int minproto, maxproto;
142 pid_t pgid;
143
144 save_mount_options(s, data);
145
146 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
147 if (!sbi)
148 goto fail_unlock;
149 DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
150
151 s->s_fs_info = sbi;
152 sbi->magic = AUTOFS_SBI_MAGIC;
153 sbi->pipe = NULL;
154 sbi->catatonic = 1;
155 sbi->exp_timeout = 0;
156 autofs_initialize_hash(&sbi->dirhash);
157 sbi->queues = NULL;
158 memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
159 sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
160 s->s_blocksize = 1024;
161 s->s_blocksize_bits = 10;
162 s->s_magic = AUTOFS_SUPER_MAGIC;
163 s->s_op = &autofs_sops;
164 s->s_time_gran = 1;
165 sbi->sb = s;
166
167 root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
168 if (IS_ERR(root_inode))
169 goto fail_free;
170 root = d_alloc_root(root_inode);
171 pipe = NULL;
172
173 if (!root)
174 goto fail_iput;
175
176 /* Can this call block? - WTF cares? s is locked. */
177 if (parse_options(data, &pipefd, &root_inode->i_uid,
178 &root_inode->i_gid, &pgid, &minproto,
179 &maxproto)) {
180 printk("autofs: called with bogus options\n");
181 goto fail_dput;
182 }
183
184 /* Couldn't this be tested earlier? */
185 if (minproto > AUTOFS_PROTO_VERSION ||
186 maxproto < AUTOFS_PROTO_VERSION) {
187 printk("autofs: kernel does not match daemon version\n");
188 goto fail_dput;
189 }
190
191 DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
192 sbi->oz_pgrp = find_get_pid(pgid);
193
194 if (!sbi->oz_pgrp) {
195 printk("autofs: could not find process group %d\n", pgid);
196 goto fail_dput;
197 }
198
199 pipe = fget(pipefd);
200
201 if (!pipe) {
202 printk("autofs: could not open pipe file descriptor\n");
203 goto fail_put_pid;
204 }
205
206 if (!pipe->f_op || !pipe->f_op->write)
207 goto fail_fput;
208 sbi->pipe = pipe;
209 sbi->catatonic = 0;
210
211 /*
212 * Success! Install the root dentry now to indicate completion.
213 */
214 s->s_root = root;
215 return 0;
216
217fail_fput:
218 printk("autofs: pipe file descriptor does not contain proper ops\n");
219 fput(pipe);
220fail_put_pid:
221 put_pid(sbi->oz_pgrp);
222fail_dput:
223 dput(root);
224 goto fail_free;
225fail_iput:
226 printk("autofs: get root dentry failed\n");
227 iput(root_inode);
228fail_free:
229 kfree(sbi);
230 s->s_fs_info = NULL;
231fail_unlock:
232 return -EINVAL;
233}
234
235struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
236{
237 unsigned int n;
238 struct autofs_sb_info *sbi = autofs_sbi(sb);
239 struct inode *inode;
240
241 inode = iget_locked(sb, ino);
242 if (!inode)
243 return ERR_PTR(-ENOMEM);
244 if (!(inode->i_state & I_NEW))
245 return inode;
246
247 /* Initialize to the default case (stub directory) */
248
249 inode->i_op = &simple_dir_inode_operations;
250 inode->i_fop = &simple_dir_operations;
251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
252 inode->i_nlink = 2;
253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
254
255 if (ino == AUTOFS_ROOT_INO) {
256 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
257 inode->i_op = &autofs_root_inode_operations;
258 inode->i_fop = &autofs_root_operations;
259 goto done;
260 }
261
262 inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
263 inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
264
265 if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
266 /* Symlink inode - should be in symlink list */
267 struct autofs_symlink *sl;
268
269 n = ino - AUTOFS_FIRST_SYMLINK;
270 if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
271 printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
272 goto done;
273 }
274
275 inode->i_op = &autofs_symlink_inode_operations;
276 sl = &sbi->symlink[n];
277 inode->i_private = sl;
278 inode->i_mode = S_IFLNK | S_IRWXUGO;
279 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
280 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
281 inode->i_size = sl->len;
282 inode->i_nlink = 1;
283 }
284
285done:
286 unlock_new_inode(inode);
287 return inode;
288}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 11b1ea786d00..000000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,643 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/root.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/stat.h>
16#include <linux/slab.h>
17#include <linux/param.h>
18#include <linux/time.h>
19#include <linux/compat.h>
20#include <linux/smp_lock.h>
21#include "autofs_i.h"
22
23static int autofs_root_readdir(struct file *,void *,filldir_t);
24static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
25static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
26static int autofs_root_unlink(struct inode *,struct dentry *);
27static int autofs_root_rmdir(struct inode *,struct dentry *);
28static int autofs_root_mkdir(struct inode *,struct dentry *,int);
29static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
30static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
31
32const struct file_operations autofs_root_operations = {
33 .llseek = generic_file_llseek,
34 .read = generic_read_dir,
35 .readdir = autofs_root_readdir,
36 .unlocked_ioctl = autofs_root_ioctl,
37#ifdef CONFIG_COMPAT
38 .compat_ioctl = autofs_root_compat_ioctl,
39#endif
40};
41
42const struct inode_operations autofs_root_inode_operations = {
43 .lookup = autofs_root_lookup,
44 .unlink = autofs_root_unlink,
45 .symlink = autofs_root_symlink,
46 .mkdir = autofs_root_mkdir,
47 .rmdir = autofs_root_rmdir,
48};
49
50static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
51{
52 struct autofs_dir_ent *ent = NULL;
53 struct autofs_dirhash *dirhash;
54 struct autofs_sb_info *sbi;
55 struct inode * inode = filp->f_path.dentry->d_inode;
56 off_t onr, nr;
57
58 lock_kernel();
59
60 sbi = autofs_sbi(inode->i_sb);
61 dirhash = &sbi->dirhash;
62 nr = filp->f_pos;
63
64 switch(nr)
65 {
66 case 0:
67 if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
68 goto out;
69 filp->f_pos = ++nr;
70 /* fall through */
71 case 1:
72 if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
73 goto out;
74 filp->f_pos = ++nr;
75 /* fall through */
76 default:
77 while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
78 if (!ent->dentry || d_mountpoint(ent->dentry)) {
79 if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
80 goto out;
81 filp->f_pos = nr;
82 }
83 }
84 break;
85 }
86
87out:
88 unlock_kernel();
89 return 0;
90}
91
92static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
93{
94 struct inode * inode;
95 struct autofs_dir_ent *ent;
96 int status = 0;
97
98 if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
99 do {
100 if (status && dentry->d_inode) {
101 if (status != -ENOENT)
102 printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
103 return 0; /* Try to get the kernel to invalidate this dentry */
104 }
105
106 /* Turn this into a real negative dentry? */
107 if (status == -ENOENT) {
108 dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
109 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
110 return 1;
111 } else if (status) {
112 /* Return a negative dentry, but leave it "pending" */
113 return 1;
114 }
115 status = autofs_wait(sbi, &dentry->d_name);
116 } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
117 }
118
119 /* Abuse this field as a pointer to the directory entry, used to
120 find the expire list pointers */
121 dentry->d_time = (unsigned long) ent;
122
123 if (!dentry->d_inode) {
124 inode = autofs_iget(sb, ent->ino);
125 if (IS_ERR(inode)) {
126 /* Failed, but leave pending for next time */
127 return 1;
128 }
129 dentry->d_inode = inode;
130 }
131
132 /* If this is a directory that isn't a mount point, bitch at the
133 daemon and fix it in user space */
134 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
135 return !autofs_wait(sbi, &dentry->d_name);
136 }
137
138 /* We don't update the usages for the autofs daemon itself, this
139 is necessary for recursive autofs mounts */
140 if (!autofs_oz_mode(sbi)) {
141 autofs_update_usage(&sbi->dirhash,ent);
142 }
143
144 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
145 return 1;
146}
147
148
149/*
150 * Revalidate is called on every cache lookup. Some of those
151 * cache lookups may actually happen while the dentry is not
152 * yet completely filled in, and revalidate has to delay such
153 * lookups..
154 */
155static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
156{
157 struct inode * dir;
158 struct autofs_sb_info *sbi;
159 struct autofs_dir_ent *ent;
160 int res;
161
162 lock_kernel();
163 dir = dentry->d_parent->d_inode;
164 sbi = autofs_sbi(dir->i_sb);
165
166 /* Pending dentry */
167 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
168 if (autofs_oz_mode(sbi))
169 res = 1;
170 else
171 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
172 unlock_kernel();
173 return res;
174 }
175
176 /* Negative dentry.. invalidate if "old" */
177 if (!dentry->d_inode) {
178 unlock_kernel();
179 return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
180 }
181
182 /* Check for a non-mountpoint directory */
183 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
184 if (autofs_oz_mode(sbi))
185 res = 1;
186 else
187 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
188 unlock_kernel();
189 return res;
190 }
191
192 /* Update the usage list */
193 if (!autofs_oz_mode(sbi)) {
194 ent = (struct autofs_dir_ent *) dentry->d_time;
195 if (ent)
196 autofs_update_usage(&sbi->dirhash,ent);
197 }
198 unlock_kernel();
199 return 1;
200}
201
202static const struct dentry_operations autofs_dentry_operations = {
203 .d_revalidate = autofs_revalidate,
204};
205
206static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
207{
208 struct autofs_sb_info *sbi;
209 int oz_mode;
210
211 DPRINTK(("autofs_root_lookup: name = "));
212 lock_kernel();
213 autofs_say(dentry->d_name.name,dentry->d_name.len);
214
215 if (dentry->d_name.len > NAME_MAX) {
216 unlock_kernel();
217 return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
218 }
219
220 sbi = autofs_sbi(dir->i_sb);
221
222 oz_mode = autofs_oz_mode(sbi);
223 DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
224 "oz_mode = %d\n", task_pid_nr(current),
225 task_pgrp_nr(current), sbi->catatonic,
226 oz_mode));
227
228 /*
229 * Mark the dentry incomplete, but add it. This is needed so
230 * that the VFS layer knows about the dentry, and we can count
231 * on catching any lookups through the revalidate.
232 *
233 * Let all the hard work be done by the revalidate function that
234 * needs to be able to do this anyway..
235 *
236 * We need to do this before we release the directory semaphore.
237 */
238 dentry->d_op = &autofs_dentry_operations;
239 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
240 d_add(dentry, NULL);
241
242 mutex_unlock(&dir->i_mutex);
243 autofs_revalidate(dentry, nd);
244 mutex_lock(&dir->i_mutex);
245
246 /*
247 * If we are still pending, check if we had to handle
248 * a signal. If so we can force a restart..
249 */
250 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
251 /* See if we were interrupted */
252 if (signal_pending(current)) {
253 sigset_t *sigset = &current->pending.signal;
254 if (sigismember (sigset, SIGKILL) ||
255 sigismember (sigset, SIGQUIT) ||
256 sigismember (sigset, SIGINT)) {
257 unlock_kernel();
258 return ERR_PTR(-ERESTARTNOINTR);
259 }
260 }
261 }
262 unlock_kernel();
263
264 /*
265 * If this dentry is unhashed, then we shouldn't honour this
266 * lookup even if the dentry is positive. Returning ENOENT here
267 * doesn't do the right thing for all system calls, but it should
268 * be OK for the operations we permit from an autofs.
269 */
270 if (dentry->d_inode && d_unhashed(dentry))
271 return ERR_PTR(-ENOENT);
272
273 return NULL;
274}
275
276static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
277{
278 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
279 struct autofs_dirhash *dh = &sbi->dirhash;
280 struct autofs_dir_ent *ent;
281 unsigned int n;
282 int slsize;
283 struct autofs_symlink *sl;
284 struct inode *inode;
285
286 DPRINTK(("autofs_root_symlink: %s <- ", symname));
287 autofs_say(dentry->d_name.name,dentry->d_name.len);
288
289 lock_kernel();
290 if (!autofs_oz_mode(sbi)) {
291 unlock_kernel();
292 return -EACCES;
293 }
294
295 if (autofs_hash_lookup(dh, &dentry->d_name)) {
296 unlock_kernel();
297 return -EEXIST;
298 }
299
300 n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
301 if (n >= AUTOFS_MAX_SYMLINKS) {
302 unlock_kernel();
303 return -ENOSPC;
304 }
305
306 set_bit(n,sbi->symlink_bitmap);
307 sl = &sbi->symlink[n];
308 sl->len = strlen(symname);
309 sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
310 if (!sl->data) {
311 clear_bit(n,sbi->symlink_bitmap);
312 unlock_kernel();
313 return -ENOSPC;
314 }
315
316 ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
317 if (!ent) {
318 kfree(sl->data);
319 clear_bit(n,sbi->symlink_bitmap);
320 unlock_kernel();
321 return -ENOSPC;
322 }
323
324 ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
325 if (!ent->name) {
326 kfree(sl->data);
327 kfree(ent);
328 clear_bit(n,sbi->symlink_bitmap);
329 unlock_kernel();
330 return -ENOSPC;
331 }
332
333 memcpy(sl->data,symname,slsize);
334 sl->mtime = get_seconds();
335
336 ent->ino = AUTOFS_FIRST_SYMLINK + n;
337 ent->hash = dentry->d_name.hash;
338 memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
339 ent->dentry = NULL; /* We don't keep the dentry for symlinks */
340
341 autofs_hash_insert(dh,ent);
342
343 inode = autofs_iget(dir->i_sb, ent->ino);
344 if (IS_ERR(inode))
345 return PTR_ERR(inode);
346
347 d_instantiate(dentry, inode);
348 unlock_kernel();
349 return 0;
350}
351
352/*
353 * NOTE!
354 *
355 * Normal filesystems would do a "d_delete()" to tell the VFS dcache
356 * that the file no longer exists. However, doing that means that the
357 * VFS layer can turn the dentry into a negative dentry, which we
358 * obviously do not want (we're dropping the entry not because it
359 * doesn't exist, but because it has timed out).
360 *
361 * Also see autofs_root_rmdir()..
362 */
363static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
364{
365 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
366 struct autofs_dirhash *dh = &sbi->dirhash;
367 struct autofs_dir_ent *ent;
368 unsigned int n;
369
370 /* This allows root to remove symlinks */
371 lock_kernel();
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	n = ent->ino - AUTOFS_FIRST_SYMLINK;
-	if (n >= AUTOFS_MAX_SYMLINKS) {
-		unlock_kernel();
-		return -EISDIR;	/* It's a directory, dummy */
-	}
-	if (!test_bit(n, sbi->symlink_bitmap)) {
-		unlock_kernel();
-		return -EINVAL;	/* Nonexistent symlink? Shouldn't happen */
-	}
-
-	dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
-	autofs_hash_delete(ent);
-	clear_bit(n, sbi->symlink_bitmap);
-	kfree(sbi->symlink[n].data);
-	d_drop(dentry);
-
-	unlock_kernel();
-	return 0;
-}
-
-static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
-		unlock_kernel();
-		return -ENOTDIR; /* Not a directory */
-	}
-
-	if (ent->dentry != dentry) {
-		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
-	}
-
-	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
-	autofs_hash_delete(ent);
-	drop_nlink(dir);
-	d_drop(dentry);
-	unlock_kernel();
-
-	return 0;
-}
-
-static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	struct inode *inode;
-	ino_t ino;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (ent) {
-		unlock_kernel();
-		return -EEXIST;
-	}
-
-	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
-		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
-		unlock_kernel();
-		return -ENOSPC;
-	}
-	ino = sbi->next_dir_ino++;
-
-	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if (!ent->name) {
-		kfree(ent);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->hash = dentry->d_name.hash;
-	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-	ent->ino = ino;
-	ent->dentry = dentry;
-	autofs_hash_insert(dh, ent);
-
-	inc_nlink(dir);
-
-	inode = autofs_iget(dir->i_sb, ino);
-	if (IS_ERR(inode)) {
-		drop_nlink(dir);
-		return PTR_ERR(inode);
-	}
-
-	d_instantiate(dentry, inode);
-	unlock_kernel();
-
-	return 0;
-}
-
-/* Get/set timeout ioctl() operation */
-#ifdef CONFIG_COMPAT
-static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
-						unsigned int __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > UINT_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-#endif
-
-static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
-					 unsigned long __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > ULONG_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-
-/* Return protocol version */
-static inline int autofs_get_protover(int __user *p)
-{
-	return put_user(AUTOFS_PROTO_VERSION, p);
-}
-
-/* Perform an expiry operation */
-static inline int autofs_expire_run(struct super_block *sb,
-				    struct autofs_sb_info *sbi,
-				    struct vfsmount *mnt,
-				    struct autofs_packet_expire __user *pkt_p)
-{
-	struct autofs_dir_ent *ent;
-	struct autofs_packet_expire pkt;
-
-	memset(&pkt, 0, sizeof(pkt));
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_expire;
-
-	if (!sbi->exp_timeout || !(ent = autofs_expire(sb, sbi, mnt)))
-		return -EAGAIN;
-
-	pkt.len = ent->len;
-	memcpy(pkt.name, ent->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * ioctl()'s on the root directory is the chief method for the daemon to
- * generate kernel reactions
- */
-static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
-				unsigned int cmd, unsigned long arg)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
-	void __user *argp = (void __user *)arg;
-
-	DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", cmd, arg, sbi, task_pgrp_nr(current)));
-
-	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-	    _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
-		return -ENOTTY;
-
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	switch (cmd) {
-	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
-		return autofs_wait_release(sbi, (autofs_wqt_t)arg, 0);
-	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
-		return autofs_wait_release(sbi, (autofs_wqt_t)arg, -ENOENT);
-	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
-		autofs_catatonic_mode(sbi);
-		return 0;
-	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
-		return autofs_get_protover(argp);
-#ifdef CONFIG_COMPAT
-	case AUTOFS_IOC_SETTIMEOUT32:
-		return autofs_compat_get_set_timeout(sbi, argp);
-#endif
-	case AUTOFS_IOC_SETTIMEOUT:
-		return autofs_get_set_timeout(sbi, argp);
-	case AUTOFS_IOC_EXPIRE:
-		return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
-					 argp);
-	default:
-		return -ENOSYS;
-	}
-
-}
-
-static long autofs_root_ioctl(struct file *filp,
-			      unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	lock_kernel();
-	ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
-				   filp, cmd, arg);
-	unlock_kernel();
-
-	return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long autofs_root_compat_ioctl(struct file *filp,
-				     unsigned int cmd, unsigned long arg)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int ret;
-
-	lock_kernel();
-	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
-		ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
-	else
-		ret = autofs_do_root_ioctl(inode, filp, cmd,
-			(unsigned long)compat_ptr(arg));
-	unlock_kernel();
-
-	return ret;
-}
-#endif
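The root-directory ioctls removed above were driven from userspace by the automount daemon. As a rough sketch of that side of the protocol (hypothetical code, assuming a descriptor opened on the automount point and the userspace <linux/auto_fs.h> header; error handling trimmed):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_fs.h>

/* Hypothetical daemon-side sketch: configure the expiry timeout and ask
 * the kernel for one expirable entry, mirroring autofs_get_set_timeout()
 * and autofs_expire_run() above. */
int poll_expire(const char *mountpoint)
{
	unsigned long timeout = 300;	/* new timeout in seconds; the old
					 * value is written back by the kernel */
	struct autofs_packet_expire pkt;
	int fd = open(mountpoint, O_RDONLY);

	if (fd == -1)
		return -1;
	if (ioctl(fd, AUTOFS_IOC_SETTIMEOUT, &timeout) == -1)
		goto fail;
	/* EAGAIN from AUTOFS_IOC_EXPIRE means nothing has timed out yet */
	if (ioctl(fd, AUTOFS_IOC_EXPIRE, &pkt) == 0)
		printf("expire candidate: %s\n", pkt.name);
	close(fd);
	return 0;
fail:
	close(fd);
	return -1;
}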
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce2..000000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Nothing to release.. */
-static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *s = ((struct autofs_symlink *)dentry->d_inode->i_private)->data;
-	nd_set_link(nd, s);
-	return NULL;
-}
-
-const struct inode_operations autofs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= autofs_follow_link
-};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f0..000000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
-#include <linux/file.h>
-#include "autofs_i.h"
-
-/* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
-static autofs_wqt_t autofs_next_wait_queue = 1;
-
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-
-void autofs_catatonic_mode(struct autofs_sb_info *sbi)
-{
-	struct autofs_wait_queue *wq, *nwq;
-
-	DPRINTK(("autofs: entering catatonic mode\n"));
-
-	sbi->catatonic = 1;
-	wq = sbi->queues;
-	sbi->queues = NULL;	/* Erase all wait queues */
-	while (wq) {
-		nwq = wq->next;
-		wq->status = -ENOENT; /* Magic is gone - report failure */
-		kfree(wq->name);
-		wq->name = NULL;
-		wake_up(&wq->queue);
-		wq = nwq;
-	}
-	fput(sbi->pipe);	/* Close the pipe */
-	sbi->pipe = NULL;
-	autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
-}
-
-static int autofs_write(struct file *file, const void *addr, int bytes)
-{
-	unsigned long sigpipe, flags;
-	mm_segment_t fs;
-	const char *data = (const char *)addr;
-	ssize_t wr = 0;
-
-	/** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
-	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
-
-	/* Save pointer to user space and point back to kernel space */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	while (bytes &&
-	       (wr = file->f_op->write(file, data, bytes, &file->f_pos)) > 0) {
-		data += wr;
-		bytes -= wr;
-	}
-
-	set_fs(fs);
-
-	/* Keep the currently executing process from receiving a
-	   SIGPIPE unless it was already supposed to get one */
-	if (wr == -EPIPE && !sigpipe) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		sigdelset(&current->pending.signal, SIGPIPE);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-	}
-
-	return (bytes > 0);
-}
-
-static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
-{
-	struct autofs_packet_missing pkt;
-
-	DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
-	autofs_say(wq->name, wq->len);
-
-	memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_missing;
-	pkt.wait_queue_token = wq->wait_queue_token;
-	pkt.len = wq->len;
-	memcpy(pkt.name, wq->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if (autofs_write(sbi->pipe, &pkt, sizeof(struct autofs_packet_missing)))
-		autofs_catatonic_mode(sbi);
-}
-
-int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
-{
-	struct autofs_wait_queue *wq;
-	int status;
-
-	/* In catatonic mode, we don't wait for nobody */
-	if (sbi->catatonic)
-		return -ENOENT;
-
-	/* We shouldn't be able to get here, but just in case */
-	if (name->len > NAME_MAX)
-		return -ENOENT;
-
-	for (wq = sbi->queues; wq; wq = wq->next) {
-		if (wq->hash == name->hash &&
-		    wq->len == name->len &&
-		    wq->name && !memcmp(wq->name, name->name, name->len))
-			break;
-	}
-
-	if (!wq) {
-		/* Create a new wait queue */
-		wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
-		if (!wq)
-			return -ENOMEM;
-
-		wq->name = kmalloc(name->len, GFP_KERNEL);
-		if (!wq->name) {
-			kfree(wq);
-			return -ENOMEM;
-		}
-		wq->wait_queue_token = autofs_next_wait_queue++;
-		init_waitqueue_head(&wq->queue);
-		wq->hash = name->hash;
-		wq->len = name->len;
-		wq->status = -EINTR; /* Status return if interrupted */
-		memcpy(wq->name, name->name, name->len);
-		wq->next = sbi->queues;
-		sbi->queues = wq;
-
-		/* autofs_notify_daemon() may block */
-		wq->wait_ctr = 2;
-		autofs_notify_daemon(sbi, wq);
-	} else
-		wq->wait_ctr++;
-
-	/* wq->name is NULL if and only if the lock is already released */
-
-	if (sbi->catatonic) {
-		/* We might have slept, so check again for catatonic mode */
-		wq->status = -ENOENT;
-		kfree(wq->name);
-		wq->name = NULL;
-	}
-
-	if (wq->name) {
-		/* Block all but "shutdown" signals while waiting */
-		sigset_t sigmask;
-
-		siginitsetinv(&sigmask, SHUTDOWN_SIGS);
-		sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
-
-		interruptible_sleep_on(&wq->queue);
-
-		sigprocmask(SIG_SETMASK, &sigmask, NULL);
-	} else {
-		DPRINTK(("autofs_wait: skipped sleeping\n"));
-	}
-
-	status = wq->status;
-
-	if (!--wq->wait_ctr)	/* Are we the last process to need status? */
-		kfree(wq);
-
-	return status;
-}
-
-
-int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
-{
-	struct autofs_wait_queue *wq, **wql;
-
-	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-		if (wq->wait_queue_token == wait_queue_token)
-			break;
-	}
-	if (!wq)
-		return -EINVAL;
-
-	*wql = wq->next;	/* Unlink from chain */
-	kfree(wq->name);
-	wq->name = NULL;	/* Do not wait on this queue */
-
-	wq->status = status;
-
-	if (!--wq->wait_ctr)	/* Is anyone still waiting for this guy? */
-		kfree(wq);
-	else
-		wake_up(&wq->queue);
-
-	return 0;
-}
-
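The wait-queue code deleted above is the kernel half of a simple request/response protocol: autofs_notify_daemon() writes an autofs_packet_missing to the daemon's pipe, and the daemon answers through the AUTOFS_IOC_READY or AUTOFS_IOC_FAIL root ioctls, which land in autofs_wait_release(). A hedged sketch of the daemon's receive loop (hypothetical; try_mount() stands in for the daemon's real mount logic):

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_fs.h>

extern int try_mount(const char *name);	/* hypothetical helper */

/* pipe_fd: read end of the pipe passed via the pipefd= mount option;
 * mnt_fd: descriptor open on the autofs mount point. */
void serve(int pipe_fd, int mnt_fd)
{
	struct autofs_packet_missing pkt;

	while (read(pipe_fd, &pkt, sizeof(pkt)) == sizeof(pkt)) {
		if (pkt.hdr.type != autofs_ptype_missing)
			continue;
		if (try_mount(pkt.name) == 0)
			ioctl(mnt_fd, AUTOFS_IOC_READY, pkt.wait_queue_token);
		else
			ioctl(mnt_fd, AUTOFS_IOC_FAIL, pkt.wait_queue_token);
	}
}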
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d7..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/auto_fs4.h>
 #include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/list.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do { \
 		current->pid, __func__, ##args);	\
 } while (0)
 
+extern spinlock_t autofs4_lock;
+
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
    structure. It holds a reference to the dentry, so dentries are never
@@ -85,18 +88,9 @@ struct autofs_info {
 
 	uid_t uid;
 	gid_t gid;
-
-	mode_t mode;
-	size_t size;
-
-	void (*free)(struct autofs_info *);
-	union {
-		const char *symlink;
-	} u;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
-#define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
 struct autofs_wait_queue {
@@ -173,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 	return 0;
 }
 
-static inline void autofs4_copy_atime(struct file *src, struct file *dst)
-{
-	dst->f_path.dentry->d_inode->i_atime =
-		src->f_path.dentry->d_inode->i_atime;
-	return;
-}
-
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
+struct inode *autofs4_get_inode(struct super_block *, mode_t);
 void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
@@ -209,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
 
 extern const struct inode_operations autofs4_symlink_inode_operations;
 extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct inode_operations autofs4_root_inode_operations;
-extern const struct inode_operations autofs4_indirect_root_inode_operations;
-extern const struct inode_operations autofs4_direct_root_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
+extern const struct dentry_operations autofs4_dentry_operations;
+
+/* VFS automount flags management functions */
+
+static inline void __managed_dentry_set_automount(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_set_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_automount(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_clear_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_transit(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_set_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_transit(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_clear_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+	dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+	dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
 
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
 
 /* Queue management functions */
 
@@ -226,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
-static inline int autofs4_follow_mount(struct path *path)
-{
-	int res = 0;
-
-	while (d_mountpoint(path->dentry)) {
-		int followed = follow_down(path);
-		if (!followed)
-			break;
-		res = 1;
-	}
-	return res;
-}
-
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
 {
 	return new_encode_dev(sbi->sb->s_dev);
@@ -254,17 +301,15 @@ static inline int simple_positive(struct dentry *dentry)
 	return dentry->d_inode && !d_unhashed(dentry);
 }
 
-static inline int __simple_empty(struct dentry *dentry)
-{
-	struct dentry *child;
-	int ret = 0;
-
-	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
-		if (simple_positive(child))
-			goto out;
-	ret = 1;
-out:
-	return ret;
+static inline void __autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+	}
+	return;
 }
 
 static inline void autofs4_add_expiring(struct dentry *dentry)
@@ -293,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
 	return;
 }
 
-void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
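The managed_dentry_* helpers added above wrap the two dcache flags that the VFS consults while walking a path: DCACHE_NEED_AUTOMOUNT sends the walk into ->d_automount(), and DCACHE_MANAGE_TRANSIT routes it through ->d_manage() first. A minimal sketch of how a filesystem plugs into these hooks (a hypothetical toy filesystem; the operation signatures follow the autofs4 declarations used in this series, and the parameter names are illustrative):

#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/mount.h>

static struct vfsmount *toyfs_d_automount(struct path *path)
{
	/* Called when a walk hits a dentry with DCACHE_NEED_AUTOMOUNT:
	 * return a vfsmount to attach, NULL to keep walking, or an
	 * ERR_PTR() to fail the lookup. */
	return NULL;
}

static int toyfs_d_manage(struct dentry *dentry, bool mounting_here,
			  bool rcu_walk)
{
	/* Called for DCACHE_MANAGE_TRANSIT dentries before crossing;
	 * returning 0 lets the walk continue, an error aborts it. */
	return 0;
}

static const struct dentry_operations toyfs_dentry_operations = {
	.d_automount	= toyfs_d_automount,
	.d_manage	= toyfs_d_manage,
};

A dentry opts in by setting the flags, e.g. with managed_dentry_set_managed() at instantiation time, much as autofs4_fill_super() does for trigger mounts later in this series.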
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22f..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (follow_down(&path))
+		if (follow_down_one(&path))
 			magic = path.mnt->mnt_sb->s_magic;
 	}
 
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
 	.unlocked_ioctl	 = autofs_dev_ioctl,
 	.compat_ioctl = autofs_dev_ioctl_compat,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb1..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 	if (ino == NULL)
 		return 0;
 
-	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING)
-		return 0;
-
 	if (!do_now) {
 		/* Too young to die */
 		if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	path_get(&path);
 
-	if (!follow_down(&path))
+	if (!follow_down_one(&path))
 		goto done;
 
 	if (is_autofs4_dentry(path.dentry)) {
@@ -91,24 +87,64 @@ done:
 }
 
 /*
- * Calculate next entry in top down tree traversal.
- * From next_mnt in namespace.c - elegant.
+ * Calculate and dget next entry in top down tree traversal.
  */
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+						struct dentry *root)
 {
-	struct list_head *next = p->d_subdirs.next;
+	struct list_head *next;
+	struct dentry *p, *ret;
+
+	if (prev == NULL)
+		return dget(root);
 
+	spin_lock(&autofs4_lock);
+relock:
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_subdirs.next;
 	if (next == &p->d_subdirs) {
 		while (1) {
-			if (p == root)
+			struct dentry *parent;
+
+			if (p == root) {
+				spin_unlock(&p->d_lock);
+				spin_unlock(&autofs4_lock);
+				dput(prev);
 				return NULL;
+			}
+
+			parent = p->d_parent;
+			if (!spin_trylock(&parent->d_lock)) {
+				spin_unlock(&p->d_lock);
+				cpu_relax();
+				goto relock;
+			}
+			spin_unlock(&p->d_lock);
 			next = p->d_u.d_child.next;
-			if (next != &p->d_parent->d_subdirs)
+			p = parent;
+			if (next != &parent->d_subdirs)
 				break;
-			p = p->d_parent;
 		}
 	}
-	return list_entry(next, struct dentry, d_u.d_child);
+	ret = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(ret)) {
+		spin_unlock(&p->d_lock);
+		p = ret;
+		goto again;
+	}
+	dget_dlock(ret);
+	spin_unlock(&ret->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&autofs4_lock);
+
+	dput(prev);
+
+	return ret;
 }
 
 /*
@@ -158,18 +194,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 	if (!simple_positive(top))
 		return 1;
 
-	spin_lock(&dcache_lock);
-	for (p = top; p; p = next_dentry(p, top)) {
-		/* Negative dentry - give up */
-		if (!simple_positive(p))
-			continue;
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, top))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
-		spin_unlock(&dcache_lock);
-
 		/*
 		 * Is someone visiting anywhere in the subtree ?
 		 * If there's no mount we need to check the usage
@@ -198,16 +227,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			else
 				ino_count++;
 
-			if (atomic_read(&p->d_count) > ino_count) {
+			if (p->d_count > ino_count) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
 			}
 		}
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
 	if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,32 +252,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 	DPRINTK("parent %p %.*s",
 		parent, (int)parent->d_name.len, parent->d_name.name);
 
-	spin_lock(&dcache_lock);
-	for (p = parent; p; p = next_dentry(p, parent)) {
-		/* Negative dentry - give up */
-		if (!simple_positive(p))
-			continue;
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, parent))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
-		spin_unlock(&dcache_lock);
-
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
 			if (autofs4_mount_busy(mnt, p))
-				goto cont;
+				continue;
 
 			/* Can we expire this guy */
 			if (autofs4_can_expire(p, timeout, do_now))
 				return p;
 		}
-cont:
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 }
 
@@ -264,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	struct autofs_info *ino;
 
 	if (!root)
 		return NULL;
@@ -272,17 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(root);
+	/* No point expiring a pending mount */
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		spin_unlock(&sbi->fs_lock);
+		return NULL;
+	}
+	managed_dentry_set_transit(root);
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
-		if (d_mountpoint(root)) {
-			ino->flags |= AUTOFS_INF_MOUNTPOINT;
-			root->d_mounted--;
-		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
+	managed_dentry_clear_transit(root);
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -302,8 +322,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
+	struct dentry *dentry;
 	struct dentry *expired = NULL;
-	struct list_head *next;
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 	struct autofs_info *ino;
@@ -315,25 +335,14 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	now = jiffies;
 	timeout = sbi->exp_timeout;
 
-	spin_lock(&dcache_lock);
-	next = root->d_subdirs.next;
-
-	/* On exit from the loop expire is set to a dgot dentry
-	 * to expire or it's NULL */
-	while (next != &root->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - give up */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		dentry = dget(dentry);
-		spin_unlock(&dcache_lock);
-
+	dentry = NULL;
+	while ((dentry = get_next_positive_dentry(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
+		/* No point expiring a pending mount */
+		if (ino->flags & AUTOFS_INF_PENDING)
+			goto cont;
+		managed_dentry_set_transit(dentry);
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -347,7 +356,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 		/* Path walk currently on this dentry? */
 		ino_count = atomic_read(&ino->count) + 2;
-		if (atomic_read(&dentry->d_count) > ino_count)
+		if (dentry->d_count > ino_count)
 			goto next;
 
 		/* Can we umount this guy */
@@ -369,7 +378,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			if (!exp_leaves) {
 				/* Path walk currently on this dentry? */
 				ino_count = atomic_read(&ino->count) + 1;
-				if (atomic_read(&dentry->d_count) > ino_count)
+				if (dentry->d_count > ino_count)
 					goto next;
 
 				if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +392,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			} else {
 				/* Path walk currently on this dentry? */
 				ino_count = atomic_read(&ino->count) + 1;
-				if (atomic_read(&dentry->d_count) > ino_count)
+				if (dentry->d_count > ino_count)
 					goto next;
 
 				expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -393,12 +402,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
+		managed_dentry_clear_transit(dentry);
+cont:
 		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		next = next->next;
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 
 found:
@@ -408,9 +415,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&expired->d_parent->d_lock);
+	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&expired->d_lock);
+	spin_unlock(&expired->d_parent->d_lock);
+	spin_unlock(&autofs4_lock);
 	return expired;
 }
 
@@ -473,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	if (!d_unhashed(dentry))
+		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -498,11 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			sb->s_root->d_mounted++;
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		spin_lock(&dentry->d_lock);
+		if (ret)
+			__managed_dentry_clear_transit(dentry);
+		else {
+			if ((IS_ROOT(dentry) ||
+			    (autofs_type_indirect(sbi->type) &&
+			     IS_ROOT(dentry->d_parent))) &&
+			    !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+				__managed_dentry_set_automount(dentry);
+		}
+		spin_unlock(&dentry->d_lock);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
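The expire rewrite above replaces the dcache_lock-guarded for-loops with get_next_positive_dentry(), which hands back each positive descendant with its own reference held and internally drops the reference on the previous one. Callers therefore all follow the same pattern, sketched here in isolation (a hypothetical helper; keep_this() is a stand-in predicate):

static struct dentry *find_first(struct dentry *root,
				 int (*keep_this)(struct dentry *))
{
	struct dentry *p = NULL;

	while ((p = get_next_positive_dentry(p, root))) {
		if (keep_this(p))
			return p;	/* reference held; caller must dput() */
	}
	return NULL;	/* normal exit: all references already dropped */
}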
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd8957..c038727b4050 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 
 static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
+	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void ino_lnkfree(struct autofs_info *ino)
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	if (ino->u.symlink) {
-		kfree(ino->u.symlink);
-		ino->u.symlink = NULL;
-	}
-}
-
-struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
-				     struct autofs_sb_info *sbi, mode_t mode)
-{
-	int reinit = 1;
-
-	if (ino == NULL) {
-		reinit = 0;
-		ino = kmalloc(sizeof(*ino), GFP_KERNEL);
-	}
-
-	if (ino == NULL)
-		return NULL;
-
-	if (!reinit) {
-		ino->flags = 0;
-		ino->inode = NULL;
-		ino->dentry = NULL;
-		ino->size = 0;
+	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
-		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
-		atomic_set(&ino->count, 0);
+		ino->last_used = jiffies;
+		ino->sbi = sbi;
 	}
+	return ino;
+}
 
+void autofs4_clean_ino(struct autofs_info *ino)
+{
 	ino->uid = 0;
 	ino->gid = 0;
-	ino->mode = mode;
 	ino->last_used = jiffies;
-
-	ino->sbi = sbi;
-
-	if (reinit && ino->free)
-		(ino->free)(ino);
-
-	memset(&ino->u, 0, sizeof(ino->u));
-
-	ino->free = NULL;
-
-	if (S_ISLNK(mode))
-		ino->free = ino_lnkfree;
-
-	return ino;
 }
 
 void autofs4_free_ino(struct autofs_info *ino)
 {
-	struct autofs_info *p_ino;
-
-	if (ino->dentry) {
-		ino->dentry->d_fsdata = NULL;
-		if (ino->dentry->d_inode) {
-			struct dentry *parent = ino->dentry->d_parent;
-			if (atomic_dec_and_test(&ino->count)) {
-				p_ino = autofs4_dentry_ino(parent);
-				if (p_ino && parent != ino->dentry)
-					atomic_dec(&p_ino->count);
-			}
-			dput(ino->dentry);
-		}
-		ino->dentry = NULL;
-	}
-	if (ino->free)
-		(ino->free)(ino);
 	kfree(ino);
 }
 
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+static void autofs4_evict_inode(struct inode *inode)
+{
+	end_writeback(inode);
+	kfree(inode->i_private);
+}
+
 static const struct super_operations autofs4_sops = {
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
+	.evict_inode	= autofs4_evict_inode,
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 	return (*pipefd < 0);
 }
 
-static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
-{
-	struct autofs_info *ino;
-
-	ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
-	if (!ino)
-		return NULL;
-
-	return ino;
-}
-
-static const struct dentry_operations autofs4_sb_dentry_operations = {
-	.d_release      = autofs4_dentry_release,
-};
-
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs4_sops;
+	s->s_d_op = &autofs4_dentry_operations;
 	s->s_time_gran = 1;
 
 	/*
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
-	ino = autofs4_mkroot(sbi);
+	ino = autofs4_new_ino(sbi);
 	if (!ino)
 		goto fail_free;
-	root_inode = autofs4_get_inode(s, ino);
+	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
 	if (!root_inode)
 		goto fail_ino;
 
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	root->d_op = &autofs4_sb_dentry_operations;
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_dput;
 	}
 
+	if (autofs_type_trigger(sbi->type))
+		__managed_dentry_set_managed(root);
+
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = autofs_type_trigger(sbi->type) ?
-			&autofs4_direct_root_inode_operations :
-			&autofs4_indirect_root_inode_operations;
+	root_inode->i_op = &autofs4_dir_inode_operations;
 
 	/* Couldn't this be tested earlier? */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,28 +326,26 @@ fail_unlock:
 	return -EINVAL;
 }
 
-struct inode *autofs4_get_inode(struct super_block *sb,
-				struct autofs_info *inf)
+struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
 {
 	struct inode *inode = new_inode(sb);
 
 	if (inode == NULL)
 		return NULL;
 
-	inf->inode = inode;
-	inode->i_mode = inf->mode;
+	inode->i_mode = mode;
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
 	}
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
 
-	if (S_ISDIR(inf->mode)) {
+	if (S_ISDIR(mode)) {
 		inode->i_nlink = 2;
 		inode->i_op = &autofs4_dir_inode_operations;
 		inode->i_fop = &autofs4_dir_operations;
-	} else if (S_ISLNK(inf->mode)) {
-		inode->i_size = inf->size;
+	} else if (S_ISLNK(mode)) {
 		inode->i_op = &autofs4_symlink_inode_operations;
 	}
 
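With the mode, size, and free callback gone from struct autofs_info, a symlink's body now lives in inode->i_private, which the new autofs4_evict_inode() above kfree()s when the inode is destroyed. The matching allocation happens at symlink creation time; roughly like this (a sketch of the assumed shape, not the exact autofs4_dir_symlink() code, which lies outside these hunks):

static int toy_symlink_body(struct inode *dir, const char *symname)
{
	size_t size = strlen(symname);
	char *cp = kmalloc(size + 1, GFP_KERNEL);
	struct inode *inode;

	if (!cp)
		return -ENOMEM;
	strcpy(cp, symname);

	inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
	if (!inode) {
		kfree(cp);
		return -ENOMEM;
	}
	inode->i_private = cp;	/* freed by autofs4_evict_inode() */
	inode->i_size = size;
	return 0;
}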
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cb1bd38dc08c..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,22 +19,25 @@
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/compat.h> 21#include <linux/compat.h>
22#include <linux/smp_lock.h> 22#include <linux/mutex.h>
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
27static int autofs4_dir_unlink(struct inode *,struct dentry *); 29static int autofs4_dir_unlink(struct inode *,struct dentry *);
28static int autofs4_dir_rmdir(struct inode *,struct dentry *); 30static int autofs4_dir_rmdir(struct inode *,struct dentry *);
29static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 31static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); 32static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
33#ifdef CONFIG_COMPAT
31static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); 34static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
35#endif
32static int autofs4_dir_open(struct inode *inode, struct file *file); 36static int autofs4_dir_open(struct inode *inode, struct file *file);
33static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
34static void *autofs4_follow_link(struct dentry *, struct nameidata *); 38static struct vfsmount *autofs4_d_automount(struct path *);
35 39static int autofs4_d_manage(struct dentry *, bool, bool);
36#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) 40static void autofs4_dentry_release(struct dentry *);
37#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
38 41
39const struct file_operations autofs4_root_operations = { 42const struct file_operations autofs4_root_operations = {
40 .open = dcache_dir_open, 43 .open = dcache_dir_open,
@@ -56,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
56 .llseek = dcache_dir_lseek, 59 .llseek = dcache_dir_lseek,
57}; 60};
58 61
59const struct inode_operations autofs4_indirect_root_inode_operations = { 62const struct inode_operations autofs4_dir_inode_operations = {
60 .lookup = autofs4_lookup, 63 .lookup = autofs4_lookup,
61 .unlink = autofs4_dir_unlink, 64 .unlink = autofs4_dir_unlink,
62 .symlink = autofs4_dir_symlink, 65 .symlink = autofs4_dir_symlink,
@@ -64,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
64 .rmdir = autofs4_dir_rmdir, 67 .rmdir = autofs4_dir_rmdir,
65}; 68};
66 69
67const struct inode_operations autofs4_direct_root_inode_operations = { 70const struct dentry_operations autofs4_dentry_operations = {
68 .lookup = autofs4_lookup, 71 .d_automount = autofs4_d_automount,
69 .unlink = autofs4_dir_unlink, 72 .d_manage = autofs4_d_manage,
70 .mkdir = autofs4_dir_mkdir, 73 .d_release = autofs4_dentry_release,
71 .rmdir = autofs4_dir_rmdir,
72 .follow_link = autofs4_follow_link,
73};
74
75const struct inode_operations autofs4_dir_inode_operations = {
76 .lookup = autofs4_lookup,
77 .unlink = autofs4_dir_unlink,
78 .symlink = autofs4_dir_symlink,
79 .mkdir = autofs4_dir_mkdir,
80 .rmdir = autofs4_dir_rmdir,
81}; 74};
82 75
83static void autofs4_add_active(struct dentry *dentry) 76static void autofs4_add_active(struct dentry *dentry)
@@ -112,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
112 return; 105 return;
113} 106}
114 107
115static unsigned int autofs4_need_mount(unsigned int flags)
116{
117 unsigned int res = 0;
118 if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
119 res = 1;
120 return res;
121}
122
123static int autofs4_dir_open(struct inode *inode, struct file *file) 108static int autofs4_dir_open(struct inode *inode, struct file *file)
124{ 109{
125 struct dentry *dentry = file->f_path.dentry; 110 struct dentry *dentry = file->f_path.dentry;
@@ -140,275 +125,41 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
140 * autofs file system so just let the libfs routines handle 125 * autofs file system so just let the libfs routines handle
141 * it. 126 * it.
142 */ 127 */
143 spin_lock(&dcache_lock); 128 spin_lock(&autofs4_lock);
129 spin_lock(&dentry->d_lock);
144 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
145 spin_unlock(&dcache_lock); 131 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock);
146 return -ENOENT; 133 return -ENOENT;
147 } 134 }
148 spin_unlock(&dcache_lock); 135 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock);
149 137
150out: 138out:
151 return dcache_dir_open(inode, file); 139 return dcache_dir_open(inode, file);
152} 140}
153 141
154static int try_to_fill_dentry(struct dentry *dentry, int flags) 142static void autofs4_dentry_release(struct dentry *de)
155{
156 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
157 struct autofs_info *ino = autofs4_dentry_ino(dentry);
158 int status;
159
160 DPRINTK("dentry=%p %.*s ino=%p",
161 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
162
163 /*
164 * Wait for a pending mount, triggering one if there
165 * isn't one already
166 */
167 if (dentry->d_inode == NULL) {
168 DPRINTK("waiting for mount name=%.*s",
169 dentry->d_name.len, dentry->d_name.name);
170
171 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
172
173 DPRINTK("mount done status=%d", status);
174
175 /* Turn this into a real negative dentry? */
176 if (status == -ENOENT) {
177 spin_lock(&sbi->fs_lock);
178 ino->flags &= ~AUTOFS_INF_PENDING;
179 spin_unlock(&sbi->fs_lock);
180 return status;
181 } else if (status) {
182 /* Return a negative dentry, but leave it "pending" */
183 return status;
184 }
185 /* Trigger mount for path component or follow link */
186 } else if (ino->flags & AUTOFS_INF_PENDING ||
187 autofs4_need_mount(flags)) {
188 DPRINTK("waiting for mount name=%.*s",
189 dentry->d_name.len, dentry->d_name.name);
190
191 spin_lock(&sbi->fs_lock);
192 ino->flags |= AUTOFS_INF_PENDING;
193 spin_unlock(&sbi->fs_lock);
194 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
195
196 DPRINTK("mount done status=%d", status);
197
198 if (status) {
199 spin_lock(&sbi->fs_lock);
200 ino->flags &= ~AUTOFS_INF_PENDING;
201 spin_unlock(&sbi->fs_lock);
202 return status;
203 }
204 }
205
206 /* Initialize expiry counter after successful mount */
207 ino->last_used = jiffies;
208
209 spin_lock(&sbi->fs_lock);
210 ino->flags &= ~AUTOFS_INF_PENDING;
211 spin_unlock(&sbi->fs_lock);
212
213 return 0;
214}
215
216/* For autofs direct mounts the follow link triggers the mount */
217static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
218{ 143{
219 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 144 struct autofs_info *ino = autofs4_dentry_ino(de);
220 struct autofs_info *ino = autofs4_dentry_ino(dentry); 145 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
221 int oz_mode = autofs4_oz_mode(sbi);
222 unsigned int lookup_type;
223 int status;
224
225 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
226 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
227 nd->flags);
228 /*
229 * For an expire of a covered direct or offset mount we need
230 * to break out of follow_down() at the autofs mount trigger
231 * (d_mounted--), so we can see the expiring flag, and manage
232 * the blocking and following here until the expire is completed.
233 */
234 if (oz_mode) {
235 spin_lock(&sbi->fs_lock);
236 if (ino->flags & AUTOFS_INF_EXPIRING) {
237 spin_unlock(&sbi->fs_lock);
238 /* Follow down to our covering mount. */
239 if (!follow_down(&nd->path))
240 goto done;
241 goto follow;
242 }
243 spin_unlock(&sbi->fs_lock);
244 goto done;
245 }
246
247 /* If an expire request is pending everyone must wait. */
248 autofs4_expire_wait(dentry);
249
250 /* We trigger a mount for almost all flags */
251 lookup_type = autofs4_need_mount(nd->flags);
252 spin_lock(&sbi->fs_lock);
253 spin_lock(&dcache_lock);
254 if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
255 spin_unlock(&dcache_lock);
256 spin_unlock(&sbi->fs_lock);
257 goto follow;
258 }
259
260 /*
261 * If the dentry contains directories then it is an autofs
262 * multi-mount with no root mount offset. So don't try to
263 * mount it again.
264 */
265 if (ino->flags & AUTOFS_INF_PENDING ||
266 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
267 spin_unlock(&dcache_lock);
268 spin_unlock(&sbi->fs_lock);
269
270 status = try_to_fill_dentry(dentry, nd->flags);
271 if (status)
272 goto out_error;
273
274 goto follow;
275 }
276 spin_unlock(&dcache_lock);
277 spin_unlock(&sbi->fs_lock);
278follow:
279 /*
280 * If there is no root mount it must be an autofs
281 * multi-mount with no root offset so we don't need
282 * to follow it.
283 */
284 if (d_mountpoint(dentry)) {
285 if (!autofs4_follow_mount(&nd->path)) {
286 status = -ENOENT;
287 goto out_error;
288 }
289 }
290
291done:
292 return NULL;
293
294out_error:
295 path_put(&nd->path);
296 return ERR_PTR(status);
297}
298
299/*
300 * Revalidate is called on every cache lookup. Some of those
301 * cache lookups may actually happen while the dentry is not
302 * yet completely filled in, and revalidate has to delay such
303 * lookups..
304 */
305static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
306{
307 struct inode *dir = dentry->d_parent->d_inode;
308 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
309 int oz_mode = autofs4_oz_mode(sbi);
310 int flags = nd ? nd->flags : 0;
311 int status = 1;
312
313 /* Pending dentry */
314 spin_lock(&sbi->fs_lock);
315 if (autofs4_ispending(dentry)) {
316 /* The daemon never causes a mount to trigger */
317 spin_unlock(&sbi->fs_lock);
318
319 if (oz_mode)
320 return 1;
321
322 /*
323 * If the directory has gone away due to an expire
324 * we have been called as ->d_revalidate() and so
325 * we need to return false and proceed to ->lookup().
326 */
327 if (autofs4_expire_wait(dentry) == -EAGAIN)
328 return 0;
329
330 /*
331 * A zero status is success otherwise we have a
332 * negative error code.
333 */
334 status = try_to_fill_dentry(dentry, flags);
335 if (status == 0)
336 return 1;
337
338 return status;
339 }
340 spin_unlock(&sbi->fs_lock);
341
342 /* Negative dentry.. invalidate if "old" */
343 if (dentry->d_inode == NULL)
344 return 0;
345
346 /* Check for a non-mountpoint directory with no contents */
347 spin_lock(&dcache_lock);
348 if (S_ISDIR(dentry->d_inode->i_mode) &&
349 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
350 DPRINTK("dentry=%p %.*s, emptydir",
351 dentry, dentry->d_name.len, dentry->d_name.name);
352 spin_unlock(&dcache_lock);
353
354 /* The daemon never causes a mount to trigger */
355 if (oz_mode)
356 return 1;
357
358 /*
359 * A zero status is success otherwise we have a
360 * negative error code.
361 */
362 status = try_to_fill_dentry(dentry, flags);
363 if (status == 0)
364 return 1;
365
366 return status;
367 }
368 spin_unlock(&dcache_lock);
369
370 return 1;
371}
372
373void autofs4_dentry_release(struct dentry *de)
374{
375 struct autofs_info *inf;
376 146
377 DPRINTK("releasing %p", de); 147 DPRINTK("releasing %p", de);
378 148
379 inf = autofs4_dentry_ino(de); 149 if (!ino)
380 de->d_fsdata = NULL; 150 return;
381
382 if (inf) {
383 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
384
385 if (sbi) {
386 spin_lock(&sbi->lookup_lock);
387 if (!list_empty(&inf->active))
388 list_del(&inf->active);
389 if (!list_empty(&inf->expiring))
390 list_del(&inf->expiring);
391 spin_unlock(&sbi->lookup_lock);
392 }
393
394 inf->dentry = NULL;
395 inf->inode = NULL;
396 151
397 autofs4_free_ino(inf); 152 if (sbi) {
153 spin_lock(&sbi->lookup_lock);
154 if (!list_empty(&ino->active))
155 list_del(&ino->active);
156 if (!list_empty(&ino->expiring))
157 list_del(&ino->expiring);
158 spin_unlock(&sbi->lookup_lock);
398 } 159 }
399}
400 160
401/* For dentries of directories in the root dir */ 161 autofs4_free_ino(ino);
402static const struct dentry_operations autofs4_root_dentry_operations = { 162}
403 .d_revalidate = autofs4_revalidate,
404 .d_release = autofs4_dentry_release,
405};
406
407/* For other dentries */
408static const struct dentry_operations autofs4_dentry_operations = {
409 .d_revalidate = autofs4_revalidate,
410 .d_release = autofs4_dentry_release,
411};
412 163
413static struct dentry *autofs4_lookup_active(struct dentry *dentry) 164static struct dentry *autofs4_lookup_active(struct dentry *dentry)
414{ 165{
@@ -420,7 +171,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
420 const unsigned char *str = name->name; 171 const unsigned char *str = name->name;
421 struct list_head *p, *head; 172 struct list_head *p, *head;
422 173
423 spin_lock(&dcache_lock); 174 spin_lock(&autofs4_lock);
424 spin_lock(&sbi->lookup_lock); 175 spin_lock(&sbi->lookup_lock);
425 head = &sbi->active_list; 176 head = &sbi->active_list;
426 list_for_each(p, head) { 177 list_for_each(p, head) {
@@ -434,7 +185,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
434 spin_lock(&active->d_lock); 185 spin_lock(&active->d_lock);
435 186
436 /* Already gone? */ 187 /* Already gone? */
437 if (atomic_read(&active->d_count) == 0) 188 if (active->d_count == 0)
438 goto next; 189 goto next;
439 190
440 qstr = &active->d_name; 191 qstr = &active->d_name;
@@ -450,17 +201,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
450 goto next; 201 goto next;
451 202
452 if (d_unhashed(active)) { 203 if (d_unhashed(active)) {
453 dget(active); 204 dget_dlock(active);
454 spin_unlock(&active->d_lock); 205 spin_unlock(&active->d_lock);
455 spin_unlock(&sbi->lookup_lock); 206 spin_unlock(&sbi->lookup_lock);
456 spin_unlock(&dcache_lock); 207 spin_unlock(&autofs4_lock);
457 return active; 208 return active;
458 } 209 }
459next: 210next:
460 spin_unlock(&active->d_lock); 211 spin_unlock(&active->d_lock);
461 } 212 }
462 spin_unlock(&sbi->lookup_lock); 213 spin_unlock(&sbi->lookup_lock);
463 spin_unlock(&dcache_lock); 214 spin_unlock(&autofs4_lock);
464 215
465 return NULL; 216 return NULL;
466} 217}
@@ -475,7 +226,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
475 const unsigned char *str = name->name; 226 const unsigned char *str = name->name;
476 struct list_head *p, *head; 227 struct list_head *p, *head;
477 228
478 spin_lock(&dcache_lock); 229 spin_lock(&autofs4_lock);
479 spin_lock(&sbi->lookup_lock); 230 spin_lock(&sbi->lookup_lock);
480 head = &sbi->expiring_list; 231 head = &sbi->expiring_list;
481 list_for_each(p, head) { 232 list_for_each(p, head) {
@@ -505,66 +256,261 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
505 goto next; 256 goto next;
506 257
507 if (d_unhashed(expiring)) { 258 if (d_unhashed(expiring)) {
508 dget(expiring); 259 dget_dlock(expiring);
509 spin_unlock(&expiring->d_lock); 260 spin_unlock(&expiring->d_lock);
510 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
511 spin_unlock(&dcache_lock); 262 spin_unlock(&autofs4_lock);
512 return expiring; 263 return expiring;
513 } 264 }
514next: 265next:
515 spin_unlock(&expiring->d_lock); 266 spin_unlock(&expiring->d_lock);
516 } 267 }
517 spin_unlock(&sbi->lookup_lock); 268 spin_unlock(&sbi->lookup_lock);
518 spin_unlock(&dcache_lock); 269 spin_unlock(&autofs4_lock);
519 270
520 return NULL; 271 return NULL;
521} 272}
522 273
274static int autofs4_mount_wait(struct dentry *dentry)
275{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status;
279
280 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 }
288 return 0;
289}
290
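autofs4_mount_wait() above is the classic wait-on-a-pending-flag idiom: any caller that sees AUTOFS_INF_PENDING parks until the daemon answers. A minimal sketch of the same idiom using a completion; struct foo_mount, FOO_PENDING and the helper name are invented for illustration, not autofs code:

	struct foo_mount {
		unsigned long flags;	/* FOO_PENDING set while a request is in flight */
		struct completion done;	/* completed when the daemon replies */
	};
	#define FOO_PENDING 0

	static int foo_mount_wait(struct foo_mount *m)
	{
		/* sleep only if a request is actually outstanding */
		if (test_bit(FOO_PENDING, &m->flags))
			wait_for_completion(&m->done);
		return 0;
	}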
291static int do_expire_wait(struct dentry *dentry)
292{
293 struct dentry *expiring;
294
295 expiring = autofs4_lookup_expiring(dentry);
296 if (!expiring)
297 return autofs4_expire_wait(dentry);
298 else {
299 /*
300 * If we are racing with expire the request might not
301 * be quite complete, but the directory has been removed
 302 * so it must have been successful; just wait for it.
303 */
304 autofs4_expire_wait(expiring);
305 autofs4_del_expiring(expiring);
306 dput(expiring);
307 }
308 return 0;
309}
310
311static struct dentry *autofs4_mountpoint_changed(struct path *path)
312{
313 struct dentry *dentry = path->dentry;
314 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
315
316 /*
317 * If this is an indirect mount the dentry could have gone away
318 * as a result of an expire and a new one created.
319 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent;
322 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new)
324 return NULL;
325 dput(path->dentry);
326 path->dentry = new;
327 }
328 return path->dentry;
329}
330
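autofs4_mountpoint_changed() above is an instance of the dcache re-lookup idiom: if the dentry we started with was unhashed behind our back (removed and re-created by the daemon), ask the dcache for the current child of the same name. The bare step, as a hedged sketch rather than the autofs code itself:

	static void relookup_if_unhashed(struct path *path)
	{
		struct dentry *cur = path->dentry;
		struct dentry *new;

		if (!d_unhashed(cur))
			return;
		new = d_lookup(cur->d_parent, &cur->d_name);
		if (new) {
			dput(cur);		/* drop the stale reference */
			path->dentry = new;	/* d_lookup() returned it with a reference held */
		}
	}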
331static struct vfsmount *autofs4_d_automount(struct path *path)
332{
333 struct dentry *dentry = path->dentry;
334 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
335 struct autofs_info *ino = autofs4_dentry_ino(dentry);
336 int status;
337
338 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name);
340
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi))
355 return NULL;
356
357 /*
358 * If an expire request is pending everyone must wait.
359 * If the expire fails we're still mounted so continue
360 * the follow and return. A return of -EAGAIN (which only
361 * happens with indirect mounts) means the expire completed
362 * and the directory was removed, so just go ahead and try
363 * the mount.
364 */
365 status = do_expire_wait(dentry);
366 if (status && status != -EAGAIN)
367 return NULL;
368
369 /* Callback to the daemon to perform the mount or wait */
370 spin_lock(&sbi->fs_lock);
371 if (ino->flags & AUTOFS_INF_PENDING) {
372 spin_unlock(&sbi->fs_lock);
373 status = autofs4_mount_wait(dentry);
374 if (status)
375 return ERR_PTR(status);
376 spin_lock(&sbi->fs_lock);
377 goto done;
378 }
379
380 /*
381 * If the dentry is a symlink it's equivalent to a directory
382 * having d_mountpoint() true, so there's no need to call back
383 * to the daemon.
384 */
385 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
386 goto done;
387 if (!d_mountpoint(dentry)) {
388 /*
389 * It's possible that user space hasn't removed directories
390 * after umounting a rootless multi-mount, although it
391 * should. For v5 have_submounts() is sufficient to handle
392 * this because the leaves of the directory tree under the
393 * mount never trigger mounts themselves (they have an autofs
394 * trigger mount mounted on them). But v4 pseudo direct mounts
 395 * do need the leaves to trigger mounts. In this case we
 396 * have no choice but to use the list_empty() check and
 397 * require user space to behave.
398 */
399 if (sbi->version > 4) {
400 if (have_submounts(dentry))
401 goto done;
402 } else {
403 spin_lock(&dentry->d_lock);
404 if (!list_empty(&dentry->d_subdirs)) {
405 spin_unlock(&dentry->d_lock);
406 goto done;
407 }
408 spin_unlock(&dentry->d_lock);
409 }
410 ino->flags |= AUTOFS_INF_PENDING;
411 spin_unlock(&sbi->fs_lock);
412 status = autofs4_mount_wait(dentry);
413 if (status)
414 return ERR_PTR(status);
415 spin_lock(&sbi->fs_lock);
416 ino->flags &= ~AUTOFS_INF_PENDING;
417 }
418done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /*
421 * Any needed mounting has been completed and the path updated
422 * so turn this into a normal dentry so we don't continually
423 * call ->d_automount() and ->d_manage().
424 */
425 spin_lock(&dentry->d_lock);
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during
431 * the follow.
432 */
433 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
436 __managed_dentry_clear_automount(dentry);
437 spin_unlock(&dentry->d_lock);
438 }
439 spin_unlock(&sbi->fs_lock);
440
441 /* Mount succeeded, check if we ended up with a new dentry */
442 dentry = autofs4_mountpoint_changed(path);
443 if (!dentry)
444 return ERR_PTR(-ENOENT);
445
446 return NULL;
447}
448
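For reference, the ->d_automount() return contract the function above relies on: NULL when there is nothing left for the VFS to attach (autofs' usual case, since the daemon mounts in place), an ERR_PTR() to fail the walk, or a vfsmount for the VFS to attach itself. A bare-bones sketch; foo_trigger_mount() is a hypothetical helper standing in for the daemon callback:

	static struct vfsmount *foo_d_automount(struct path *path)
	{
		struct vfsmount *mnt;

		mnt = foo_trigger_mount(path);	/* hypothetical: ask userspace to mount */
		if (IS_ERR(mnt))
			return ERR_CAST(mnt);	/* abort the path walk with an error */
		if (!mnt)
			return NULL;		/* mount already attached in place */
		return mnt;			/* hand the mount to the VFS to attach */
	}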
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
450{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452
453 DPRINTK("dentry=%p %.*s",
454 dentry, dentry->d_name.len, dentry->d_name.name);
455
456 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) {
458 if (!d_mountpoint(dentry))
459 return -EISDIR;
460 return 0;
461 }
462
463 /* We need to sleep, so we need pathwalk to be in ref-mode */
464 if (rcu_walk)
465 return -ECHILD;
466
467 /* Wait for pending expires */
468 do_expire_wait(dentry);
469
470 /*
471 * This dentry may be under construction so wait on mount
472 * completion.
473 */
474 return autofs4_mount_wait(dentry);
475}
476
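The -ECHILD return above is the general RCU-walk convention these hooks follow: a per-dentry callback that may need to sleep must punt in rcu_walk mode so the VFS restarts the walk in ref-walk mode with proper references held. Sketched below, with foo_wait_for_daemon() as an invented stand-in for any blocking wait:

	static int foo_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
	{
		if (rcu_walk)
			return -ECHILD;		/* cannot sleep under rcu_read_lock() */

		foo_wait_for_daemon(dentry);	/* blocking is safe in ref-walk mode */
		return 0;
	}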
523/* Lookups in the root directory */ 477/* Lookups in the root directory */
524static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 478static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
525{ 479{
526 struct autofs_sb_info *sbi; 480 struct autofs_sb_info *sbi;
527 struct autofs_info *ino; 481 struct autofs_info *ino;
528 struct dentry *expiring, *active; 482 struct dentry *active;
529 int oz_mode;
530 483
531 DPRINTK("name = %.*s", 484 DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
532 dentry->d_name.len, dentry->d_name.name);
533 485
534 /* File name too long to exist */ 486 /* File name too long to exist */
535 if (dentry->d_name.len > NAME_MAX) 487 if (dentry->d_name.len > NAME_MAX)
536 return ERR_PTR(-ENAMETOOLONG); 488 return ERR_PTR(-ENAMETOOLONG);
537 489
538 sbi = autofs4_sbi(dir->i_sb); 490 sbi = autofs4_sbi(dir->i_sb);
539 oz_mode = autofs4_oz_mode(sbi);
540 491
541 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 492 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
542 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 493 current->pid, task_pgrp_nr(current), sbi->catatonic,
494 autofs4_oz_mode(sbi));
543 495
544 active = autofs4_lookup_active(dentry); 496 active = autofs4_lookup_active(dentry);
545 if (active) { 497 if (active) {
546 dentry = active; 498 return active;
547 ino = autofs4_dentry_ino(dentry);
548 } else { 499 } else {
549 /* 500 /*
550 * Mark the dentry incomplete but don't hash it. We do this 501 * A dentry that is not within the root can never trigger a
551 * to serialize our inode creation operations (symlink and 502 * mount operation, unless the directory already exists, so we
552 * mkdir) which prevents deadlock during the callback to 503 * can return fail immediately. The daemon however does need
553 * the daemon. Subsequent user space lookups for the same 504 * to create directories within the file system.
554 * dentry are placed on the wait queue while the daemon
 555 * itself is allowed passage unrestricted so the create
556 * operation itself can then hash the dentry. Finally,
557 * we check for the hashed dentry and return the newly
558 * hashed dentry.
559 */ 505 */
560 dentry->d_op = &autofs4_root_dentry_operations; 506 if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
507 return ERR_PTR(-ENOENT);
561 508
562 /* 509 /* Mark entries in the root as mount triggers */
563 * And we need to ensure that the same dentry is used for 510 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
564 * all following lookup calls until it is hashed so that 511 __managed_dentry_set_managed(dentry);
565 * the dentry flags are persistent throughout the request. 512
566 */ 513 ino = autofs4_new_ino(sbi);
567 ino = autofs4_init_ino(NULL, sbi, 0555);
568 if (!ino) 514 if (!ino)
569 return ERR_PTR(-ENOMEM); 515 return ERR_PTR(-ENOMEM);
570 516
@@ -575,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
575 521
576 d_instantiate(dentry, NULL); 522 d_instantiate(dentry, NULL);
577 } 523 }
578
579 if (!oz_mode) {
580 mutex_unlock(&dir->i_mutex);
581 expiring = autofs4_lookup_expiring(dentry);
582 if (expiring) {
583 /*
584 * If we are racing with expire the request might not
585 * be quite complete but the directory has been removed
586 * so it must have been successful, so just wait for it.
587 */
588 autofs4_expire_wait(expiring);
589 autofs4_del_expiring(expiring);
590 dput(expiring);
591 }
592
593 spin_lock(&sbi->fs_lock);
594 ino->flags |= AUTOFS_INF_PENDING;
595 spin_unlock(&sbi->fs_lock);
596 if (dentry->d_op && dentry->d_op->d_revalidate)
597 (dentry->d_op->d_revalidate)(dentry, nd);
598 mutex_lock(&dir->i_mutex);
599 }
600
601 /*
602 * If we are still pending, check if we had to handle
603 * a signal. If so we can force a restart..
604 */
605 if (ino->flags & AUTOFS_INF_PENDING) {
606 /* See if we were interrupted */
607 if (signal_pending(current)) {
608 sigset_t *sigset = &current->pending.signal;
609 if (sigismember (sigset, SIGKILL) ||
610 sigismember (sigset, SIGQUIT) ||
611 sigismember (sigset, SIGINT)) {
612 if (active)
613 dput(active);
614 return ERR_PTR(-ERESTARTNOINTR);
615 }
616 }
617 if (!oz_mode) {
618 spin_lock(&sbi->fs_lock);
619 ino->flags &= ~AUTOFS_INF_PENDING;
620 spin_unlock(&sbi->fs_lock);
621 }
622 }
623
624 /*
625 * If this dentry is unhashed, then we shouldn't honour this
626 * lookup. Returning ENOENT here doesn't do the right thing
627 * for all system calls, but it should be OK for the operations
628 * we permit from an autofs.
629 */
630 if (!oz_mode && d_unhashed(dentry)) {
631 /*
632 * A user space application can (and has done in the past)
633 * remove and re-create this directory during the callback.
634 * This can leave us with an unhashed dentry, but a
635 * successful mount! So we need to perform another
636 * cached lookup in case the dentry now exists.
637 */
638 struct dentry *parent = dentry->d_parent;
639 struct dentry *new = d_lookup(parent, &dentry->d_name);
640 if (new != NULL)
641 dentry = new;
642 else
643 dentry = ERR_PTR(-ENOENT);
644
645 if (active)
646 dput(active);
647
648 return dentry;
649 }
650
651 if (active)
652 return active;
653
654 return NULL; 524 return NULL;
655} 525}
656 526
@@ -662,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
662 struct autofs_info *ino = autofs4_dentry_ino(dentry); 532 struct autofs_info *ino = autofs4_dentry_ino(dentry);
663 struct autofs_info *p_ino; 533 struct autofs_info *p_ino;
664 struct inode *inode; 534 struct inode *inode;
535 size_t size = strlen(symname);
665 char *cp; 536 char *cp;
666 537
667 DPRINTK("%s <- %.*s", symname, 538 DPRINTK("%s <- %.*s", symname,
@@ -670,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
670 if (!autofs4_oz_mode(sbi)) 541 if (!autofs4_oz_mode(sbi))
671 return -EACCES; 542 return -EACCES;
672 543
673 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 544 BUG_ON(!ino);
674 if (!ino) 545
675 return -ENOMEM; 546 autofs4_clean_ino(ino);
676 547
677 autofs4_del_active(dentry); 548 autofs4_del_active(dentry);
678 549
679 ino->size = strlen(symname); 550 cp = kmalloc(size + 1, GFP_KERNEL);
680 cp = kmalloc(ino->size + 1, GFP_KERNEL); 551 if (!cp)
681 if (!cp) {
682 if (!dentry->d_fsdata)
683 kfree(ino);
684 return -ENOMEM; 552 return -ENOMEM;
685 }
686 553
687 strcpy(cp, symname); 554 strcpy(cp, symname);
688 555
689 inode = autofs4_get_inode(dir->i_sb, ino); 556 inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
690 if (!inode) { 557 if (!inode) {
691 kfree(cp); 558 kfree(cp);
692 if (!dentry->d_fsdata) 559 if (!dentry->d_fsdata)
693 kfree(ino); 560 kfree(ino);
694 return -ENOMEM; 561 return -ENOMEM;
695 } 562 }
563 inode->i_private = cp;
564 inode->i_size = size;
696 d_add(dentry, inode); 565 d_add(dentry, inode);
697 566
698 if (dir == dir->i_sb->s_root->d_inode) 567 dget(dentry);
699 dentry->d_op = &autofs4_root_dentry_operations;
700 else
701 dentry->d_op = &autofs4_dentry_operations;
702
703 dentry->d_fsdata = ino;
704 ino->dentry = dget(dentry);
705 atomic_inc(&ino->count); 568 atomic_inc(&ino->count);
706 p_ino = autofs4_dentry_ino(dentry->d_parent); 569 p_ino = autofs4_dentry_ino(dentry->d_parent);
707 if (p_ino && dentry->d_parent != dentry) 570 if (p_ino && dentry->d_parent != dentry)
708 atomic_inc(&p_ino->count); 571 atomic_inc(&p_ino->count);
709 ino->inode = inode;
710 572
711 ino->u.symlink = cp;
712 dir->i_mtime = CURRENT_TIME; 573 dir->i_mtime = CURRENT_TIME;
713 574
714 return 0; 575 return 0;
@@ -751,16 +612,68 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
751 612
752 dir->i_mtime = CURRENT_TIME; 613 dir->i_mtime = CURRENT_TIME;
753 614
754 spin_lock(&dcache_lock); 615 spin_lock(&autofs4_lock);
755 autofs4_add_expiring(dentry); 616 autofs4_add_expiring(dentry);
756 spin_lock(&dentry->d_lock); 617 spin_lock(&dentry->d_lock);
757 __d_drop(dentry); 618 __d_drop(dentry);
758 spin_unlock(&dentry->d_lock); 619 spin_unlock(&dentry->d_lock);
759 spin_unlock(&dcache_lock); 620 spin_unlock(&autofs4_lock);
760 621
761 return 0; 622 return 0;
762} 623}
763 624
625/*
626 * Version 4 of autofs provides a pseudo direct mount implementation
627 * that relies on directories at the leaves of a directory tree under
628 * an indirect mount to trigger mounts. To allow for this we need to
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts
 632 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks.
635 */
636static void autofs_set_leaf_automount_flags(struct dentry *dentry)
637{
638 struct dentry *parent;
639
 640 /* root and dentries in the root are already handled */
641 if (IS_ROOT(dentry->d_parent))
642 return;
643
644 managed_dentry_set_managed(dentry);
645
646 parent = dentry->d_parent;
 647 /* only consider parents below dentries in the root */
648 if (IS_ROOT(parent->d_parent))
649 return;
650 managed_dentry_clear_managed(parent);
651 return;
652}
653
654static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
655{
656 struct list_head *d_child;
657 struct dentry *parent;
658
 659 /* flags for dentries in the root are handled elsewhere */
660 if (IS_ROOT(dentry->d_parent))
661 return;
662
663 managed_dentry_clear_managed(dentry);
664
665 parent = dentry->d_parent;
 666 /* only consider parents below dentries in the root */
667 if (IS_ROOT(parent->d_parent))
668 return;
669 d_child = &dentry->d_u.d_child;
670 /* Set parent managed if it's becoming empty */
671 if (d_child->next == &parent->d_subdirs &&
672 d_child->prev == &parent->d_subdirs)
673 managed_dentry_set_managed(parent);
674 return;
675}
676
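The d_child->next/d_child->prev comparison above simply asks "is this dentry the only entry on its parent's d_subdirs list". A self-contained restatement of the test (the kernel's list_is_singular() checks the same condition from the head's side):

	struct list_head { struct list_head *next, *prev; };

	/* True when @entry is the sole element on the circular list headed by @head. */
	static int only_entry_on_list(const struct list_head *entry,
				      const struct list_head *head)
	{
		return entry->next == head && entry->prev == head;
	}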
764static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 677static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
765{ 678{
766 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 679 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -773,16 +686,23 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
773 if (!autofs4_oz_mode(sbi)) 686 if (!autofs4_oz_mode(sbi))
774 return -EACCES; 687 return -EACCES;
775 688
776 spin_lock(&dcache_lock); 689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock);
777 if (!list_empty(&dentry->d_subdirs)) { 692 if (!list_empty(&dentry->d_subdirs)) {
778 spin_unlock(&dcache_lock); 693 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
779 return -ENOTEMPTY; 696 return -ENOTEMPTY;
780 } 697 }
781 autofs4_add_expiring(dentry); 698 __autofs4_add_expiring(dentry);
782 spin_lock(&dentry->d_lock); 699 spin_unlock(&sbi->lookup_lock);
783 __d_drop(dentry); 700 __d_drop(dentry);
784 spin_unlock(&dentry->d_lock); 701 spin_unlock(&dentry->d_lock);
785 spin_unlock(&dcache_lock); 702 spin_unlock(&autofs4_lock);
703
704 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry);
786 706
787 if (atomic_dec_and_test(&ino->count)) { 707 if (atomic_dec_and_test(&ino->count)) {
788 p_ino = autofs4_dentry_ino(dentry->d_parent); 708 p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -812,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
812 DPRINTK("dentry %p, creating %.*s", 732 DPRINTK("dentry %p, creating %.*s",
813 dentry, dentry->d_name.len, dentry->d_name.name); 733 dentry, dentry->d_name.len, dentry->d_name.name);
814 734
815 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 735 BUG_ON(!ino);
816 if (!ino) 736
817 return -ENOMEM; 737 autofs4_clean_ino(ino);
818 738
819 autofs4_del_active(dentry); 739 autofs4_del_active(dentry);
820 740
821 inode = autofs4_get_inode(dir->i_sb, ino); 741 inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
822 if (!inode) { 742 if (!inode)
823 if (!dentry->d_fsdata)
824 kfree(ino);
825 return -ENOMEM; 743 return -ENOMEM;
826 }
827 d_add(dentry, inode); 744 d_add(dentry, inode);
828 745
829 if (dir == dir->i_sb->s_root->d_inode) 746 if (sbi->version < 5)
830 dentry->d_op = &autofs4_root_dentry_operations; 747 autofs_set_leaf_automount_flags(dentry);
831 else
832 dentry->d_op = &autofs4_dentry_operations;
833 748
834 dentry->d_fsdata = ino; 749 dget(dentry);
835 ino->dentry = dget(dentry);
836 atomic_inc(&ino->count); 750 atomic_inc(&ino->count);
837 p_ino = autofs4_dentry_ino(dentry->d_parent); 751 p_ino = autofs4_dentry_ino(dentry->d_parent);
838 if (p_ino && dentry->d_parent != dentry) 752 if (p_ino && dentry->d_parent != dentry)
839 atomic_inc(&p_ino->count); 753 atomic_inc(&p_ino->count);
840 ino->inode = inode;
841 inc_nlink(dir); 754 inc_nlink(dir);
842 dir->i_mtime = CURRENT_TIME; 755 dir->i_mtime = CURRENT_TIME;
843 756
@@ -919,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
919int is_autofs4_dentry(struct dentry *dentry) 832int is_autofs4_dentry(struct dentry *dentry)
920{ 833{
921 return dentry && dentry->d_inode && 834 return dentry && dentry->d_inode &&
922 (dentry->d_op == &autofs4_root_dentry_operations || 835 dentry->d_op == &autofs4_dentry_operations &&
923 dentry->d_op == &autofs4_dentry_operations) &&
924 dentry->d_fsdata != NULL; 836 dentry->d_fsdata != NULL;
925} 837}
926 838
@@ -981,14 +893,8 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
981static long autofs4_root_ioctl(struct file *filp, 893static long autofs4_root_ioctl(struct file *filp,
982 unsigned int cmd, unsigned long arg) 894 unsigned int cmd, unsigned long arg)
983{ 895{
984 long ret;
985 struct inode *inode = filp->f_dentry->d_inode; 896 struct inode *inode = filp->f_dentry->d_inode;
986 897 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
987 lock_kernel();
988 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
989 unlock_kernel();
990
991 return ret;
992} 898}
993 899
994#ifdef CONFIG_COMPAT 900#ifdef CONFIG_COMPAT
@@ -998,13 +904,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
998 struct inode *inode = filp->f_path.dentry->d_inode; 904 struct inode *inode = filp->f_path.dentry->d_inode;
999 int ret; 905 int ret;
1000 906
1001 lock_kernel();
1002 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) 907 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1003 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 908 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1004 else 909 else
1005 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 910 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1006 (unsigned long)compat_ptr(arg)); 911 (unsigned long)compat_ptr(arg));
1007 unlock_kernel();
1008 912
1009 return ret; 913 return ret;
1010} 914}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
14 14
15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) 15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
16{ 16{
17 struct autofs_info *ino = autofs4_dentry_ino(dentry); 17 nd_set_link(nd, dentry->d_inode->i_private);
18 nd_set_link(nd, (char *)ino->u.symlink);
19 return NULL; 18 return NULL;
20} 19}
21 20
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f8..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
186{ 186{
187 struct dentry *root = sbi->sb->s_root; 187 struct dentry *root = sbi->sb->s_root;
188 struct dentry *tmp; 188 struct dentry *tmp;
189 char *buf = *name; 189 char *buf;
190 char *p; 190 char *p;
191 int len = 0; 191 int len;
192 unsigned seq;
192 193
193 spin_lock(&dcache_lock); 194rename_retry:
195 buf = *name;
196 len = 0;
197
198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock();
200 spin_lock(&autofs4_lock);
194 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
195 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
196 203
197 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
198 spin_unlock(&dcache_lock); 205 spin_unlock(&autofs4_lock);
206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry;
199 return 0; 209 return 0;
200 } 210 }
201 211
@@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
208 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
209 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
210 } 220 }
211 spin_unlock(&dcache_lock); 221 spin_unlock(&autofs4_lock);
222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry;
212 225
213 return len; 226 return len;
214} 227}
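The new loop is the standard seqlock read pattern: sample rename_lock, walk the ->d_parent chain under RCU, and redo the whole walk if a concurrent rename bumped the sequence count. Reduced to its skeleton, with build_path() as a made-up stand-in for the two passes above:

	unsigned seq;
	int len;

	do {
		seq = read_seqbegin(&rename_lock);
		rcu_read_lock();
		len = build_path(dentry, root, buf);	/* walks ->d_parent toward root */
		rcu_read_unlock();
	} while (read_seqretry(&rename_lock, seq));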
@@ -296,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
296 * completed while we waited on the mutex ... 309 * completed while we waited on the mutex ...
297 */ 310 */
298 if (notify == NFY_MOUNT) { 311 if (notify == NFY_MOUNT) {
312 struct dentry *new = NULL;
313 int valid = 1;
314
299 /* 315 /*
300 * If the dentry was successfully mounted while we slept 316 * If the dentry was successfully mounted while we slept
301 * on the wait queue mutex we can return success. If it 317 * on the wait queue mutex we can return success. If it
@@ -303,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
 303 * a multi-mount with no mount at its base) we can 319 * a multi-mount with no mount at its base) we can
304 * continue on and create a new request. 320 * continue on and create a new request.
305 */ 321 */
322 if (!IS_ROOT(dentry)) {
323 if (dentry->d_inode && d_unhashed(dentry)) {
324 struct dentry *parent = dentry->d_parent;
325 new = d_lookup(parent, &dentry->d_name);
326 if (new)
327 dentry = new;
328 }
329 }
306 if (have_submounts(dentry)) 330 if (have_submounts(dentry))
307 return 0; 331 valid = 0;
332
333 if (new)
334 dput(new);
335 return valid;
308 } 336 }
309 337
310 return 1; 338 return 1;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aaddef..9ad2369d9e35 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
229 return -EIO; 229 return -EIO;
230} 230}
231 231
232static int bad_inode_permission(struct inode *inode, int mask) 232static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
233{ 233{
234 if (flags & IPERM_FLAG_RCU)
235 return -ECHILD;
236
234 return -EIO; 237 return -EIO;
235} 238}
236 239
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d05..27223878ba9f 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
102} 102}
103 103
104static inline befs_data_stream 104static inline befs_data_stream
105fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) 105fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
106{ 106{
107 befs_data_stream data; 107 befs_data_stream data;
108 int i; 108 int i;
109 109
110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) 110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
111 data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); 111 data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
112 112
113 data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); 113 data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
114 data.indirect = fsrun_to_cpu(sb, n.indirect); 114 data.indirect = fsrun_to_cpu(sb, n->indirect);
115 data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); 115 data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
116 data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); 116 data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
117 data.max_double_indirect_range = fs64_to_cpu(sb, 117 data.max_double_indirect_range = fs64_to_cpu(sb,
118 n. 118 n->
119 max_double_indirect_range); 119 max_double_indirect_range);
120 data.size = fs64_to_cpu(sb, n.size); 120 data.size = fs64_to_cpu(sb, n->size);
121 121
122 return data; 122 return data;
123} 123}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dc39d2824885..b1d0c794747b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
284 return &bi->vfs_inode; 284 return &bi->vfs_inode;
285} 285}
286 286
287static void 287static void befs_i_callback(struct rcu_head *head)
288befs_destroy_inode(struct inode *inode)
289{ 288{
289 struct inode *inode = container_of(head, struct inode, i_rcu);
290 INIT_LIST_HEAD(&inode->i_dentry);
290 kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); 291 kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
291} 292}
292 293
294static void befs_destroy_inode(struct inode *inode)
295{
296 call_rcu(&inode->i_rcu, befs_i_callback);
297}
298
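This conversion (repeated below for bfs and the bdev pseudo-filesystem) defers the inode free to an RCU grace period so that lock-free RCU-walk lookups still holding a pointer never touch freed memory. The pattern for a hypothetical "foo" filesystem; foo_inode_cachep and FOO_I() are invented names:

	static void foo_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(foo_inode_cachep, FOO_I(inode));
	}

	static void foo_destroy_inode(struct inode *inode)
	{
		call_rcu(&inode->i_rcu, foo_i_callback);
	}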
293static void init_once(void *foo) 299static void init_once(void *foo)
294{ 300{
295 struct befs_inode_info *bi = (struct befs_inode_info *) foo; 301 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -384,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
384 int num_blks; 390 int num_blks;
385 391
386 befs_ino->i_data.ds = 392 befs_ino->i_data.ds =
387 fsds_to_cpu(sb, raw_inode->data.datastream); 393 fsds_to_cpu(sb, &raw_inode->data.datastream);
388 394
389 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); 395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
390 inode->i_blocks = 396 inode->i_blocks =
@@ -913,18 +919,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
913 return 0; 919 return 0;
914} 920}
915 921
916static int 922static struct dentry *
917befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, 923befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
918 void *data, struct vfsmount *mnt) 924 void *data)
919{ 925{
920 return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, 926 return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
921 mnt);
922} 927}
923 928
924static struct file_system_type befs_fs_type = { 929static struct file_system_type befs_fs_type = {
925 .owner = THIS_MODULE, 930 .owner = THIS_MODULE,
926 .name = "befs", 931 .name = "befs",
927 .get_sb = befs_get_sb, 932 .mount = befs_mount,
928 .kill_sb = kill_block_super, 933 .kill_sb = kill_block_super,
929 .fs_flags = FS_REQUIRES_DEV, 934 .fs_flags = FS_REQUIRES_DEV,
930}; 935};
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
176 inc_nlink(inode); 176 inc_nlink(inode);
177 inode->i_ctime = CURRENT_TIME_SEC; 177 inode->i_ctime = CURRENT_TIME_SEC;
178 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
179 atomic_inc(&inode->i_count); 179 ihold(inode);
180 d_instantiate(new, inode); 180 d_instantiate(new, inode);
181 mutex_unlock(&info->bfs_lock); 181 mutex_unlock(&info->bfs_lock);
182 return 0; 182 return 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c4daf0f5fc02..a8e37f81d097 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/vfs.h> 16#include <linux/vfs.h>
18#include <linux/writeback.h> 17#include <linux/writeback.h>
@@ -215,14 +214,10 @@ static void bfs_put_super(struct super_block *s)
215 if (!info) 214 if (!info)
216 return; 215 return;
217 216
218 lock_kernel();
219
220 mutex_destroy(&info->bfs_lock); 217 mutex_destroy(&info->bfs_lock);
221 kfree(info->si_imap); 218 kfree(info->si_imap);
222 kfree(info); 219 kfree(info);
223 s->s_fs_info = NULL; 220 s->s_fs_info = NULL;
224
225 unlock_kernel();
226} 221}
227 222
228static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) 223static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -253,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
253 return &bi->vfs_inode; 248 return &bi->vfs_inode;
254} 249}
255 250
256static void bfs_destroy_inode(struct inode *inode) 251static void bfs_i_callback(struct rcu_head *head)
257{ 252{
253 struct inode *inode = container_of(head, struct inode, i_rcu);
254 INIT_LIST_HEAD(&inode->i_dentry);
258 kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); 255 kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
259} 256}
260 257
258static void bfs_destroy_inode(struct inode *inode)
259{
260 call_rcu(&inode->i_rcu, bfs_i_callback);
261}
262
261static void init_once(void *foo) 263static void init_once(void *foo)
262{ 264{
263 struct bfs_inode_info *bi = foo; 265 struct bfs_inode_info *bi = foo;
@@ -455,16 +457,16 @@ out:
455 return ret; 457 return ret;
456} 458}
457 459
458static int bfs_get_sb(struct file_system_type *fs_type, 460static struct dentry *bfs_mount(struct file_system_type *fs_type,
459 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 461 int flags, const char *dev_name, void *data)
460{ 462{
461 return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); 463 return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
462} 464}
463 465
464static struct file_system_type bfs_fs_type = { 466static struct file_system_type bfs_fs_type = {
465 .owner = THIS_MODULE, 467 .owner = THIS_MODULE,
466 .name = "bfs", 468 .name = "bfs",
467 .get_sb = bfs_get_sb, 469 .mount = bfs_mount,
468 .kill_sb = kill_block_super, 470 .kill_sb = kill_block_super,
469 .fs_flags = FS_REQUIRES_DEV, 471 .fs_flags = FS_REQUIRES_DEV,
470}; 472};
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a6..d5b640ba6cb1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) 66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
67 67
68static struct linux_binfmt elf_format = { 68static struct linux_binfmt elf_format = {
69 .module = THIS_MODULE, 69 .module = THIS_MODULE,
70 .load_binary = load_elf_binary, 70 .load_binary = load_elf_binary,
71 .load_shlib = load_elf_library, 71 .load_shlib = load_elf_library,
72 .core_dump = elf_core_dump, 72 .core_dump = elf_core_dump,
73 .min_coredump = ELF_EXEC_PAGESIZE, 73 .min_coredump = ELF_EXEC_PAGESIZE,
74 .hasvdso = 1
75}; 74};
76 75
77#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 76#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
316 return 0; 315 return 0;
317} 316}
318 317
319#ifndef elf_map
320
321static unsigned long elf_map(struct file *filep, unsigned long addr, 318static unsigned long elf_map(struct file *filep, unsigned long addr,
322 struct elf_phdr *eppnt, int prot, int type, 319 struct elf_phdr *eppnt, int prot, int type,
323 unsigned long total_size) 320 unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
354 return(map_addr); 351 return(map_addr);
355} 352}
356 353
357#endif /* !elf_map */
358
359static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) 354static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
360{ 355{
361 int i, first_idx = -1, last_idx = -1; 356 int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
421 goto out; 416 goto out;
422 417
423 retval = kernel_read(interpreter, interp_elf_ex->e_phoff, 418 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
424 (char *)elf_phdata,size); 419 (char *)elf_phdata, size);
425 error = -EIO; 420 error = -EIO;
426 if (retval != size) { 421 if (retval != size) {
427 if (retval < 0) 422 if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
601 goto out; 596 goto out;
602 if (!elf_check_arch(&loc->elf_ex)) 597 if (!elf_check_arch(&loc->elf_ex))
603 goto out; 598 goto out;
604 if (!bprm->file->f_op||!bprm->file->f_op->mmap) 599 if (!bprm->file->f_op || !bprm->file->f_op->mmap)
605 goto out; 600 goto out;
606 601
607 /* Now read in all of the header information */ 602 /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
761 /* There was a PT_LOAD segment with p_memsz > p_filesz 756 /* There was a PT_LOAD segment with p_memsz > p_filesz
762 before this one. Map anonymous pages, if needed, 757 before this one. Map anonymous pages, if needed,
763 and clear the area. */ 758 and clear the area. */
764 retval = set_brk (elf_bss + load_bias, 759 retval = set_brk(elf_bss + load_bias,
765 elf_brk + load_bias); 760 elf_brk + load_bias);
766 if (retval) { 761 if (retval) {
767 send_sig(SIGKILL, current, 0); 762 send_sig(SIGKILL, current, 0);
768 goto out_free_dentry; 763 goto out_free_dentry;
@@ -800,7 +795,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
800 * default mmap base, as well as whatever program they 795 * default mmap base, as well as whatever program they
801 * might try to exec. This is because the brk will 796 * might try to exec. This is because the brk will
802 * follow the loader, and is not movable. */ 797 * follow the loader, and is not movable. */
803#ifdef CONFIG_X86 798#if defined(CONFIG_X86) || defined(CONFIG_ARM)
804 load_bias = 0; 799 load_bias = 0;
805#else 800#else
806 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 801 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd0cc0bf9a40..1befe2ec8186 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
495 struct inode * inode = new_inode(sb); 495 struct inode * inode = new_inode(sb);
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_ino = get_next_ino();
498 inode->i_mode = mode; 499 inode->i_mode = mode;
499 inode->i_atime = inode->i_mtime = inode->i_ctime = 500 inode->i_atime = inode->i_mtime = inode->i_ctime =
500 current_fs_time(inode->i_sb); 501 current_fs_time(inode->i_sb);
@@ -576,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
576static const struct file_operations bm_entry_operations = { 577static const struct file_operations bm_entry_operations = {
577 .read = bm_entry_read, 578 .read = bm_entry_read,
578 .write = bm_entry_write, 579 .write = bm_entry_write,
580 .llseek = default_llseek,
579}; 581};
580 582
581/* /register */ 583/* /register */
@@ -643,6 +645,7 @@ out:
643 645
644static const struct file_operations bm_register_operations = { 646static const struct file_operations bm_register_operations = {
645 .write = bm_register_write, 647 .write = bm_register_write,
648 .llseek = noop_llseek,
646}; 649};
647 650
648/* /status */ 651/* /status */
@@ -680,6 +683,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
680static const struct file_operations bm_status_operations = { 683static const struct file_operations bm_status_operations = {
681 .read = bm_status_read, 684 .read = bm_status_read,
682 .write = bm_status_write, 685 .write = bm_status_write,
686 .llseek = default_llseek,
683}; 687};
684 688
685/* Superblock handling */ 689/* Superblock handling */
@@ -702,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
702 return err; 706 return err;
703} 707}
704 708
705static int bm_get_sb(struct file_system_type *fs_type, 709static struct dentry *bm_mount(struct file_system_type *fs_type,
706 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 710 int flags, const char *dev_name, void *data)
707{ 711{
708 return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); 712 return mount_single(fs_type, flags, data, bm_fill_super);
709} 713}
710 714
711static struct linux_binfmt misc_format = { 715static struct linux_binfmt misc_format = {
@@ -716,7 +720,7 @@ static struct linux_binfmt misc_format = {
716static struct file_system_type bm_fs_type = { 720static struct file_system_type bm_fs_type = {
717 .owner = THIS_MODULE, 721 .owner = THIS_MODULE,
718 .name = "binfmt_misc", 722 .name = "binfmt_misc",
719 .get_sb = bm_get_sb, 723 .mount = bm_mount,
720 .kill_sb = kill_litter_super, 724 .kill_sb = kill_litter_super,
721}; 725};
722 726
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b8..e49cce234c65 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
782{ 782{
783 unsigned int i; 783 unsigned int i;
784 784
785 kintegrityd_wq = create_workqueue("kintegrityd"); 785 /*
786 * kintegrityd won't block much but may burn a lot of CPU cycles.
787 * Make it highpri CPU intensive wq with max concurrency of 1.
788 */
789 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
790 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
786 if (!kintegrityd_wq) 791 if (!kintegrityd_wq)
787 panic("Failed to create kintegrityd\n"); 792 panic("Failed to create kintegrityd\n");
788 793
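The comment motivates the flag choice: WQ_HIGHPRI for latency, WQ_CPU_INTENSIVE so the CPU-bound work doesn't stall other work items, WQ_MEM_RECLAIM because integrity verification sits in the I/O path, and a max_active of 1 to cap concurrency. A hedged sketch of the same call shape for a hypothetical driver:

	static struct workqueue_struct *foo_wq;

	static int __init foo_init(void)
	{
		foo_wq = alloc_workqueue("foo", WQ_MEM_RECLAIM | WQ_HIGHPRI |
					 WQ_CPU_INTENSIVE, 1);
		if (!foo_wq)
			return -ENOMEM;
		return 0;
	}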
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
370{ 370{
371 struct bio *bio; 371 struct bio *bio;
372 372
373 if (nr_iovecs > UIO_MAXIOV)
374 return NULL;
375
373 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), 376 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
374 gfp_mask); 377 gfp_mask);
375 if (unlikely(!bio)) 378 if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
697static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 700static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
698 gfp_t gfp_mask) 701 gfp_t gfp_mask)
699{ 702{
700 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); 703 struct bio_map_data *bmd;
701 704
705 if (iov_count > UIO_MAXIOV)
706 return NULL;
707
708 bmd = kmalloc(sizeof(*bmd), gfp_mask);
702 if (!bmd) 709 if (!bmd)
703 return NULL; 710 return NULL;
704 711
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
827 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 834 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
828 start = uaddr >> PAGE_SHIFT; 835 start = uaddr >> PAGE_SHIFT;
829 836
837 /*
838 * Overflow, abort
839 */
840 if (end < start)
841 return ERR_PTR(-EINVAL);
842
830 nr_pages += end - start; 843 nr_pages += end - start;
831 len += iov[i].iov_len; 844 len += iov[i].iov_len;
832 } 845 }
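The added check guards the page-count arithmetic: for a hostile iovec, uaddr + len can wrap, making the rounded-up end page smaller than the start page and nr_pages bogus. A self-contained userspace illustration of the same arithmetic (a PAGE_SHIFT of 12 is assumed for the example):

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	static int count_pages(unsigned long uaddr, unsigned long len,
			       unsigned long *nr_pages)
	{
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		if (end < start)		/* overflow, abort */
			return -1;
		*nr_pages += end - start;
		return 0;
	}

	int main(void)
	{
		unsigned long n = 0;
		int ok = count_pages(0x1000, 8192, &n);

		printf("ok=%d pages=%lu\n", ok, n);	/* prints ok=0 pages=2 */
		printf("overflow=%d\n", count_pages(~0UL - 100, 4096, &n));
		return 0;
	}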
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
955 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 968 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
956 unsigned long start = uaddr >> PAGE_SHIFT; 969 unsigned long start = uaddr >> PAGE_SHIFT;
957 970
971 /*
972 * Overflow, abort
973 */
974 if (end < start)
975 return ERR_PTR(-EINVAL);
976
958 nr_pages += end - start; 977 nr_pages += end - start;
959 /* 978 /*
960 * buffer must be aligned to at least hardsector size for now 979 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
982 unsigned long start = uaddr >> PAGE_SHIFT; 1001 unsigned long start = uaddr >> PAGE_SHIFT;
983 const int local_nr_pages = end - start; 1002 const int local_nr_pages = end - start;
984 const int page_limit = cur_page + local_nr_pages; 1003 const int page_limit = cur_page + local_nr_pages;
985 1004
986 ret = get_user_pages_fast(uaddr, local_nr_pages, 1005 ret = get_user_pages_fast(uaddr, local_nr_pages,
987 write_to_vm, &pages[cur_page]); 1006 write_to_vm, &pages[cur_page]);
988 if (ret < local_nr_pages) { 1007 if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/smp_lock.h>
15#include <linux/device_cgroup.h> 14#include <linux/device_cgroup.h>
16#include <linux/highmem.h> 15#include <linux/highmem.h>
17#include <linux/blkdev.h> 16#include <linux/blkdev.h>
@@ -48,6 +47,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
48 47
49EXPORT_SYMBOL(I_BDEV); 48EXPORT_SYMBOL(I_BDEV);
50 49
50/*
 51 * move the inode from its current bdi to a new bdi. If the inode is dirty
52 * we need to move it onto the dirty list of @dst so that the inode is always
53 * on the right list.
54 */
55static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst)
57{
58 spin_lock(&inode_lock);
59 inode->i_data.backing_dev_info = dst;
60 if (inode->i_state & I_DIRTY)
61 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 spin_unlock(&inode_lock);
63}
64
51static sector_t max_block(struct block_device *bdev) 65static sector_t max_block(struct block_device *bdev)
52{ 66{
53 sector_t retval = ~((sector_t)0); 67 sector_t retval = ~((sector_t)0);
@@ -370,7 +384,7 @@ int blkdev_fsync(struct file *filp, int datasync)
370 */ 384 */
371 mutex_unlock(&bd_inode->i_mutex); 385 mutex_unlock(&bd_inode->i_mutex);
372 386
373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); 387 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
374 if (error == -EOPNOTSUPP) 388 if (error == -EOPNOTSUPP)
375 error = 0; 389 error = 0;
376 390
@@ -395,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
395 return &ei->vfs_inode; 409 return &ei->vfs_inode;
396} 410}
397 411
398static void bdev_destroy_inode(struct inode *inode) 412static void bdev_i_callback(struct rcu_head *head)
399{ 413{
414 struct inode *inode = container_of(head, struct inode, i_rcu);
400 struct bdev_inode *bdi = BDEV_I(inode); 415 struct bdev_inode *bdi = BDEV_I(inode);
401 416
417 INIT_LIST_HEAD(&inode->i_dentry);
402 kmem_cache_free(bdev_cachep, bdi); 418 kmem_cache_free(bdev_cachep, bdi);
403} 419}
404 420
421static void bdev_destroy_inode(struct inode *inode)
422{
423 call_rcu(&inode->i_rcu, bdev_i_callback);
424}
425
405static void init_once(void *foo) 426static void init_once(void *foo)
406{ 427{
407 struct bdev_inode *ei = (struct bdev_inode *) foo; 428 struct bdev_inode *ei = (struct bdev_inode *) foo;
@@ -412,7 +433,7 @@ static void init_once(void *foo)
412 INIT_LIST_HEAD(&bdev->bd_inodes); 433 INIT_LIST_HEAD(&bdev->bd_inodes);
413 INIT_LIST_HEAD(&bdev->bd_list); 434 INIT_LIST_HEAD(&bdev->bd_list);
414#ifdef CONFIG_SYSFS 435#ifdef CONFIG_SYSFS
415 INIT_LIST_HEAD(&bdev->bd_holder_list); 436 INIT_LIST_HEAD(&bdev->bd_holder_disks);
416#endif 437#endif
417 inode_init_once(&ei->vfs_inode); 438 inode_init_once(&ei->vfs_inode);
418 /* Initialize mutex for freeze. */ 439 /* Initialize mutex for freeze. */
@@ -449,15 +470,15 @@ static const struct super_operations bdev_sops = {
449 .evict_inode = bdev_evict_inode, 470 .evict_inode = bdev_evict_inode,
450}; 471};
451 472
452static int bd_get_sb(struct file_system_type *fs_type, 473static struct dentry *bd_mount(struct file_system_type *fs_type,
453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 474 int flags, const char *dev_name, void *data)
454{ 475{
455 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); 476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
456} 477}
457 478
458static struct file_system_type bd_type = { 479static struct file_system_type bd_type = {
459 .name = "bdev", 480 .name = "bdev",
460 .get_sb = bd_get_sb, 481 .mount = bd_mount,
461 .kill_sb = kill_anon_super, 482 .kill_sb = kill_anon_super,
462}; 483};
463 484
@@ -550,7 +571,7 @@ EXPORT_SYMBOL(bdget);
550 */ 571 */
551struct block_device *bdgrab(struct block_device *bdev) 572struct block_device *bdgrab(struct block_device *bdev)
552{ 573{
553 atomic_inc(&bdev->bd_inode->i_count); 574 ihold(bdev->bd_inode);
554 return bdev; 575 return bdev;
555} 576}
556 577
@@ -580,7 +601,7 @@ static struct block_device *bd_acquire(struct inode *inode)
580 spin_lock(&bdev_lock); 601 spin_lock(&bdev_lock);
581 bdev = inode->i_bdev; 602 bdev = inode->i_bdev;
582 if (bdev) { 603 if (bdev) {
583 atomic_inc(&bdev->bd_inode->i_count); 604 ihold(bdev->bd_inode);
584 spin_unlock(&bdev_lock); 605 spin_unlock(&bdev_lock);
585 return bdev; 606 return bdev;
586 } 607 }
@@ -591,12 +612,12 @@ static struct block_device *bd_acquire(struct inode *inode)
591 spin_lock(&bdev_lock); 612 spin_lock(&bdev_lock);
592 if (!inode->i_bdev) { 613 if (!inode->i_bdev) {
593 /* 614 /*
594 * We take an additional bd_inode->i_count for inode, 615 * We take an additional reference to bd_inode,
595 * and it's released in clear_inode() of inode. 616 * and it's released in clear_inode() of inode.
596 * So, we can access it via ->i_mapping always 617 * So, we can access it via ->i_mapping always
597 * without igrab(). 618 * without igrab().
598 */ 619 */
599 atomic_inc(&bdev->bd_inode->i_count); 620 ihold(bdev->bd_inode);
600 inode->i_bdev = bdev; 621 inode->i_bdev = bdev;
601 inode->i_mapping = bdev->bd_inode->i_mapping; 622 inode->i_mapping = bdev->bd_inode->i_mapping;
602 list_add(&inode->i_devices, &bdev->bd_inodes); 623 list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -648,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
648 else if (bdev->bd_contains == bdev) 669 else if (bdev->bd_contains == bdev)
649 return true; /* is a whole device which isn't held */ 670 return true; /* is a whole device which isn't held */
650 671
651 else if (whole->bd_holder == bd_claim) 672 else if (whole->bd_holder == bd_may_claim)
652 return true; /* is a partition of a device that is being partitioned */ 673 return true; /* is a partition of a device that is being partitioned */
653 else if (whole->bd_holder != NULL) 674 else if (whole->bd_holder != NULL)
654 return false; /* is a partition of a held device */ 675 return false; /* is a partition of a held device */
@@ -760,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
760 } 781 }
761} 782}
762 783
763/* releases bdev_lock */ 784#ifdef CONFIG_SYSFS
764static void __bd_abort_claiming(struct block_device *whole, void *holder) 785struct bd_holder_disk {
765{ 786 struct list_head list;
766 BUG_ON(whole->bd_claiming != holder); 787 struct gendisk *disk;
767 whole->bd_claiming = NULL; 788 int refcnt;
768 wake_up_bit(&whole->bd_claiming, 0); 789};
769
770 spin_unlock(&bdev_lock);
771 bdput(whole);
772}
773
774/**
775 * bd_abort_claiming - abort claiming a block device
776 * @whole: whole block device returned by bd_start_claiming()
777 * @holder: holder trying to claim @bdev
778 *
779 * Abort a claiming block started by bd_start_claiming(). Note that
780 * @whole is not the block device to be claimed but the whole device
781 * returned by bd_start_claiming().
782 *
783 * CONTEXT:
784 * Grabs and releases bdev_lock.
785 */
786static void bd_abort_claiming(struct block_device *whole, void *holder)
787{
788 spin_lock(&bdev_lock);
789 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
790}
791
792/* increment holders when we have a legitimate claim. requires bdev_lock */
793static void __bd_claim(struct block_device *bdev, struct block_device *whole,
794 void *holder)
795{
796 /* note that for a whole device bd_holders
797 * will be incremented twice, and bd_holder will
798 * be set to bd_claim before being set to holder
799 */
800 whole->bd_holders++;
801 whole->bd_holder = bd_claim;
802 bdev->bd_holders++;
803 bdev->bd_holder = holder;
804}
805
806/**
807 * bd_finish_claiming - finish claiming a block device
808 * @bdev: block device of interest (passed to bd_start_claiming())
809 * @whole: whole block device returned by bd_start_claiming()
810 * @holder: holder trying to claim @bdev
811 *
812 * Finish a claiming block started by bd_start_claiming().
813 *
814 * CONTEXT:
815 * Grabs and releases bdev_lock.
816 */
817static void bd_finish_claiming(struct block_device *bdev,
818 struct block_device *whole, void *holder)
819{
820 spin_lock(&bdev_lock);
821 BUG_ON(!bd_may_claim(bdev, whole, holder));
822 __bd_claim(bdev, whole, holder);
823 __bd_abort_claiming(whole, holder); /* not actually an abort */
824}
825 790
826/** 791static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
827 * bd_claim - claim a block device 792 struct gendisk *disk)
828 * @bdev: block device to claim
829 * @holder: holder trying to claim @bdev
830 *
831 * Try to claim @bdev which must have been opened successfully.
832 *
833 * CONTEXT:
834 * Might sleep.
835 *
836 * RETURNS:
837 * 0 if successful, -EBUSY if @bdev is already claimed.
838 */
839int bd_claim(struct block_device *bdev, void *holder)
840{ 793{
841 struct block_device *whole = bdev->bd_contains; 794 struct bd_holder_disk *holder;
842 int res;
843
844 might_sleep();
845 795
846 spin_lock(&bdev_lock); 796 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
847 res = bd_prepare_to_claim(bdev, whole, holder); 797 if (holder->disk == disk)
848 if (res == 0) 798 return holder;
849 __bd_claim(bdev, whole, holder); 799 return NULL;
850 spin_unlock(&bdev_lock);
851
852 return res;
853}
854EXPORT_SYMBOL(bd_claim);
855
856void bd_release(struct block_device *bdev)
857{
858 spin_lock(&bdev_lock);
859 if (!--bdev->bd_contains->bd_holders)
860 bdev->bd_contains->bd_holder = NULL;
861 if (!--bdev->bd_holders)
862 bdev->bd_holder = NULL;
863 spin_unlock(&bdev_lock);
864} 800}
865 801
866EXPORT_SYMBOL(bd_release);
867
868#ifdef CONFIG_SYSFS
869/*
870 * Functions for bd_claim_by_kobject / bd_release_from_kobject
871 *
872 * If a kobject is passed to bd_claim_by_kobject()
873 * and the kobject has a parent directory,
874 * following symlinks are created:
875 * o from the kobject to the claimed bdev
876 * o from "holders" directory of the bdev to the parent of the kobject
877 * bd_release_from_kobject() removes these symlinks.
878 *
879 * Example:
880 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
881 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
882 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
883 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
884 */
885
886static int add_symlink(struct kobject *from, struct kobject *to) 802static int add_symlink(struct kobject *from, struct kobject *to)
887{ 803{
888 if (!from || !to)
889 return 0;
890 return sysfs_create_link(from, to, kobject_name(to)); 804 return sysfs_create_link(from, to, kobject_name(to));
891} 805}
892 806
893static void del_symlink(struct kobject *from, struct kobject *to) 807static void del_symlink(struct kobject *from, struct kobject *to)
894{ 808{
895 if (!from || !to)
896 return;
897 sysfs_remove_link(from, kobject_name(to)); 809 sysfs_remove_link(from, kobject_name(to));
898} 810}
899 811
900/*
901 * 'struct bd_holder' contains pointers to kobjects symlinked by
902 * bd_claim_by_kobject.
903 * It's connected to bd_holder_list which is protected by bdev->bd_sem.
904 */
905struct bd_holder {
906 struct list_head list; /* chain of holders of the bdev */
907 int count; /* references from the holder */
908 struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */
909 struct kobject *hdev; /* e.g. "/block/dm-0" */
910 struct kobject *hdir; /* e.g. "/block/sda/holders" */
911 struct kobject *sdev; /* e.g. "/block/sda" */
912};
913
914/*
915 * Get references of related kobjects at once.
916 * Returns 1 on success. 0 on failure.
917 *
918 * Should call bd_holder_release_dirs() after successful use.
919 */
920static int bd_holder_grab_dirs(struct block_device *bdev,
921 struct bd_holder *bo)
922{
923 if (!bdev || !bo)
924 return 0;
925
926 bo->sdir = kobject_get(bo->sdir);
927 if (!bo->sdir)
928 return 0;
929
930 bo->hdev = kobject_get(bo->sdir->parent);
931 if (!bo->hdev)
932 goto fail_put_sdir;
933
934 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
935 if (!bo->sdev)
936 goto fail_put_hdev;
937
938 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
939 if (!bo->hdir)
940 goto fail_put_sdev;
941
942 return 1;
943
944fail_put_sdev:
945 kobject_put(bo->sdev);
946fail_put_hdev:
947 kobject_put(bo->hdev);
948fail_put_sdir:
949 kobject_put(bo->sdir);
950
951 return 0;
952}
953
954/* Put references of related kobjects at once. */
955static void bd_holder_release_dirs(struct bd_holder *bo)
956{
957 kobject_put(bo->hdir);
958 kobject_put(bo->sdev);
959 kobject_put(bo->hdev);
960 kobject_put(bo->sdir);
961}
962
963static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
964{
965 struct bd_holder *bo;
966
967 bo = kzalloc(sizeof(*bo), GFP_KERNEL);
968 if (!bo)
969 return NULL;
970
971 bo->count = 1;
972 bo->sdir = kobj;
973
974 return bo;
975}
976
977static void free_bd_holder(struct bd_holder *bo)
978{
979 kfree(bo);
980}
981
982/** 812/**
983 * find_bd_holder - find matching struct bd_holder from the block device 813 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
814 * @bdev: the claimed slave bdev
815 * @disk: the holding disk
984 * 816 *
985 * @bdev: struct block device to be searched 817 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
986 * @bo: target struct bd_holder
987 * 818 *
 988 * Returns matching entry with @bo in @bdev->bd_holder_list. 819 * This function creates the following sysfs symlinks.
989 * If found, increment the reference count and return the pointer. 820 *
990 * If not found, returns NULL. 821 * - from "slaves" directory of the holder @disk to the claimed @bdev
991 */ 822 * - from "holders" directory of the @bdev to the holder @disk
992static struct bd_holder *find_bd_holder(struct block_device *bdev,
993 struct bd_holder *bo)
994{
995 struct bd_holder *tmp;
996
997 list_for_each_entry(tmp, &bdev->bd_holder_list, list)
998 if (tmp->sdir == bo->sdir) {
999 tmp->count++;
1000 return tmp;
1001 }
1002
1003 return NULL;
1004}
1005
1006/**
1007 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
1008 * 823 *
1009 * @bdev: block device to be bd_claimed 824 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
1010 * @bo: preallocated and initialized by alloc_bd_holder() 825 * passed to bd_link_disk_holder(), then:
1011 * 826 *
1012 * Add @bo to @bdev->bd_holder_list, create symlinks. 827 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
828 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
1013 * 829 *
1014 * Returns 0 if symlinks are created. 830 * The caller must have claimed @bdev before calling this function and
1015 * Returns -ve if something fails. 831 * ensure that both @bdev and @disk are valid during the creation and
832 * lifetime of these symlinks.
833 *
834 * CONTEXT:
835 * Might sleep.
836 *
837 * RETURNS:
838 * 0 on success, -errno on failure.
1016 */ 839 */
1017static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 840int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
1018{ 841{
1019 int err; 842 struct bd_holder_disk *holder;
843 int ret = 0;
1020 844
1021 if (!bo) 845 mutex_lock(&bdev->bd_mutex);
1022 return -EINVAL;
1023 846
1024 if (!bd_holder_grab_dirs(bdev, bo)) 847 WARN_ON_ONCE(!bdev->bd_holder);
1025 return -EBUSY;
1026 848
1027 err = add_symlink(bo->sdir, bo->sdev); 849 /* FIXME: remove the following once add_disk() handles errors */
1028 if (err) 850 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
1029 return err; 851 goto out_unlock;
1030 852
1031 err = add_symlink(bo->hdir, bo->hdev); 853 holder = bd_find_holder_disk(bdev, disk);
1032 if (err) { 854 if (holder) {
1033 del_symlink(bo->sdir, bo->sdev); 855 holder->refcnt++;
1034 return err; 856 goto out_unlock;
1035 } 857 }
1036 858
1037 list_add_tail(&bo->list, &bdev->bd_holder_list); 859 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
1038 return 0; 860 if (!holder) {
1039} 861 ret = -ENOMEM;
1040 862 goto out_unlock;
1041/**
1042 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
1043 *
1044 * @bdev: block device to be bd_claimed
1045 * @kobj: holder's kobject
1046 *
1047 * If there is matching entry with @kobj in @bdev->bd_holder_list
1048 * and no other bd_claim() from the same kobject,
1049 * remove the struct bd_holder from the list, delete symlinks for it.
1050 *
1051 * Returns a pointer to the struct bd_holder when it's removed from the list
1052 * and ready to be freed.
1053 * Returns NULL if matching claim isn't found or there is other bd_claim()
1054 * by the same kobject.
1055 */
1056static struct bd_holder *del_bd_holder(struct block_device *bdev,
1057 struct kobject *kobj)
1058{
1059 struct bd_holder *bo;
1060
1061 list_for_each_entry(bo, &bdev->bd_holder_list, list) {
1062 if (bo->sdir == kobj) {
1063 bo->count--;
1064 BUG_ON(bo->count < 0);
1065 if (!bo->count) {
1066 list_del(&bo->list);
1067 del_symlink(bo->sdir, bo->sdev);
1068 del_symlink(bo->hdir, bo->hdev);
1069 bd_holder_release_dirs(bo);
1070 return bo;
1071 }
1072 break;
1073 }
1074 } 863 }
1075 864
1076 return NULL; 865 INIT_LIST_HEAD(&holder->list);
1077} 866 holder->disk = disk;
867 holder->refcnt = 1;
1078 868
1079/** 869 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1080 * bd_claim_by_kobject - bd_claim() with additional kobject signature 870 if (ret)
1081 * 871 goto out_free;
1082 * @bdev: block device to be claimed
1083 * @holder: holder's signature
1084 * @kobj: holder's kobject
1085 *
1086 * Do bd_claim() and if it succeeds, create sysfs symlinks between
1087 * the bdev and the holder's kobject.
1088 * Use bd_release_from_kobject() when releasing the claimed bdev.
1089 *
1090 * Returns 0 on success. (same as bd_claim())
1091 * Returns errno on failure.
1092 */
1093static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
1094 struct kobject *kobj)
1095{
1096 int err;
1097 struct bd_holder *bo, *found;
1098 872
1099 if (!kobj) 873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
1100 return -EINVAL; 874 if (ret)
1101 875 goto out_del;
1102 bo = alloc_bd_holder(kobj);
1103 if (!bo)
1104 return -ENOMEM;
1105
1106 mutex_lock(&bdev->bd_mutex);
1107 876
1108 err = bd_claim(bdev, holder); 877 list_add(&holder->list, &bdev->bd_holder_disks);
1109 if (err) 878 goto out_unlock;
1110 goto fail;
1111 879
1112 found = find_bd_holder(bdev, bo); 880out_del:
1113 if (found) 881 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1114 goto fail; 882out_free:
1115 883 kfree(holder);
1116 err = add_bd_holder(bdev, bo); 884out_unlock:
1117 if (err)
1118 bd_release(bdev);
1119 else
1120 bo = NULL;
1121fail:
1122 mutex_unlock(&bdev->bd_mutex); 885 mutex_unlock(&bdev->bd_mutex);
1123 free_bd_holder(bo); 886 return ret;
1124 return err;
1125} 887}
888EXPORT_SYMBOL_GPL(bd_link_disk_holder);
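Editor's sketch of how a stacking driver would pair these calls, assuming it already holds an exclusive claim on the slave; the function and variable names are illustrative, not taken from the dm code:

/* Illustrative sketch: link a claimed slave bdev under a holding disk.
 * Assumes the caller already owns an exclusive claim on slave_bdev,
 * e.g. via blkdev_get_by_path(..., FMODE_EXCL, holder). */
static int example_attach_slave(struct gendisk *holder_disk,
				struct block_device *slave_bdev)
{
	int ret = bd_link_disk_holder(slave_bdev, holder_disk);

	if (ret)
		return ret;	/* no symlinks were created */
	/* .../slaves/<slave> and .../holders/<holder> now exist;
	 * undo with bd_unlink_disk_holder() before releasing the claim. */
	return 0;
}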
1126 889
1127/** 890/**
1128 * bd_release_from_kobject - bd_release() with additional kobject signature 891 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
892 * @bdev: the claimed slave bdev
893 * @disk: the holding disk
1129 * 894 *
1130 * @bdev: block device to be released 895 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1131 * @kobj: holder's kobject
1132 * 896 *
1133 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). 897 * CONTEXT:
898 * Might sleep.
1134 */ 899 */
1135static void bd_release_from_kobject(struct block_device *bdev, 900void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
1136 struct kobject *kobj)
1137{ 901{
1138 if (!kobj) 902 struct bd_holder_disk *holder;
1139 return;
1140 903
1141 mutex_lock(&bdev->bd_mutex); 904 mutex_lock(&bdev->bd_mutex);
1142 bd_release(bdev);
1143 free_bd_holder(del_bd_holder(bdev, kobj));
1144 mutex_unlock(&bdev->bd_mutex);
1145}
1146 905
1147/** 906 holder = bd_find_holder_disk(bdev, disk);
1148 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
1149 *
1150 * @bdev: block device to be claimed
1151 * @holder: holder's signature
1152 * @disk: holder's gendisk
1153 *
1154 * Call bd_claim_by_kobject() with getting @disk->slave_dir.
1155 */
1156int bd_claim_by_disk(struct block_device *bdev, void *holder,
1157 struct gendisk *disk)
1158{
1159 return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
1160}
1161EXPORT_SYMBOL_GPL(bd_claim_by_disk);
1162 907
1163/** 908 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
1164 * bd_release_from_disk - wrapper function for bd_release_from_kobject() 909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1165 * 910 del_symlink(bdev->bd_part->holder_dir,
1166 * @bdev: block device to be claimed 911 &disk_to_dev(disk)->kobj);
1167 * @disk: holder's gendisk 912 list_del_init(&holder->list);
1168 * 913 kfree(holder);
1169 * Call bd_release_from_kobject() and put @disk->slave_dir. 914 }
1170 */
1171void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
1172{
1173 bd_release_from_kobject(bdev, disk->slave_dir);
1174 kobject_put(disk->slave_dir);
1175}
1176EXPORT_SYMBOL_GPL(bd_release_from_disk);
1177#endif
1178 915
1179/* 916 mutex_unlock(&bdev->bd_mutex);
1180 * Tries to open block device by device number. Use it ONLY if you
1181 * really do not have anything better - i.e. when you are behind a
1182 * truly sucky interface and all you are given is a device number. _Never_
1183 * to be used for internal purposes. If you ever need it - reconsider
1184 * your API.
1185 */
1186struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
1187{
1188 struct block_device *bdev = bdget(dev);
1189 int err = -ENOMEM;
1190 if (bdev)
1191 err = blkdev_get(bdev, mode);
1192 return err ? ERR_PTR(err) : bdev;
1193} 917}
1194 918EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
1195EXPORT_SYMBOL(open_by_devnum); 919#endif
1196 920
1197/** 921/**
1198 * flush_disk - invalidates all buffer-cache entries on a disk 922 * flush_disk - invalidates all buffer-cache entries on a disk
@@ -1288,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
1288{ 1012{
1289 struct gendisk *disk = bdev->bd_disk; 1013 struct gendisk *disk = bdev->bd_disk;
1290 const struct block_device_operations *bdops = disk->fops; 1014 const struct block_device_operations *bdops = disk->fops;
1015 unsigned int events;
1291 1016
1292 if (!bdops->media_changed) 1017 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1293 return 0; 1018 DISK_EVENT_EJECT_REQUEST);
1294 if (!bdops->media_changed(bdev->bd_disk)) 1019 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1295 return 0; 1020 return 0;
1296 1021
1297 flush_disk(bdev); 1022 flush_disk(bdev);
@@ -1390,7 +1115,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1390 bdi = blk_get_backing_dev_info(bdev); 1115 bdi = blk_get_backing_dev_info(bdev);
1391 if (bdi == NULL) 1116 if (bdi == NULL)
1392 bdi = &default_backing_dev_info; 1117 bdi = &default_backing_dev_info;
1393 bdev->bd_inode->i_data.backing_dev_info = bdi; 1118 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1394 } 1119 }
1395 if (bdev->bd_invalidated) 1120 if (bdev->bd_invalidated)
1396 rescan_partitions(disk, bdev); 1121 rescan_partitions(disk, bdev);
@@ -1405,8 +1130,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1405 if (ret) 1130 if (ret)
1406 goto out_clear; 1131 goto out_clear;
1407 bdev->bd_contains = whole; 1132 bdev->bd_contains = whole;
1408 bdev->bd_inode->i_data.backing_dev_info = 1133 bdev_inode_switch_bdi(bdev->bd_inode,
1409 whole->bd_inode->i_data.backing_dev_info; 1134 whole->bd_inode->i_data.backing_dev_info);
1410 bdev->bd_part = disk_get_part(disk, partno); 1135 bdev->bd_part = disk_get_part(disk, partno);
1411 if (!(disk->flags & GENHD_FL_UP) || 1136 if (!(disk->flags & GENHD_FL_UP) ||
1412 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1137 !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1164,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1439 disk_put_part(bdev->bd_part); 1164 disk_put_part(bdev->bd_part);
1440 bdev->bd_disk = NULL; 1165 bdev->bd_disk = NULL;
1441 bdev->bd_part = NULL; 1166 bdev->bd_part = NULL;
1442 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1167 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1443 if (bdev != bdev->bd_contains) 1168 if (bdev != bdev->bd_contains)
1444 __blkdev_put(bdev->bd_contains, mode, 1); 1169 __blkdev_put(bdev->bd_contains, mode, 1);
1445 bdev->bd_contains = NULL; 1170 bdev->bd_contains = NULL;
@@ -1454,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1454 return ret; 1179 return ret;
1455} 1180}
1456 1181
1457int blkdev_get(struct block_device *bdev, fmode_t mode) 1182/**
1183 * blkdev_get - open a block device
1184 * @bdev: block_device to open
1185 * @mode: FMODE_* mask
1186 * @holder: exclusive holder identifier
1187 *
1188 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1189 * opened with exclusive access. Specifying %FMODE_EXCL with %NULL
1190 * @holder is invalid. Exclusive opens may nest for the same @holder.
1191 *
1192 * On success, the reference count of @bdev is unchanged. On failure,
1193 * @bdev is put.
1194 *
1195 * CONTEXT:
1196 * Might sleep.
1197 *
1198 * RETURNS:
1199 * 0 on success, -errno on failure.
1200 */
1201int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1458{ 1202{
1459 return __blkdev_get(bdev, mode, 0); 1203 struct block_device *whole = NULL;
1204 int res;
1205
1206 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1207
1208 if ((mode & FMODE_EXCL) && holder) {
1209 whole = bd_start_claiming(bdev, holder);
1210 if (IS_ERR(whole)) {
1211 bdput(bdev);
1212 return PTR_ERR(whole);
1213 }
1214 }
1215
1216 res = __blkdev_get(bdev, mode, 0);
1217
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) {
1225 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex);
1227 spin_lock(&bdev_lock);
1228
1229 if (!res) {
1230 BUG_ON(!bd_may_claim(bdev, whole, holder));
1231 /*
1232 * Note that for a whole device bd_holders
1233 * will be incremented twice, and bd_holder
1234 * will be set to bd_may_claim before being
1235 * set to holder
1236 */
1237 whole->bd_holders++;
1238 whole->bd_holder = bd_may_claim;
1239 bdev->bd_holders++;
1240 bdev->bd_holder = holder;
1241 }
1242
1243 /* tell others that we're done */
1244 BUG_ON(whole->bd_claiming != holder);
1245 whole->bd_claiming = NULL;
1246 wake_up_bit(&whole->bd_claiming, 0);
1247
1248 spin_unlock(&bdev_lock);
1249
1250 /*
1251 * Block event polling for write claims. Any write
1252 * holder makes the write_holder state stick until all
1253 * are released. This is good enough and tracking
1254 * individual writeable reference is too fragile given
1255 * the way @mode is used in blkdev_get/put().
1256 */
1257 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
1258 bdev->bd_write_holder = true;
1259 disk_block_events(bdev->bd_disk);
1260 }
1261
1262 mutex_unlock(&bdev->bd_mutex);
1263 bdput(whole);
1264 }
1265
1266 return res;
1460} 1267}
1461EXPORT_SYMBOL(blkdev_get); 1268EXPORT_SYMBOL(blkdev_get);
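A hedged sketch of the claim protocol the kerneldoc above describes: the caller supplies an existing reference and a stable holder cookie; names here are illustrative.

/* Illustrative: one exclusive open/release pair. */
static int example_excl_open(struct block_device *bdev, void *holder)
{
	int ret;

	/* caller must already hold a reference (e.g. from bdget());
	 * on failure blkdev_get() drops it for us */
	ret = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, holder);
	if (ret)
		return ret;

	/* ... exclusive user of the device ... */

	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}

Exclusive opens nest for the same holder cookie, so a second blkdev_get() with an identical holder succeeds and needs its own matching blkdev_put().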
1462 1269
1270/**
1271 * blkdev_get_by_path - open a block device by name
1272 * @path: path to the block device to open
1273 * @mode: FMODE_* mask
1274 * @holder: exclusive holder identifier
1275 *
1276 * Open the blockdevice described by the device file at @path. @mode
1277 * and @holder are identical to blkdev_get().
1278 *
1279 * On success, the returned block_device has reference count of one.
1280 *
1281 * CONTEXT:
1282 * Might sleep.
1283 *
1284 * RETURNS:
1285 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1286 */
1287struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1288 void *holder)
1289{
1290 struct block_device *bdev;
1291 int err;
1292
1293 bdev = lookup_bdev(path);
1294 if (IS_ERR(bdev))
1295 return bdev;
1296
1297 err = blkdev_get(bdev, mode, holder);
1298 if (err)
1299 return ERR_PTR(err);
1300
1301 return bdev;
1302}
1303EXPORT_SYMBOL(blkdev_get_by_path);
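This is the helper the btrfs hunks below switch to in place of open_bdev_exclusive(); a minimal sketch of the pattern, with illustrative names:

/* Sketch of the open_bdev_exclusive() replacement pattern. */
static struct block_device *example_open_by_path(const char *path,
						 void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
	if (IS_ERR(bdev))
		return bdev;		/* ERR_PTR(-errno) */
	/* refcount is one; release with blkdev_put() and the same mode */
	return bdev;
}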
1304
1305/**
1306 * blkdev_get_by_dev - open a block device by device number
1307 * @dev: device number of block device to open
1308 * @mode: FMODE_* mask
1309 * @holder: exclusive holder identifier
1310 *
1311 * Open the blockdevice described by device number @dev. @mode and
1312 * @holder are identical to blkdev_get().
1313 *
1314 * Use it ONLY if you really do not have anything better - i.e. when
1315 * you are behind a truly sucky interface and all you are given is a
1316 * device number. _Never_ to be used for internal purposes. If you
1317 * ever need it - reconsider your API.
1318 *
1319 * On success, the returned block_device has reference count of one.
1320 *
1321 * CONTEXT:
1322 * Might sleep.
1323 *
1324 * RETURNS:
1325 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1326 */
1327struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1328{
1329 struct block_device *bdev;
1330 int err;
1331
1332 bdev = bdget(dev);
1333 if (!bdev)
1334 return ERR_PTR(-ENOMEM);
1335
1336 err = blkdev_get(bdev, mode, holder);
1337 if (err)
1338 return ERR_PTR(err);
1339
1340 return bdev;
1341}
1342EXPORT_SYMBOL(blkdev_get_by_dev);
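And the by-number variant, e.g. for a dev_t handed in over a legacy interface; the MKDEV() value is purely illustrative:

/* Illustrative: non-exclusive open by device number. */
static struct block_device *example_open_by_devnum(void)
{
	/* NULL holder is fine because FMODE_EXCL is not set */
	return blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
}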
1343
1463static int blkdev_open(struct inode * inode, struct file * filp) 1344static int blkdev_open(struct inode * inode, struct file * filp)
1464{ 1345{
1465 struct block_device *whole = NULL;
1466 struct block_device *bdev; 1346 struct block_device *bdev;
1467 int res;
1468 1347
1469 /* 1348 /*
1470 * Preserve backwards compatibility and allow large file access 1349 * Preserve backwards compatibility and allow large file access
@@ -1485,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1485 if (bdev == NULL) 1364 if (bdev == NULL)
1486 return -ENOMEM; 1365 return -ENOMEM;
1487 1366
1488 if (filp->f_mode & FMODE_EXCL) {
1489 whole = bd_start_claiming(bdev, filp);
1490 if (IS_ERR(whole)) {
1491 bdput(bdev);
1492 return PTR_ERR(whole);
1493 }
1494 }
1495
1496 filp->f_mapping = bdev->bd_inode->i_mapping; 1367 filp->f_mapping = bdev->bd_inode->i_mapping;
1497 1368
1498 res = blkdev_get(bdev, filp->f_mode); 1369 return blkdev_get(bdev, filp->f_mode, filp);
1499
1500 if (whole) {
1501 if (res == 0)
1502 bd_finish_claiming(bdev, whole, filp);
1503 else
1504 bd_abort_claiming(whole, filp);
1505 }
1506
1507 return res;
1508} 1370}
1509 1371
1510static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1372static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1518,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1518 bdev->bd_part_count--; 1380 bdev->bd_part_count--;
1519 1381
1520 if (!--bdev->bd_openers) { 1382 if (!--bdev->bd_openers) {
1383 WARN_ON_ONCE(bdev->bd_holders);
1521 sync_blockdev(bdev); 1384 sync_blockdev(bdev);
1522 kill_bdev(bdev); 1385 kill_bdev(bdev);
1523 } 1386 }
@@ -1533,7 +1396,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1533 disk_put_part(bdev->bd_part); 1396 disk_put_part(bdev->bd_part);
1534 bdev->bd_part = NULL; 1397 bdev->bd_part = NULL;
1535 bdev->bd_disk = NULL; 1398 bdev->bd_disk = NULL;
1536 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1399 bdev_inode_switch_bdi(bdev->bd_inode,
1400 &default_backing_dev_info);
1537 if (bdev != bdev->bd_contains) 1401 if (bdev != bdev->bd_contains)
1538 victim = bdev->bd_contains; 1402 victim = bdev->bd_contains;
1539 bdev->bd_contains = NULL; 1403 bdev->bd_contains = NULL;
@@ -1547,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1547 1411
1548int blkdev_put(struct block_device *bdev, fmode_t mode) 1412int blkdev_put(struct block_device *bdev, fmode_t mode)
1549{ 1413{
1414 if (mode & FMODE_EXCL) {
1415 bool bdev_free;
1416
1417 /*
1418 * Release a claim on the device. The holder fields
1419 * are protected with bdev_lock. bd_mutex is to
1420 * synchronize disk_holder unlinking.
1421 */
1422 mutex_lock(&bdev->bd_mutex);
1423 spin_lock(&bdev_lock);
1424
1425 WARN_ON_ONCE(--bdev->bd_holders < 0);
1426 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1427
1428 /* bd_contains might point to self, check in a separate step */
1429 if ((bdev_free = !bdev->bd_holders))
1430 bdev->bd_holder = NULL;
1431 if (!bdev->bd_contains->bd_holders)
1432 bdev->bd_contains->bd_holder = NULL;
1433
1434 spin_unlock(&bdev_lock);
1435
1436 /*
1437 * If this was the last claim, remove holder link and
1438 * unblock event polling if it was a write holder.
1439 */
1440 if (bdev_free) {
1441 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk);
1446 }
1447
1448 mutex_unlock(&bdev->bd_mutex);
1449 } else
1450 disk_check_events(bdev->bd_disk);
1451
1550 return __blkdev_put(bdev, mode, 0); 1452 return __blkdev_put(bdev, mode, 0);
1551} 1453}
1552EXPORT_SYMBOL(blkdev_put); 1454EXPORT_SYMBOL(blkdev_put);
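Since the claim teardown above is keyed entirely on @mode, a hedged reminder of the pairing rule, with illustrative names: FMODE_EXCL on put must mirror the get, and a write+exclusive claim keeps disk event polling blocked until the last claim drops.

/* Sketch: modes must match across get/put, including FMODE_EXCL. */
static int example_write_claim(struct block_device *bdev, void *holder)
{
	int ret = blkdev_get(bdev, FMODE_WRITE | FMODE_EXCL, holder);

	if (ret)
		return ret;
	/* bdev->bd_write_holder is set; events on bd_disk are blocked */
	blkdev_put(bdev, FMODE_WRITE | FMODE_EXCL);
	return 0;
}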
@@ -1554,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
1554static int blkdev_close(struct inode * inode, struct file * filp) 1456static int blkdev_close(struct inode * inode, struct file * filp)
1555{ 1457{
1556 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1458 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1557 if (bdev->bd_holder == filp) 1459
1558 bd_release(bdev);
1559 return blkdev_put(bdev, filp->f_mode); 1460 return blkdev_put(bdev, filp->f_mode);
1560} 1461}
1561 1462
@@ -1700,67 +1601,6 @@ fail:
1700} 1601}
1701EXPORT_SYMBOL(lookup_bdev); 1602EXPORT_SYMBOL(lookup_bdev);
1702 1603
1703/**
1704 * open_bdev_exclusive - open a block device by name and set it up for use
1705 *
1706 * @path: special file representing the block device
1707 * @mode: FMODE_... combination to be used
1708 * @holder: owner for exclusion
1709 *
1710 * Open the blockdevice described by the special file at @path, claim it
1711 * for the @holder.
1712 */
1713struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1714{
1715 struct block_device *bdev, *whole;
1716 int error;
1717
1718 bdev = lookup_bdev(path);
1719 if (IS_ERR(bdev))
1720 return bdev;
1721
1722 whole = bd_start_claiming(bdev, holder);
1723 if (IS_ERR(whole)) {
1724 bdput(bdev);
1725 return whole;
1726 }
1727
1728 error = blkdev_get(bdev, mode);
1729 if (error)
1730 goto out_abort_claiming;
1731
1732 error = -EACCES;
1733 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1734 goto out_blkdev_put;
1735
1736 bd_finish_claiming(bdev, whole, holder);
1737 return bdev;
1738
1739out_blkdev_put:
1740 blkdev_put(bdev, mode);
1741out_abort_claiming:
1742 bd_abort_claiming(whole, holder);
1743 return ERR_PTR(error);
1744}
1745
1746EXPORT_SYMBOL(open_bdev_exclusive);
1747
1748/**
1749 * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive()
1750 *
1751 * @bdev: blockdevice to close
1752 * @mode: mode, must match that used to open.
1753 *
1754 * This is the counterpart to open_bdev_exclusive().
1755 */
1756void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
1757{
1758 bd_release(bdev);
1759 blkdev_put(bdev, mode);
1760}
1761
1762EXPORT_SYMBOL(close_bdev_exclusive);
1763
1764int __invalidate_device(struct block_device *bdev) 1604int __invalidate_device(struct block_device *bdev)
1765{ 1605{
1766 struct super_block *sb = get_super(bdev); 1606 struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 889ce1348e64..9c949348510b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -193,18 +193,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
193 return ret; 193 return ret;
194} 194}
195 195
196int btrfs_check_acl(struct inode *inode, int mask) 196int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
197{ 197{
198 struct posix_acl *acl;
199 int error = -EAGAIN; 198 int error = -EAGAIN;
200 199
201 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); 200 if (flags & IPERM_FLAG_RCU) {
201 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
202 error = -ECHILD;
202 203
203 if (IS_ERR(acl)) 204 } else {
204 return PTR_ERR(acl); 205 struct posix_acl *acl;
205 if (acl) { 206 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
206 error = posix_acl_permission(inode, acl, mask); 207 if (IS_ERR(acl))
207 posix_acl_release(acl); 208 return PTR_ERR(acl);
209 if (acl) {
210 error = posix_acl_permission(inode, acl, mask);
211 posix_acl_release(acl);
212 }
208 } 213 }
209 214
210 return error; 215 return error;
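The IPERM_FLAG_RCU branch above follows the rcu-walk contract introduced alongside this series: a permission helper called in rcu-walk mode may not sleep or take locks, so it answers only from cached state and otherwise returns -ECHILD to force a ref-walk retry. A schematic of that contract, assuming the negative_cached_acl() helper from the same series:

/* Schematic of the rcu-walk aware ACL check contract (illustrative). */
static int example_check_acl(struct inode *inode, int mask,
			     unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU) {
		/* lockless context: only a cached "no ACL" result lets
		 * us proceed to the mode-bit check (-EAGAIN) */
		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
			return -ECHILD;
		return -EAGAIN;
	}
	/* ref-walk: free to read the ACL from disk, as btrfs does above */
	return -EAGAIN;
}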
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72195378bef9..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2583,7 +2583,7 @@ do { \
2583 2583
2584/* acl.c */ 2584/* acl.c */
2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2586int btrfs_check_acl(struct inode *inode, int mask); 2586int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2587#else 2587#else
2588#define btrfs_check_acl NULL 2588#define btrfs_check_acl NULL
2589#endif 2589#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b36eeef19194..fdce8799b98d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2127,7 +2127,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2127 if (uptodate) { 2127 if (uptodate) {
2128 set_buffer_uptodate(bh); 2128 set_buffer_uptodate(bh);
2129 } else { 2129 } else {
2130 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2130 if (printk_ratelimit()) {
2131 printk(KERN_WARNING "lost page write due to " 2131 printk(KERN_WARNING "lost page write due to "
2132 "I/O error on %s\n", 2132 "I/O error on %s\n",
2133 bdevname(bh->b_bdev, b)); 2133 bdevname(bh->b_bdev, b));
@@ -2264,21 +2264,10 @@ static int write_dev_supers(struct btrfs_device *device,
2264 bh->b_end_io = btrfs_end_buffer_write_sync; 2264 bh->b_end_io = btrfs_end_buffer_write_sync;
2265 } 2265 }
2266 2266
2267 if (i == last_barrier && do_barriers && device->barriers) { 2267 if (i == last_barrier && do_barriers)
2268 ret = submit_bh(WRITE_BARRIER, bh); 2268 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2269 if (ret == -EOPNOTSUPP) { 2269 else
2270 printk("btrfs: disabling barriers on dev %s\n",
2271 device->name);
2272 set_buffer_uptodate(bh);
2273 device->barriers = 0;
2274 /* one reference for submit_bh */
2275 get_bh(bh);
2276 lock_buffer(bh);
2277 ret = submit_bh(WRITE_SYNC, bh);
2278 }
2279 } else {
2280 ret = submit_bh(WRITE_SYNC, bh); 2270 ret = submit_bh(WRITE_SYNC, bh);
2281 }
2282 2271
2283 if (ret) 2272 if (ret)
2284 errors++; 2273 errors++;
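The -EOPNOTSUPP retry dance could be dropped because the block layer degrades WRITE_FLUSH_FUA transparently on devices without flush/FUA support, so the submission path no longer sees that error. A hedged sketch of the simplified shape, using the generic completion handler:

/* Sketch: barrier write after the WRITE_FLUSH_FUA conversion. */
static int example_write_super_bh(struct buffer_head *bh, int barrier)
{
	get_bh(bh);			/* end_buffer_write_sync() drops it */
	lock_buffer(bh);
	bh->b_end_io = end_buffer_write_sync;
	return submit_bh(barrier ? WRITE_FLUSH_FUA : WRITE_SYNC, bh);
}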
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 3220ad1aafc8..ff27d7a477b2 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65{ 65{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 struct btrfs_root *root; 67 struct btrfs_root *root;
68 struct dentry *dentry;
69 struct inode *inode; 68 struct inode *inode;
70 struct btrfs_key key; 69 struct btrfs_key key;
71 int index; 70 int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 return ERR_PTR(-ESTALE); 107 return ERR_PTR(-ESTALE);
109 } 108 }
110 109
111 dentry = d_obtain_alias(inode); 110 return d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail: 111fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 112 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err); 113 return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 162static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 163{
168 struct inode *dir = child->d_inode; 164 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 165 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 166 struct btrfs_path *path;
172 struct extent_buffer *leaf; 167 struct extent_buffer *leaf;
@@ -225,10 +220,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
225 220
226 key.type = BTRFS_INODE_ITEM_KEY; 221 key.type = BTRFS_INODE_ITEM_KEY;
227 key.offset = 0; 222 key.offset = 0;
228 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 223 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
229 if (!IS_ERR(dentry))
230 dentry->d_op = &btrfs_dentry_operations;
231 return dentry;
232fail: 224fail:
233 btrfs_free_path(path); 225 btrfs_free_path(path);
234 return ERR_PTR(ret); 226 return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 565e22d77b1b..4e7e012ad667 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1743,8 +1743,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1743static void btrfs_issue_discard(struct block_device *bdev, 1743static void btrfs_issue_discard(struct block_device *bdev,
1744 u64 start, u64 len) 1744 u64 start, u64 len)
1745{ 1745{
1746 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1746 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
1747 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1748} 1747}
1749 1748
1750static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1749static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8862dda46ff6..5e76a474cb7e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3083,7 +3083,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3083 eb->len = len; 3083 eb->len = len;
3084 spin_lock_init(&eb->lock); 3084 spin_lock_init(&eb->lock);
3085 init_waitqueue_head(&eb->lock_wq); 3085 init_waitqueue_head(&eb->lock_wq);
3086 INIT_RCU_HEAD(&eb->rcu_head);
3087 3086
3088#if LEAK_DEBUG 3087#if LEAK_DEBUG
3089 spin_lock_irqsave(&leak_lock, flags); 3088 spin_lock_irqsave(&leak_lock, flags);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0ff34b96607..c1d3a818731a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -1258,6 +1259,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1258 return 0; 1259 return 0;
1259} 1260}
1260 1261
1262static long btrfs_fallocate(struct file *file, int mode,
1263 loff_t offset, loff_t len)
1264{
1265 struct inode *inode = file->f_path.dentry->d_inode;
1266 struct extent_state *cached_state = NULL;
1267 u64 cur_offset;
1268 u64 last_byte;
1269 u64 alloc_start;
1270 u64 alloc_end;
1271 u64 alloc_hint = 0;
1272 u64 locked_end;
1273 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1274 struct extent_map *em;
1275 int ret;
1276
1277 alloc_start = offset & ~mask;
1278 alloc_end = (offset + len + mask) & ~mask;
1279
1280 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1281 if (mode & ~FALLOC_FL_KEEP_SIZE)
1282 return -EOPNOTSUPP;
1283
1284 /*
1285 * wait for ordered IO before we have any locks. We'll loop again
1286 * below with the locks held.
1287 */
1288 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1289
1290 mutex_lock(&inode->i_mutex);
1291 ret = inode_newsize_ok(inode, alloc_end);
1292 if (ret)
1293 goto out;
1294
1295 if (alloc_start > inode->i_size) {
1296 ret = btrfs_cont_expand(inode, alloc_start);
1297 if (ret)
1298 goto out;
1299 }
1300
1301 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1302 if (ret)
1303 goto out;
1304
1305 locked_end = alloc_end - 1;
1306 while (1) {
1307 struct btrfs_ordered_extent *ordered;
1308
1309 /* the extent lock is ordered inside the running
1310 * transaction
1311 */
1312 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1313 locked_end, 0, &cached_state, GFP_NOFS);
1314 ordered = btrfs_lookup_first_ordered_extent(inode,
1315 alloc_end - 1);
1316 if (ordered &&
1317 ordered->file_offset + ordered->len > alloc_start &&
1318 ordered->file_offset < alloc_end) {
1319 btrfs_put_ordered_extent(ordered);
1320 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1321 alloc_start, locked_end,
1322 &cached_state, GFP_NOFS);
1323 /*
1324 * we can't wait on the range with the transaction
1325 * running or with the extent lock held
1326 */
1327 btrfs_wait_ordered_range(inode, alloc_start,
1328 alloc_end - alloc_start);
1329 } else {
1330 if (ordered)
1331 btrfs_put_ordered_extent(ordered);
1332 break;
1333 }
1334 }
1335
1336 cur_offset = alloc_start;
1337 while (1) {
1338 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1339 alloc_end - cur_offset, 0);
1340 BUG_ON(IS_ERR(em) || !em);
1341 last_byte = min(extent_map_end(em), alloc_end);
1342 last_byte = (last_byte + mask) & ~mask;
1343 if (em->block_start == EXTENT_MAP_HOLE ||
1344 (cur_offset >= inode->i_size &&
1345 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1346 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1347 last_byte - cur_offset,
1348 1 << inode->i_blkbits,
1349 offset + len,
1350 &alloc_hint);
1351 if (ret < 0) {
1352 free_extent_map(em);
1353 break;
1354 }
1355 }
1356 free_extent_map(em);
1357
1358 cur_offset = last_byte;
1359 if (cur_offset >= alloc_end) {
1360 ret = 0;
1361 break;
1362 }
1363 }
1364 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1365 &cached_state, GFP_NOFS);
1366
1367 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1368out:
1369 mutex_unlock(&inode->i_mutex);
1370 return ret;
1371}
1372
1261const struct file_operations btrfs_file_operations = { 1373const struct file_operations btrfs_file_operations = {
1262 .llseek = generic_file_llseek, 1374 .llseek = generic_file_llseek,
1263 .read = do_sync_read, 1375 .read = do_sync_read,
@@ -1269,6 +1381,7 @@ const struct file_operations btrfs_file_operations = {
1269 .open = generic_file_open, 1381 .open = generic_file_open,
1270 .release = btrfs_release_file, 1382 .release = btrfs_release_file,
1271 .fsync = btrfs_sync_file, 1383 .fsync = btrfs_sync_file,
1384 .fallocate = btrfs_fallocate,
1272 .unlocked_ioctl = btrfs_ioctl, 1385 .unlocked_ioctl = btrfs_ioctl,
1273#ifdef CONFIG_COMPAT 1386#ifdef CONFIG_COMPAT
1274 .compat_ioctl = btrfs_ioctl, 1387 .compat_ioctl = btrfs_ioctl,
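With fallocate moved from the inode operations to the file operations (compare the inode.c hunk below), the userspace entry point is unchanged. A hedged reminder that the btrfs_fallocate() above accepts nothing beyond FALLOC_FL_KEEP_SIZE:

/* Userspace view (illustrative): preallocate 1 MiB without growing
 * i_size; any other mode bit gets -EOPNOTSUPP from btrfs here. */
#define _GNU_SOURCE
#include <fcntl.h>

static int example_prealloc(int fd)
{
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
}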
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c9bc0afdbfc6..bcc461a9695f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3902,7 +3902,7 @@ again:
3902 p = &root->inode_tree.rb_node; 3902 p = &root->inode_tree.rb_node;
3903 parent = NULL; 3903 parent = NULL;
3904 3904
3905 if (hlist_unhashed(&inode->i_hash)) 3905 if (inode_unhashed(inode))
3906 return; 3906 return;
3907 3907
3908 spin_lock(&root->inode_lock); 3908 spin_lock(&root->inode_lock);
@@ -4109,8 +4109,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4109 int index; 4109 int index;
4110 int ret; 4110 int ret;
4111 4111
4112 dentry->d_op = &btrfs_dentry_operations;
4113
4114 if (dentry->d_name.len > BTRFS_NAME_LEN) 4112 if (dentry->d_name.len > BTRFS_NAME_LEN)
4115 return ERR_PTR(-ENAMETOOLONG); 4113 return ERR_PTR(-ENAMETOOLONG);
4116 4114
@@ -4152,7 +4150,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4152 return inode; 4150 return inode;
4153} 4151}
4154 4152
4155static int btrfs_dentry_delete(struct dentry *dentry) 4153static int btrfs_dentry_delete(const struct dentry *dentry)
4156{ 4154{
4157 struct btrfs_root *root; 4155 struct btrfs_root *root;
4158 4156
@@ -4830,7 +4828,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4830 } 4828 }
4831 4829
4832 btrfs_set_trans_block_group(trans, dir); 4830 btrfs_set_trans_block_group(trans, dir);
4833 atomic_inc(&inode->i_count); 4831 ihold(inode);
4834 4832
4835 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 4833 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4836 4834
@@ -6530,6 +6528,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6530 return inode; 6528 return inode;
6531} 6529}
6532 6530
6531static void btrfs_i_callback(struct rcu_head *head)
6532{
6533 struct inode *inode = container_of(head, struct inode, i_rcu);
6534 INIT_LIST_HEAD(&inode->i_dentry);
6535 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6536}
6537
6533void btrfs_destroy_inode(struct inode *inode) 6538void btrfs_destroy_inode(struct inode *inode)
6534{ 6539{
6535 struct btrfs_ordered_extent *ordered; 6540 struct btrfs_ordered_extent *ordered;
@@ -6599,7 +6604,7 @@ void btrfs_destroy_inode(struct inode *inode)
6599 inode_tree_del(inode); 6604 inode_tree_del(inode);
6600 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6605 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6601free: 6606free:
6602 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6607 call_rcu(&inode->i_rcu, btrfs_i_callback);
6603} 6608}
6604 6609
6605int btrfs_drop_inode(struct inode *inode) 6610int btrfs_drop_inode(struct inode *inode)
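The call_rcu() conversion here matches the icache RCU-free pattern applied tree-wide in this series: i_rcu is unioned with i_dentry, so the callback restores the list head before returning the object to the slab. The generic shape, for a hypothetical foofs (cachep and container helpers are placeholders):

/* Hypothetical foofs: RCU-delayed inode free. */
static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlaid i_dentry */
	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* tear down fs-private state first, then defer the free so
	 * rcu-walk lookups never dereference freed memory */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}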
@@ -7128,118 +7133,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
7128 min_size, actual_len, alloc_hint, trans); 7133 min_size, actual_len, alloc_hint, trans);
7129} 7134}
7130 7135
7131static long btrfs_fallocate(struct inode *inode, int mode,
7132 loff_t offset, loff_t len)
7133{
7134 struct extent_state *cached_state = NULL;
7135 u64 cur_offset;
7136 u64 last_byte;
7137 u64 alloc_start;
7138 u64 alloc_end;
7139 u64 alloc_hint = 0;
7140 u64 locked_end;
7141 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
7142 struct extent_map *em;
7143 int ret;
7144
7145 alloc_start = offset & ~mask;
7146 alloc_end = (offset + len + mask) & ~mask;
7147
7148 /*
7149 * wait for ordered IO before we have any locks. We'll loop again
7150 * below with the locks held.
7151 */
7152 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
7153
7154 mutex_lock(&inode->i_mutex);
7155 ret = inode_newsize_ok(inode, alloc_end);
7156 if (ret)
7157 goto out;
7158
7159 if (alloc_start > inode->i_size) {
7160 ret = btrfs_cont_expand(inode, alloc_start);
7161 if (ret)
7162 goto out;
7163 }
7164
7165 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
7166 if (ret)
7167 goto out;
7168
7169 locked_end = alloc_end - 1;
7170 while (1) {
7171 struct btrfs_ordered_extent *ordered;
7172
7173 /* the extent lock is ordered inside the running
7174 * transaction
7175 */
7176 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
7177 locked_end, 0, &cached_state, GFP_NOFS);
7178 ordered = btrfs_lookup_first_ordered_extent(inode,
7179 alloc_end - 1);
7180 if (ordered &&
7181 ordered->file_offset + ordered->len > alloc_start &&
7182 ordered->file_offset < alloc_end) {
7183 btrfs_put_ordered_extent(ordered);
7184 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7185 alloc_start, locked_end,
7186 &cached_state, GFP_NOFS);
7187 /*
7188 * we can't wait on the range with the transaction
7189 * running or with the extent lock held
7190 */
7191 btrfs_wait_ordered_range(inode, alloc_start,
7192 alloc_end - alloc_start);
7193 } else {
7194 if (ordered)
7195 btrfs_put_ordered_extent(ordered);
7196 break;
7197 }
7198 }
7199
7200 cur_offset = alloc_start;
7201 while (1) {
7202 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
7203 alloc_end - cur_offset, 0);
7204 BUG_ON(IS_ERR(em) || !em);
7205 last_byte = min(extent_map_end(em), alloc_end);
7206 last_byte = (last_byte + mask) & ~mask;
7207 if (em->block_start == EXTENT_MAP_HOLE ||
7208 (cur_offset >= inode->i_size &&
7209 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7210 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
7211 last_byte - cur_offset,
7212 1 << inode->i_blkbits,
7213 offset + len,
7214 &alloc_hint);
7215 if (ret < 0) {
7216 free_extent_map(em);
7217 break;
7218 }
7219 }
7220 free_extent_map(em);
7221
7222 cur_offset = last_byte;
7223 if (cur_offset >= alloc_end) {
7224 ret = 0;
7225 break;
7226 }
7227 }
7228 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7229 &cached_state, GFP_NOFS);
7230
7231 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7232out:
7233 mutex_unlock(&inode->i_mutex);
7234 return ret;
7235}
7236
7237static int btrfs_set_page_dirty(struct page *page) 7136static int btrfs_set_page_dirty(struct page *page)
7238{ 7137{
7239 return __set_page_dirty_nobuffers(page); 7138 return __set_page_dirty_nobuffers(page);
7240} 7139}
7241 7140
7242static int btrfs_permission(struct inode *inode, int mask) 7141static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7243{ 7142{
7244 struct btrfs_root *root = BTRFS_I(inode)->root; 7143 struct btrfs_root *root = BTRFS_I(inode)->root;
7245 7144
@@ -7247,7 +7146,7 @@ static int btrfs_permission(struct inode *inode, int mask)
7247 return -EROFS; 7146 return -EROFS;
7248 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7147 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7249 return -EACCES; 7148 return -EACCES;
7250 return generic_permission(inode, mask, btrfs_check_acl); 7149 return generic_permission(inode, mask, flags, btrfs_check_acl);
7251} 7150}
7252 7151
7253static const struct inode_operations btrfs_dir_inode_operations = { 7152static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7340,7 +7239,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7340 .listxattr = btrfs_listxattr, 7239 .listxattr = btrfs_listxattr,
7341 .removexattr = btrfs_removexattr, 7240 .removexattr = btrfs_removexattr,
7342 .permission = btrfs_permission, 7241 .permission = btrfs_permission,
7343 .fallocate = btrfs_fallocate,
7344 .fiemap = btrfs_fiemap, 7242 .fiemap = btrfs_fiemap,
7345}; 7243};
7346static const struct inode_operations btrfs_special_inode_operations = { 7244static const struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0209b5fc772c..a004008f7d28 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -567,6 +567,7 @@ static int btrfs_fill_super(struct super_block *sb,
567 sb->s_maxbytes = MAX_LFS_FILESIZE; 567 sb->s_maxbytes = MAX_LFS_FILESIZE;
568 sb->s_magic = BTRFS_SUPER_MAGIC; 568 sb->s_magic = BTRFS_SUPER_MAGIC;
569 sb->s_op = &btrfs_super_ops; 569 sb->s_op = &btrfs_super_ops;
570 sb->s_d_op = &btrfs_dentry_operations;
570 sb->s_export_op = &btrfs_export_ops; 571 sb->s_export_op = &btrfs_export_ops;
571 sb->s_xattr = btrfs_xattr_handlers; 572 sb->s_xattr = btrfs_xattr_handlers;
572 sb->s_time_gran = 1; 573 sb->s_time_gran = 1;
@@ -698,8 +699,8 @@ static int btrfs_set_super(struct super_block *s, void *data)
698 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 699 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
699 * for multiple device setup. Make sure to keep it in sync. 700 * for multiple device setup. Make sure to keep it in sync.
700 */ 701 */
701static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 702static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
702 const char *dev_name, void *data, struct vfsmount *mnt) 703 const char *dev_name, void *data)
703{ 704{
704 struct block_device *bdev = NULL; 705 struct block_device *bdev = NULL;
705 struct super_block *s; 706 struct super_block *s;
@@ -719,7 +720,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
719 &subvol_name, &subvol_objectid, 720 &subvol_name, &subvol_objectid,
720 &fs_devices); 721 &fs_devices);
721 if (error) 722 if (error)
722 return error; 723 return ERR_PTR(error);
723 724
724 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 725 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
725 if (error) 726 if (error)
@@ -812,11 +813,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
812 root = new_root; 813 root = new_root;
813 } 814 }
814 815
815 mnt->mnt_sb = s;
816 mnt->mnt_root = root;
817
818 kfree(subvol_name); 816 kfree(subvol_name);
819 return 0; 817 return root;
820 818
821error_s: 819error_s:
822 error = PTR_ERR(s); 820 error = PTR_ERR(s);
@@ -826,7 +824,7 @@ error_close_devices:
826 kfree(tree_root); 824 kfree(tree_root);
827error_free_subvol_name: 825error_free_subvol_name:
828 kfree(subvol_name); 826 kfree(subvol_name);
829 return error; 827 return ERR_PTR(error);
830} 828}
831 829
832static int btrfs_remount(struct super_block *sb, int *flags, char *data) 830static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1043,7 +1041,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1043static struct file_system_type btrfs_fs_type = { 1041static struct file_system_type btrfs_fs_type = {
1044 .owner = THIS_MODULE, 1042 .owner = THIS_MODULE,
1045 .name = "btrfs", 1043 .name = "btrfs",
1046 .get_sb = btrfs_get_sb, 1044 .mount = btrfs_mount,
1047 .kill_sb = kill_anon_super, 1045 .kill_sb = kill_anon_super,
1048 .fs_flags = FS_REQUIRES_DEV, 1046 .fs_flags = FS_REQUIRES_DEV,
1049}; 1047};
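btrfs_get_sb() becomes btrfs_mount() as part of the tree-wide ->mount conversion: the method now returns the root dentry (or ERR_PTR) instead of filling in a vfsmount. For a single-device filesystem the conversion usually reduces to mount_bdev(); a sketch for a hypothetical fs, noting that btrfs itself keeps its custom multi-device logic and kill_anon_super as shown above:

/* Hypothetical fs: post-conversion mount method. */
static struct dentry *foofs_mount(struct file_system_type *fs_type,
				  int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, foofs_fill_super);
}

static struct file_system_type foofs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "foofs",
	.mount		= foofs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};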
@@ -1112,6 +1110,7 @@ static const struct file_operations btrfs_ctl_fops = {
1112 .unlocked_ioctl = btrfs_control_ioctl, 1110 .unlocked_ioctl = btrfs_control_ioctl,
1113 .compat_ioctl = btrfs_control_ioctl, 1111 .compat_ioctl = btrfs_control_ioctl,
1114 .owner = THIS_MODULE, 1112 .owner = THIS_MODULE,
1113 .llseek = noop_llseek,
1115}; 1114};
1116 1115
1117static struct miscdevice btrfs_misc = { 1116static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7cad59353b09..2636a051e4b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -399,7 +399,6 @@ static noinline int device_list_add(const char *path,
399 device->work.func = pending_bios_fn; 399 device->work.func = pending_bios_fn;
400 memcpy(device->uuid, disk_super->dev_item.uuid, 400 memcpy(device->uuid, disk_super->dev_item.uuid,
401 BTRFS_UUID_SIZE); 401 BTRFS_UUID_SIZE);
402 device->barriers = 1;
403 spin_lock_init(&device->io_lock); 402 spin_lock_init(&device->io_lock);
404 device->name = kstrdup(path, GFP_NOFS); 403 device->name = kstrdup(path, GFP_NOFS);
405 if (!device->name) { 404 if (!device->name) {
@@ -467,7 +466,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
467 device->devid = orig_dev->devid; 466 device->devid = orig_dev->devid;
468 device->work.func = pending_bios_fn; 467 device->work.func = pending_bios_fn;
469 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 468 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
470 device->barriers = 1;
471 spin_lock_init(&device->io_lock); 469 spin_lock_init(&device->io_lock);
472 INIT_LIST_HEAD(&device->dev_list); 470 INIT_LIST_HEAD(&device->dev_list);
473 INIT_LIST_HEAD(&device->dev_alloc_list); 471 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -496,7 +494,7 @@ again:
496 continue; 494 continue;
497 495
498 if (device->bdev) { 496 if (device->bdev) {
499 close_bdev_exclusive(device->bdev, device->mode); 497 blkdev_put(device->bdev, device->mode);
500 device->bdev = NULL; 498 device->bdev = NULL;
501 fs_devices->open_devices--; 499 fs_devices->open_devices--;
502 } 500 }
@@ -530,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
530 528
531 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
532 if (device->bdev) { 530 if (device->bdev) {
533 close_bdev_exclusive(device->bdev, device->mode); 531 blkdev_put(device->bdev, device->mode);
534 fs_devices->open_devices--; 532 fs_devices->open_devices--;
535 } 533 }
536 if (device->writeable) { 534 if (device->writeable) {
@@ -587,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
587 int seeding = 1; 585 int seeding = 1;
588 int ret = 0; 586 int ret = 0;
589 587
588 flags |= FMODE_EXCL;
589
590 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
591 if (device->bdev) 591 if (device->bdev)
592 continue; 592 continue;
593 if (!device->name) 593 if (!device->name)
594 continue; 594 continue;
595 595
596 bdev = open_bdev_exclusive(device->name, flags, holder); 596 bdev = blkdev_get_by_path(device->name, flags, holder);
597 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
598 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
599 goto error; 599 goto error;
@@ -647,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
647error_brelse: 647error_brelse:
648 brelse(bh); 648 brelse(bh);
649error_close: 649error_close:
650 close_bdev_exclusive(bdev, FMODE_READ); 650 blkdev_put(bdev, flags);
651error: 651error:
652 continue; 652 continue;
653 } 653 }
@@ -693,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
693 693
694 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
695 695
696 bdev = open_bdev_exclusive(path, flags, holder); 696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
697 698
698 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
699 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -725,7 +726,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
725 726
726 brelse(bh); 727 brelse(bh);
727error_close: 728error_close:
728 close_bdev_exclusive(bdev, flags); 729 blkdev_put(bdev, flags);
729error: 730error:
730 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
731 return ret; 732 return ret;
@@ -1299,8 +1300,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1299 goto out; 1300 goto out;
1300 } 1301 }
1301 } else { 1302 } else {
1302 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1303 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1303 root->fs_info->bdev_holder); 1304 root->fs_info->bdev_holder);
1304 if (IS_ERR(bdev)) { 1305 if (IS_ERR(bdev)) {
1305 ret = PTR_ERR(bdev); 1306 ret = PTR_ERR(bdev);
1306 goto out; 1307 goto out;
@@ -1367,7 +1368,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1367 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1368 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1368 1369
1369 if (device->bdev) { 1370 if (device->bdev) {
1370 close_bdev_exclusive(device->bdev, device->mode); 1371 blkdev_put(device->bdev, device->mode);
1371 device->bdev = NULL; 1372 device->bdev = NULL;
1372 device->fs_devices->open_devices--; 1373 device->fs_devices->open_devices--;
1373 } 1374 }
@@ -1410,7 +1411,7 @@ error_brelse:
1410 brelse(bh); 1411 brelse(bh);
1411error_close: 1412error_close:
1412 if (bdev) 1413 if (bdev)
1413 close_bdev_exclusive(bdev, FMODE_READ); 1414 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1414out: 1415out:
1415 mutex_unlock(&root->fs_info->volume_mutex); 1416 mutex_unlock(&root->fs_info->volume_mutex);
1416 mutex_unlock(&uuid_mutex); 1417 mutex_unlock(&uuid_mutex);
@@ -1562,7 +1563,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1562 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1563 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1563 return -EINVAL; 1564 return -EINVAL;
1564 1565
1565 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1566 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1567 root->fs_info->bdev_holder);
1566 if (IS_ERR(bdev)) 1568 if (IS_ERR(bdev))
1567 return PTR_ERR(bdev); 1569 return PTR_ERR(bdev);
1568 1570
@@ -1616,7 +1618,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1616 1618
1617 lock_chunks(root); 1619 lock_chunks(root);
1618 1620
1619 device->barriers = 1;
1620 device->writeable = 1; 1621 device->writeable = 1;
1621 device->work.func = pending_bios_fn; 1622 device->work.func = pending_bios_fn;
1622 generate_random_uuid(device->uuid); 1623 generate_random_uuid(device->uuid);
@@ -1695,7 +1696,7 @@ out:
1695 mutex_unlock(&root->fs_info->volume_mutex); 1696 mutex_unlock(&root->fs_info->volume_mutex);
1696 return ret; 1697 return ret;
1697error: 1698error:
1698 close_bdev_exclusive(bdev, 0); 1699 blkdev_put(bdev, FMODE_EXCL);
1699 if (seeding_dev) { 1700 if (seeding_dev) {
1700 mutex_unlock(&uuid_mutex); 1701 mutex_unlock(&uuid_mutex);
1701 up_write(&sb->s_umount); 1702 up_write(&sb->s_umount);
@@ -3393,7 +3394,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3393 return NULL; 3394 return NULL;
3394 list_add(&device->dev_list, 3395 list_add(&device->dev_list,
3395 &fs_devices->devices); 3396 &fs_devices->devices);
3396 device->barriers = 1;
3397 device->dev_root = root->fs_info->dev_root; 3397 device->dev_root = root->fs_info->dev_root;
3398 device->devid = devid; 3398 device->devid = devid;
3399 device->work.func = pending_bios_fn; 3399 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7af6144a7954..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -45,7 +45,6 @@ struct btrfs_device {
45 int running_pending; 45 int running_pending;
46 u64 generation; 46 u64 generation;
47 47
48 int barriers;
49 int writeable; 48 int writeable;
50 int in_fs_metadata; 49 int in_fs_metadata;
51 int missing; 50 int missing;
@@ -54,7 +53,7 @@ struct btrfs_device {
54 53
55 struct block_device *bdev; 54 struct block_device *bdev;
56 55
57 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
58 fmode_t mode; 57 fmode_t mode;
59 58
60 char *name; 59 char *name;
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..2219a76e2caf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
156 if (uptodate) { 156 if (uptodate) {
157 set_buffer_uptodate(bh); 157 set_buffer_uptodate(bh);
158 } else { 158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { 159 if (!quiet_error(bh)) {
160 buffer_io_error(bh); 160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to " 161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n", 162 "I/O error on %s\n",
@@ -905,7 +905,6 @@ try_again:
905 905
906 bh->b_state = 0; 906 bh->b_state = 0;
907 atomic_set(&bh->b_count, 0); 907 atomic_set(&bh->b_count, 0);
908 bh->b_private = NULL;
909 bh->b_size = size; 908 bh->b_size = size;
910 909
911 /* Link the buffer to its page */ 910 /* Link the buffer to its page */
@@ -1271,12 +1270,10 @@ static inline void check_irqs_on(void)
1271static void bh_lru_install(struct buffer_head *bh) 1270static void bh_lru_install(struct buffer_head *bh)
1272{ 1271{
1273 struct buffer_head *evictee = NULL; 1272 struct buffer_head *evictee = NULL;
1274 struct bh_lru *lru;
1275 1273
1276 check_irqs_on(); 1274 check_irqs_on();
1277 bh_lru_lock(); 1275 bh_lru_lock();
1278 lru = &__get_cpu_var(bh_lrus); 1276 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1279 if (lru->bhs[0] != bh) {
1280 struct buffer_head *bhs[BH_LRU_SIZE]; 1277 struct buffer_head *bhs[BH_LRU_SIZE];
1281 int in; 1278 int in;
1282 int out = 0; 1279 int out = 0;
@@ -1284,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
1284 get_bh(bh); 1281 get_bh(bh);
1285 bhs[out++] = bh; 1282 bhs[out++] = bh;
1286 for (in = 0; in < BH_LRU_SIZE; in++) { 1283 for (in = 0; in < BH_LRU_SIZE; in++) {
1287 struct buffer_head *bh2 = lru->bhs[in]; 1284 struct buffer_head *bh2 =
1285 __this_cpu_read(bh_lrus.bhs[in]);
1288 1286
1289 if (bh2 == bh) { 1287 if (bh2 == bh) {
1290 __brelse(bh2); 1288 __brelse(bh2);
@@ -1299,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
1299 } 1297 }
1300 while (out < BH_LRU_SIZE) 1298 while (out < BH_LRU_SIZE)
1301 bhs[out++] = NULL; 1299 bhs[out++] = NULL;
1302 memcpy(lru->bhs, bhs, sizeof(bhs)); 1300 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1303 } 1301 }
1304 bh_lru_unlock(); 1302 bh_lru_unlock();
1305 1303
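These buffer.c hunks swap __get_cpu_var() pointer dereferences for this_cpu accessors, which fold the per-cpu offset into a single operation. The __this_cpu_* forms are legal here because bh_lru_lock() already makes the section non-preemptible; in preemptible context the unprefixed ops are the safe idiom. An illustrative counter:

/* Illustrative per-cpu counter using this_cpu ops. */
DEFINE_PER_CPU(unsigned long, example_hits);

static void example_count(void)
{
	this_cpu_inc(example_hits);	/* preempt-safe as one operation */
}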
@@ -1314,23 +1312,22 @@ static struct buffer_head *
1314lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1312lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1315{ 1313{
1316 struct buffer_head *ret = NULL; 1314 struct buffer_head *ret = NULL;
1317 struct bh_lru *lru;
1318 unsigned int i; 1315 unsigned int i;
1319 1316
1320 check_irqs_on(); 1317 check_irqs_on();
1321 bh_lru_lock(); 1318 bh_lru_lock();
1322 lru = &__get_cpu_var(bh_lrus);
1323 for (i = 0; i < BH_LRU_SIZE; i++) { 1319 for (i = 0; i < BH_LRU_SIZE; i++) {
1324 struct buffer_head *bh = lru->bhs[i]; 1320 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1325 1321
1326 if (bh && bh->b_bdev == bdev && 1322 if (bh && bh->b_bdev == bdev &&
1327 bh->b_blocknr == block && bh->b_size == size) { 1323 bh->b_blocknr == block && bh->b_size == size) {
1328 if (i) { 1324 if (i) {
1329 while (i) { 1325 while (i) {
1330 lru->bhs[i] = lru->bhs[i - 1]; 1326 __this_cpu_write(bh_lrus.bhs[i],
1327 __this_cpu_read(bh_lrus.bhs[i - 1]));
1331 i--; 1328 i--;
1332 } 1329 }
1333 lru->bhs[0] = bh; 1330 __this_cpu_write(bh_lrus.bhs[0], bh);
1334 } 1331 }
1335 get_bh(bh); 1332 get_bh(bh);
1336 ret = bh; 1333 ret = bh;
@@ -1706,7 +1703,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1706 * and kswapd activity, but those code paths have their own 1703 * and kswapd activity, but those code paths have their own
1707 * higher-level throttling. 1704 * higher-level throttling.
1708 */ 1705 */
1709 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1706 if (wbc->sync_mode != WB_SYNC_NONE) {
1710 lock_buffer(bh); 1707 lock_buffer(bh);
1711 } else if (!trylock_buffer(bh)) { 1708 } else if (!trylock_buffer(bh)) {
1712 redirty_page_for_writepage(wbc, page); 1709 redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1831,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1834} 1831}
1835EXPORT_SYMBOL(page_zero_new_buffers); 1832EXPORT_SYMBOL(page_zero_new_buffers);
1836 1833
1837int block_prepare_write(struct page *page, unsigned from, unsigned to, 1834int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1838 get_block_t *get_block) 1835 get_block_t *get_block)
1839{ 1836{
1837 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1838 unsigned to = from + len;
1840 struct inode *inode = page->mapping->host; 1839 struct inode *inode = page->mapping->host;
1841 unsigned block_start, block_end; 1840 unsigned block_start, block_end;
1842 sector_t block; 1841 sector_t block;
@@ -1916,7 +1915,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
1916 } 1915 }
1917 return err; 1916 return err;
1918} 1917}
1919EXPORT_SYMBOL(block_prepare_write); 1918EXPORT_SYMBOL(__block_write_begin);
1920 1919
1921static int __block_commit_write(struct inode *inode, struct page *page, 1920static int __block_commit_write(struct inode *inode, struct page *page,
1922 unsigned from, unsigned to) 1921 unsigned from, unsigned to)
@@ -1953,15 +1952,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1953 return 0; 1952 return 0;
1954} 1953}
1955 1954
1956int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1957 get_block_t *get_block)
1958{
1959 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1960
1961 return block_prepare_write(page, start, start + len, get_block);
1962}
1963EXPORT_SYMBOL(__block_write_begin);
1964
1965/* 1955/*
1966 * block_write_begin takes care of the basic task of block allocation and 1956 * block_write_begin takes care of the basic task of block allocation and
1967 * bringing partial write blocks uptodate first. 1957 * bringing partial write blocks uptodate first.
@@ -2379,7 +2369,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2379 else 2369 else
2380 end = PAGE_CACHE_SIZE; 2370 end = PAGE_CACHE_SIZE;
2381 2371
2382 ret = block_prepare_write(page, 0, end, get_block); 2372 ret = __block_write_begin(page, 0, end, get_block);
2383 if (!ret) 2373 if (!ret)
2384 ret = block_commit_write(page, 0, end); 2374 ret = block_commit_write(page, 0, end);
2385 2375
@@ -2466,11 +2456,10 @@ int nobh_write_begin(struct address_space *mapping,
2466 *fsdata = NULL; 2456 *fsdata = NULL;
2467 2457
2468 if (page_has_buffers(page)) { 2458 if (page_has_buffers(page)) {
2469 unlock_page(page); 2459 ret = __block_write_begin(page, pos, len, get_block);
2470 page_cache_release(page); 2460 if (unlikely(ret))
2471 *pagep = NULL; 2461 goto out_release;
2472 return block_write_begin(mapping, pos, len, flags, pagep, 2462 return ret;
2473 get_block);
2474 } 2463 }
2475 2464
2476 if (PageMappedToDisk(page)) 2465 if (PageMappedToDisk(page))
@@ -2891,7 +2880,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2891 2880
2892 if (err == -EOPNOTSUPP) { 2881 if (err == -EOPNOTSUPP) {
2893 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2882 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2894 set_bit(BH_Eopnotsupp, &bh->b_state);
2895 } 2883 }
2896 2884
2897 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) 2885 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3019,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3031 bh->b_end_io = end_buffer_write_sync; 3019 bh->b_end_io = end_buffer_write_sync;
3032 ret = submit_bh(rw, bh); 3020 ret = submit_bh(rw, bh);
3033 wait_on_buffer(bh); 3021 wait_on_buffer(bh);
3034 if (buffer_eopnotsupp(bh)) {
3035 clear_buffer_eopnotsupp(bh);
3036 ret = -EOPNOTSUPP;
3037 }
3038 if (!ret && !buffer_uptodate(bh)) 3022 if (!ret && !buffer_uptodate(bh))
3039 ret = -EIO; 3023 ret = -EIO;
3040 } else { 3024 } else {
@@ -3217,22 +3201,23 @@ static void recalc_bh_state(void)
3217 int i; 3201 int i;
3218 int tot = 0; 3202 int tot = 0;
3219 3203
3220 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3204 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3221 return; 3205 return;
3222 __get_cpu_var(bh_accounting).ratelimit = 0; 3206 __this_cpu_write(bh_accounting.ratelimit, 0);
3223 for_each_online_cpu(i) 3207 for_each_online_cpu(i)
3224 tot += per_cpu(bh_accounting, i).nr; 3208 tot += per_cpu(bh_accounting, i).nr;
3225 buffer_heads_over_limit = (tot > max_buffer_heads); 3209 buffer_heads_over_limit = (tot > max_buffer_heads);
3226} 3210}
3227 3211
3228struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3212struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3229{ 3213{
3230 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); 3214 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3231 if (ret) { 3215 if (ret) {
3232 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3216 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3233 get_cpu_var(bh_accounting).nr++; 3217 preempt_disable();
3218 __this_cpu_inc(bh_accounting.nr);
3234 recalc_bh_state(); 3219 recalc_bh_state();
3235 put_cpu_var(bh_accounting); 3220 preempt_enable();
3236 } 3221 }
3237 return ret; 3222 return ret;
3238} 3223}
@@ -3242,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
3242{ 3227{
3243 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3228 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3244 kmem_cache_free(bh_cachep, bh); 3229 kmem_cache_free(bh_cachep, bh);
3245 get_cpu_var(bh_accounting).nr--; 3230 preempt_disable();
3231 __this_cpu_dec(bh_accounting.nr);
3246 recalc_bh_state(); 3232 recalc_bh_state();
3247 put_cpu_var(bh_accounting); 3233 preempt_enable();
3248} 3234}
3249EXPORT_SYMBOL(free_buffer_head); 3235EXPORT_SYMBOL(free_buffer_head);
3250 3236
@@ -3257,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
3257 brelse(b->bhs[i]); 3243 brelse(b->bhs[i]);
3258 b->bhs[i] = NULL; 3244 b->bhs[i] = NULL;
3259 } 3245 }
3260 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3246 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3261 per_cpu(bh_accounting, cpu).nr = 0; 3247 per_cpu(bh_accounting, cpu).nr = 0;
3262 put_cpu_var(bh_accounting);
3263} 3248}
3264 3249
3265static int buffer_cpu_notify(struct notifier_block *self, 3250static int buffer_cpu_notify(struct notifier_block *self,
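The buffer.c hunks above all follow one conversion pattern: open-coded get_cpu_var()/put_cpu_var() pairs become the this_cpu accessor family, with an explicit preempt_disable()/preempt_enable() window around multi-step sequences that must stay on one CPU (alloc_buffer_head, free_buffer_head). A minimal kernel-style sketch of the before/after shape, assuming a trivial per-cpu counter (illustrative only, not from the patch):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, demo_counter);

/* old idiom: get_cpu_var() disables preemption and yields an lvalue,
 * put_cpu_var() re-enables preemption */
static void bump_old(void)
{
	get_cpu_var(demo_counter)++;
	put_cpu_var(demo_counter);
}

/* new idiom: a single __this_cpu_inc(); the explicit preempt window is
 * only needed when several per-cpu operations must hit the same CPU */
static void bump_new(void)
{
	preempt_disable();
	__this_cpu_inc(demo_counter);
	preempt_enable();
}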
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 727caedcdd92..0a1467b15516 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
55 .read = cachefiles_daemon_read, 55 .read = cachefiles_daemon_read,
56 .write = cachefiles_daemon_write, 56 .write = cachefiles_daemon_write,
57 .poll = cachefiles_daemon_poll, 57 .poll = cachefiles_daemon_poll,
58 .llseek = noop_llseek,
58}; 59};
59 60
60struct cachefiles_daemon_cmd { 61struct cachefiles_daemon_cmd {
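The one-line cachefiles change makes the daemon control file's llseek behaviour explicit: noop_llseek() reports success without moving f_pos, which suits a command-style control file. A hedged sketch of the wiring; the handler and names below are hypothetical:

#include <linux/fs.h>
#include <linux/module.h>

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	return 0;	/* hypothetical handler: nothing to read */
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.read	= demo_read,
	.llseek	= noop_llseek,	/* lseek() succeeds, f_pos is untouched */
};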
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
6 select CRYPTO 7 select CRYPTO
8 default n
7 help 9 help
8 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
9 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
14 16
15 If unsure, say N. 17 If unsure, say N.
16 18
17config CEPH_FS_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_FS
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,38 +2,10 @@
2# Makefile for CEPH filesystem. 2# Makefile for CEPH filesystem.
3# 3#
4 4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o 5obj-$(CONFIG_CEPH_FS) += ceph.o
8 6
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 7ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 8 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 10 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38 11
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
193{ 194{
194 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
195 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
196 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
197 int err = 0; 199 int err = 0;
198 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
199 201
@@ -202,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
202 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
203 page->index << PAGE_CACHE_SHIFT, &len, 205 page->index << PAGE_CACHE_SHIFT, &len,
204 ci->i_truncate_seq, ci->i_truncate_size, 206 ci->i_truncate_seq, ci->i_truncate_size,
205 &page, 1); 207 &page, 1, 0);
206 if (err == -ENOENT) 208 if (err == -ENOENT)
207 err = 0; 209 err = 0;
208 if (err < 0) { 210 if (err < 0) {
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
265{ 267{
266 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
267 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
269 int rc = 0; 272 int rc = 0;
270 struct page **pages; 273 struct page **pages;
271 loff_t offset; 274 loff_t offset;
@@ -284,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
284 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
285 offset, &len, 288 offset, &len,
286 ci->i_truncate_seq, ci->i_truncate_size, 289 ci->i_truncate_seq, ci->i_truncate_size,
287 pages, nr_pages); 290 pages, nr_pages, 0);
288 if (rc == -ENOENT) 291 if (rc == -ENOENT)
289 rc = 0; 292 rc = 0;
290 if (rc < 0) 293 if (rc < 0)
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
365{ 368{
366 struct inode *inode; 369 struct inode *inode;
367 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
368 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
369 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
370 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
371 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
383 } 386 }
384 inode = page->mapping->host; 387 inode = page->mapping->host;
385 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
386 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
387 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
388 391
389 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
390 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
415 inode, page, page->index, page_off, len, snapc); 418 inode, page, page->index, page_off, len, snapc);
416 419
417 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
418 if (writeback_stat > 421 if (writeback_stat >
419 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
420 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
421 424
422 set_page_writeback(page); 425 set_page_writeback(page);
423 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
496 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
497 __s32 rc = -EIO; 500 __s32 rc = -EIO;
498 u64 bytes = 0; 501 u64 bytes = 0;
499 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
500 long writeback_stat; 503 long writeback_stat;
501 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
502 505
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
529 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
530 533
531 writeback_stat = 534 writeback_stat =
532 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
533 if (writeback_stat < 536 if (writeback_stat <
534 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
535 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
536 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
537 540
538 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
569 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
570 * may be less than the maximum write size. 573 * may be less than the maximum write size.
571 */ 574 */
572static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
573 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
574{ 577{
575 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
576 GFP_NOFS); 579 GFP_NOFS);
577 if (!req->r_pages) { 580 if (!req->r_pages) {
578 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
579 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
580 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
581 } 584 }
@@ -588,9 +591,8 @@ static int ceph_writepages_start(struct address_space *mapping,
588 struct writeback_control *wbc) 591 struct writeback_control *wbc)
589{ 592{
590 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
591 struct backing_dev_info *bdi = mapping->backing_dev_info;
592 struct ceph_inode_info *ci = ceph_inode(inode); 594 struct ceph_inode_info *ci = ceph_inode(inode);
593 struct ceph_client *client; 595 struct ceph_fs_client *fsc;
594 pgoff_t index, start, end; 596 pgoff_t index, start, end;
595 int range_whole = 0; 597 int range_whole = 0;
596 int should_loop = 1; 598 int should_loop = 1;
@@ -617,26 +619,19 @@ static int ceph_writepages_start(struct address_space *mapping,
617 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 619 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
618 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 620 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
619 621
620 client = ceph_inode_to_client(inode); 622 fsc = ceph_inode_to_client(inode);
621 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 623 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
622 pr_warning("writepage_start %p on forced umount\n", inode); 624 pr_warning("writepage_start %p on forced umount\n", inode);
623 return -EIO; /* we're in a forced umount, don't write! */ 625 return -EIO; /* we're in a forced umount, don't write! */
624 } 626 }
625 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 627 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
626 wsize = client->mount_args->wsize; 628 wsize = fsc->mount_options->wsize;
627 if (wsize < PAGE_CACHE_SIZE) 629 if (wsize < PAGE_CACHE_SIZE)
628 wsize = PAGE_CACHE_SIZE; 630 wsize = PAGE_CACHE_SIZE;
629 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 631 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
630 632
631 pagevec_init(&pvec, 0); 633 pagevec_init(&pvec, 0);
632 634
633 /* ?? */
634 if (wbc->nonblocking && bdi_write_congested(bdi)) {
635 dout(" writepages congested\n");
636 wbc->encountered_congestion = 1;
637 goto out_final;
638 }
639
640 /* where to start/end? */ 635 /* where to start/end? */
641 if (wbc->range_cyclic) { 636 if (wbc->range_cyclic) {
642 start = mapping->writeback_index; /* Start from prev offset */ 637 start = mapping->writeback_index; /* Start from prev offset */
@@ -769,7 +764,7 @@ get_more_pages:
769 offset = (unsigned long long)page->index 764 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT; 765 << PAGE_CACHE_SHIFT;
771 len = wsize; 766 len = wsize;
772 req = ceph_osdc_new_request(&client->osdc, 767 req = ceph_osdc_new_request(&fsc->client->osdc,
773 &ci->i_layout, 768 &ci->i_layout,
774 ceph_vino(inode), 769 ceph_vino(inode),
775 offset, &len, 770 offset, &len,
@@ -779,10 +774,10 @@ get_more_pages:
779 snapc, do_sync, 774 snapc, do_sync,
780 ci->i_truncate_seq, 775 ci->i_truncate_seq,
781 ci->i_truncate_size, 776 ci->i_truncate_size,
782 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1, 0);
783 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
784 779
785 alloc_page_vec(client, req); 780 alloc_page_vec(fsc, req);
786 req->r_callback = writepages_finish; 781 req->r_callback = writepages_finish;
787 req->r_inode = inode; 782 req->r_inode = inode;
788 } 783 }
@@ -794,10 +789,10 @@ get_more_pages:
794 inode, page, page->index); 789 inode, page, page->index);
795 790
796 writeback_stat = 791 writeback_stat =
797 atomic_long_inc_return(&client->writeback_count); 792 atomic_long_inc_return(&fsc->writeback_count);
798 if (writeback_stat > CONGESTION_ON_THRESH( 793 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) { 794 fsc->mount_options->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info, 795 set_bdi_congested(&fsc->backing_dev_info,
801 BLK_RW_ASYNC); 796 BLK_RW_ASYNC);
802 } 797 }
803 798
@@ -846,7 +841,7 @@ get_more_pages:
846 op->payload_len = cpu_to_le32(len); 841 op->payload_len = cpu_to_le32(len);
847 req->r_request->hdr.data_len = cpu_to_le32(len); 842 req->r_request->hdr.data_len = cpu_to_le32(len);
848 843
849 ceph_osdc_start_request(&client->osdc, req, true); 844 ceph_osdc_start_request(&fsc->client->osdc, req, true);
850 req = NULL; 845 req = NULL;
851 846
852 /* continue? */ 847 /* continue? */
@@ -882,7 +877,6 @@ out:
882 rc = 0; /* vfs expects us to return 0 */ 877 rc = 0; /* vfs expects us to return 0 */
883 ceph_put_snap_context(snapc); 878 ceph_put_snap_context(snapc);
884 dout("writepages done, rc = %d\n", rc); 879 dout("writepages done, rc = %d\n", rc);
885out_final:
886 return rc; 880 return rc;
887} 881}
888 882
@@ -915,7 +909,7 @@ static int ceph_update_writeable_page(struct file *file,
915{ 909{
916 struct inode *inode = file->f_dentry->d_inode; 910 struct inode *inode = file->f_dentry->d_inode;
917 struct ceph_inode_info *ci = ceph_inode(inode); 911 struct ceph_inode_info *ci = ceph_inode(inode);
918 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 912 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
919 loff_t page_off = pos & PAGE_CACHE_MASK; 913 loff_t page_off = pos & PAGE_CACHE_MASK;
920 int pos_in_page = pos & ~PAGE_CACHE_MASK; 914 int pos_in_page = pos & ~PAGE_CACHE_MASK;
921 int end_in_page = pos_in_page + len; 915 int end_in_page = pos_in_page + len;
@@ -1053,8 +1047,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1053 struct page *page, void *fsdata) 1047 struct page *page, void *fsdata)
1054{ 1048{
1055 struct inode *inode = file->f_dentry->d_inode; 1049 struct inode *inode = file->f_dentry->d_inode;
1056 struct ceph_client *client = ceph_inode_to_client(inode); 1050 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1057 struct ceph_mds_client *mdsc = &client->mdsc; 1051 struct ceph_mds_client *mdsc = fsc->mdsc;
1058 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1052 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1059 int check_cap = 0; 1053 int check_cap = 0;
1060 1054
@@ -1123,7 +1117,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1123{ 1117{
1124 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1118 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1125 struct page *page = vmf->page; 1119 struct page *page = vmf->page;
1126 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1120 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1127 loff_t off = page->index << PAGE_CACHE_SHIFT; 1121 loff_t off = page->index << PAGE_CACHE_SHIFT;
1128 loff_t size, len; 1122 loff_t size, len;
1129 int ret; 1123 int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
1
2#include <linux/errno.h>
3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
7/*
8 * base64 encode/decode.
9 */
10
11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
13
14static int encode_bits(int c)
15{
16 return pem_key[c];
17}
18
19static int decode_bits(char c)
20{
21 if (c >= 'A' && c <= 'Z')
22 return c - 'A';
23 if (c >= 'a' && c <= 'z')
24 return c - 'a' + 26;
25 if (c >= '0' && c <= '9')
26 return c - '0' + 52;
27 if (c == '+')
28 return 62;
29 if (c == '/')
30 return 63;
31 if (c == '=')
32 return 0; /* just non-negative, please */
33 return -EINVAL;
34}
35
36int ceph_armor(char *dst, const char *src, const char *end)
37{
38 int olen = 0;
39 int line = 0;
40
41 while (src < end) {
42 unsigned char a, b, c;
43
44 a = *src++;
45 *dst++ = encode_bits(a >> 2);
46 if (src < end) {
47 b = *src++;
48 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
49 if (src < end) {
50 c = *src++;
51 *dst++ = encode_bits(((b & 15) << 2) |
52 (c >> 6));
53 *dst++ = encode_bits(c & 63);
54 } else {
55 *dst++ = encode_bits((b & 15) << 2);
56 *dst++ = '=';
57 }
58 } else {
59 *dst++ = encode_bits(((a & 3) << 4));
60 *dst++ = '=';
61 *dst++ = '=';
62 }
63 olen += 4;
64 line += 4;
65 if (line == 64) {
66 line = 0;
67 *(dst++) = '\n';
68 olen++;
69 }
70 }
71 return olen;
72}
73
74int ceph_unarmor(char *dst, const char *src, const char *end)
75{
76 int olen = 0;
77
78 while (src < end) {
79 int a, b, c, d;
80
81 if (src < end && src[0] == '\n')
82 src++;
83 if (src + 4 > end)
84 return -EINVAL;
85 a = decode_bits(src[0]);
86 b = decode_bits(src[1]);
87 c = decode_bits(src[2]);
88 d = decode_bits(src[3]);
89 if (a < 0 || b < 0 || c < 0 || d < 0)
90 return -EINVAL;
91
92 *dst++ = (a << 2) | (b >> 4);
93 if (src[2] == '=')
94 return olen + 1;
95 *dst++ = ((b & 15) << 4) | (c >> 2);
96 if (src[3] == '=')
97 return olen + 2;
98 *dst++ = ((c & 3) << 6) | d;
99 olen += 3;
100 src += 4;
101 }
102 return olen;
103}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 const char *name;
19
20 /*
21 * true if we are authenticated and can connect to
22 * services.
23 */
24 int (*is_authenticated)(struct ceph_auth_client *ac);
25
26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
33 * build requests and process replies during monitor
34 * handshake. if handle_reply returns -EAGAIN, we build
35 * another request.
36 */
37 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
38 int (*handle_reply)(struct ceph_auth_client *ac, int result,
39 void *buf, void *end);
40
41 /*
42 * Create authorizer for connecting to a service, and verify
43 * the response to authenticate the service.
44 */
45 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
46 struct ceph_authorizer **a,
47 void **buf, size_t *len,
48 void **reply_buf, size_t *reply_len);
49 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
50 struct ceph_authorizer *a, size_t len);
51 void (*destroy_authorizer)(struct ceph_auth_client *ac,
52 struct ceph_authorizer *a);
53 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
54 int peer_type);
55
56 /* reset when we (re)connect to a monitor */
57 void (*reset)(struct ceph_auth_client *ac);
58
59 void (*destroy)(struct ceph_auth_client *ac);
60};
61
62struct ceph_auth_client {
63 u32 protocol; /* CEPH_AUTH_* */
64 void *private; /* for use by protocol implementation */
65 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
66
67 bool negotiating; /* true if negotiating protocol */
68 const char *name; /* entity name */
69 u64 global_id; /* our unique id in system */
70 const char *secret; /* our secret key */
71 unsigned want_keys; /* which services we want */
72};
73
74extern struct ceph_auth_client *ceph_auth_init(const char *name,
75 const char *secret);
76extern void ceph_auth_destroy(struct ceph_auth_client *ac);
77
78extern void ceph_auth_reset(struct ceph_auth_client *ac);
79
80extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
81 void *buf, size_t len);
82extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
83 void *buf, size_t len,
84 void *reply_buf, size_t reply_len);
85extern int ceph_entity_name_encode(const char *name, void **p, void *end);
86
87extern int ceph_build_auth(struct ceph_auth_client *ac,
88 void *msg_buf, size_t msg_len);
89
90extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
91
92#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
41/*
42 * the generic auth code decodes the global_id, and we carry no actual
43 * authentication state, so nothing happens here.
44 */
45static int handle_reply(struct ceph_auth_client *ac, int result,
46 void *buf, void *end)
47{
48 struct ceph_auth_none_info *xi = ac->private;
49
50 xi->starting = false;
51 return result;
52}
53
54/*
55 * build an 'authorizer' with our entity_name and global_id. we can
56 * reuse a single static copy since it is identical for all services
57 * we connect to.
58 */
59static int ceph_auth_none_create_authorizer(
60 struct ceph_auth_client *ac, int peer_type,
61 struct ceph_authorizer **a,
62 void **buf, size_t *len,
63 void **reply_buf, size_t *reply_len)
64{
65 struct ceph_auth_none_info *ai = ac->private;
66 struct ceph_none_authorizer *au = &ai->au;
67 void *p, *end;
68 int ret;
69
70 if (!ai->built_authorizer) {
71 p = au->buf;
72 end = p + sizeof(au->buf);
73 ceph_encode_8(&p, 1);
74 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
75 if (ret < 0)
76 goto bad;
77 ceph_decode_need(&p, end, sizeof(u64), bad2);
78 ceph_encode_64(&p, ac->global_id);
79 au->buf_len = p - (void *)au->buf;
80 ai->built_authorizer = true;
81 dout("built authorizer len %d\n", au->buf_len);
82 }
83
84 *a = (struct ceph_authorizer *)au;
85 *buf = au->buf;
86 *len = au->buf_len;
87 *reply_buf = au->reply_buf;
88 *reply_len = sizeof(au->reply_buf);
89 return 0;
90
91bad2:
92 ret = -ERANGE;
93bad:
94 return ret;
95}
96
97static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
98 struct ceph_authorizer *a)
99{
100 /* nothing to do */
101}
102
103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
105 .reset = reset,
106 .destroy = destroy,
107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
109 .handle_reply = handle_reply,
110 .create_authorizer = ceph_auth_none_create_authorizer,
111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
112};
113
114int ceph_auth_none_init(struct ceph_auth_client *ac)
115{
116 struct ceph_auth_none_info *xi;
117
118 dout("ceph_auth_none_init %p\n", ac);
119 xi = kzalloc(sizeof(*xi), GFP_NOFS);
120 if (!xi)
121 return -ENOMEM;
122
123 xi->starting = true;
124 xi->built_authorizer = false;
125
126 ac->protocol = CEPH_AUTH_NONE;
127 ac->private = xi;
128 ac->ops = &ceph_auth_none_ops;
129 return 0;
130}
131
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
41static int ceph_x_encrypt_buflen(int ilen)
42{
43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
44 sizeof(u32);
45}
46
47static int ceph_x_encrypt(struct ceph_crypto_key *secret,
48 void *ibuf, int ilen, void *obuf, size_t olen)
49{
50 struct ceph_x_encrypt_header head = {
51 .struct_v = 1,
52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
53 };
54 size_t len = olen - sizeof(u32);
55 int ret;
56
57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
58 &head, sizeof(head), ibuf, ilen);
59 if (ret)
60 return ret;
61 ceph_encode_32(&obuf, len);
62 return len + sizeof(u32);
63}
64
65static int ceph_x_decrypt(struct ceph_crypto_key *secret,
66 void **p, void *end, void *obuf, size_t olen)
67{
68 struct ceph_x_encrypt_header head;
69 size_t head_len = sizeof(head);
70 int len, ret;
71
72 len = ceph_decode_32(p);
73 if (*p + len > end)
74 return -EINVAL;
75
76 dout("ceph_x_decrypt len %d\n", len);
77 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
78 *p, len);
79 if (ret)
80 return ret;
81 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
82 return -EPERM;
83 *p += len;
84 return olen;
85}
86
87/*
88 * get existing (or insert new) ticket handler
89 */
90static struct ceph_x_ticket_handler *
91get_ticket_handler(struct ceph_auth_client *ac, int service)
92{
93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private;
95 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
96
97 while (*p) {
98 parent = *p;
99 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
100 if (service < th->service)
101 p = &(*p)->rb_left;
102 else if (service > th->service)
103 p = &(*p)->rb_right;
104 else
105 return th;
106 }
107
108 /* add it */
109 th = kzalloc(sizeof(*th), GFP_NOFS);
110 if (!th)
111 return ERR_PTR(-ENOMEM);
112 th->service = service;
113 rb_link_node(&th->node, parent, p);
114 rb_insert_color(&th->node, &xi->ticket_handlers);
115 return th;
116}
117
118static void remove_ticket_handler(struct ceph_auth_client *ac,
119 struct ceph_x_ticket_handler *th)
120{
121 struct ceph_x_info *xi = ac->private;
122
123 dout("remove_ticket_handler %p %d\n", th, th->service);
124 rb_erase(&th->node, &xi->ticket_handlers);
125 ceph_crypto_key_destroy(&th->session_key);
126 if (th->ticket_blob)
127 ceph_buffer_put(th->ticket_blob);
128 kfree(th);
129}
130
131static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
132 struct ceph_crypto_key *secret,
133 void *buf, void *end)
134{
135 struct ceph_x_info *xi = ac->private;
136 int num;
137 void *p = buf;
138 int ret;
139 char *dbuf;
140 char *ticket_buf;
141 u8 reply_struct_v;
142
143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
144 if (!dbuf)
145 return -ENOMEM;
146
147 ret = -ENOMEM;
148 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
149 if (!ticket_buf)
150 goto out_dbuf;
151
152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
153 reply_struct_v = ceph_decode_8(&p);
154 if (reply_struct_v != 1)
155 goto bad;
156 num = ceph_decode_32(&p);
157 dout("%d tickets\n", num);
158 while (num--) {
159 int type;
160 u8 tkt_struct_v, blob_struct_v;
161 struct ceph_x_ticket_handler *th;
162 void *dp, *dend;
163 int dlen;
164 char is_enc;
165 struct timespec validity;
166 struct ceph_crypto_key old_key;
167 void *tp, *tpend;
168 struct ceph_timespec new_validity;
169 struct ceph_crypto_key new_session_key;
170 struct ceph_buffer *new_ticket_blob;
171 unsigned long new_expires, new_renew_after;
172 u64 new_secret_id;
173
174 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
175
176 type = ceph_decode_32(&p);
177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
178
179 tkt_struct_v = ceph_decode_8(&p);
180 if (tkt_struct_v != 1)
181 goto bad;
182
183 th = get_ticket_handler(ac, type);
184 if (IS_ERR(th)) {
185 ret = PTR_ERR(th);
186 goto out;
187 }
188
189 /* blob for me */
190 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
191 TEMP_TICKET_BUF_LEN);
192 if (dlen <= 0) {
193 ret = dlen;
194 goto out;
195 }
196 dout(" decrypted %d bytes\n", dlen);
197 dend = dbuf + dlen;
198 dp = dbuf;
199
200 tkt_struct_v = ceph_decode_8(&dp);
201 if (tkt_struct_v != 1)
202 goto bad;
203
204 memcpy(&old_key, &th->session_key, sizeof(old_key));
205 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
206 if (ret)
207 goto out;
208
209 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
210 ceph_decode_timespec(&validity, &new_validity);
211 new_expires = get_seconds() + validity.tv_sec;
212 new_renew_after = new_expires - (validity.tv_sec / 4);
213 dout(" expires=%lu renew_after=%lu\n", new_expires,
214 new_renew_after);
215
216 /* ticket blob for service */
217 ceph_decode_8_safe(&p, end, is_enc, bad);
218 tp = ticket_buf;
219 if (is_enc) {
220 /* encrypted */
221 dout(" encrypted ticket\n");
222 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
223 TEMP_TICKET_BUF_LEN);
224 if (dlen < 0) {
225 ret = dlen;
226 goto out;
227 }
228 dlen = ceph_decode_32(&tp);
229 } else {
230 /* unencrypted */
231 ceph_decode_32_safe(&p, end, dlen, bad);
232 ceph_decode_need(&p, end, dlen, bad);
233 ceph_decode_copy(&p, ticket_buf, dlen);
234 }
235 tpend = tp + dlen;
236 dout(" ticket blob is %d bytes\n", dlen);
237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
238 blob_struct_v = ceph_decode_8(&tp);
239 new_secret_id = ceph_decode_64(&tp);
240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
241 if (ret)
242 goto out;
243
244 /* all is well, update our ticket */
245 ceph_crypto_key_destroy(&th->session_key);
246 if (th->ticket_blob)
247 ceph_buffer_put(th->ticket_blob);
248 th->session_key = new_session_key;
249 th->ticket_blob = new_ticket_blob;
250 th->validity = new_validity;
251 th->secret_id = new_secret_id;
252 th->expires = new_expires;
253 th->renew_after = new_renew_after;
254 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
255 type, ceph_entity_type_name(type), th->secret_id,
256 (int)th->ticket_blob->vec.iov_len);
257 xi->have_keys |= th->service;
258 }
259
260 ret = 0;
261out:
262 kfree(ticket_buf);
263out_dbuf:
264 kfree(dbuf);
265 return ret;
266
267bad:
268 ret = -EINVAL;
269 goto out;
270}
271
272static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
273 struct ceph_x_ticket_handler *th,
274 struct ceph_x_authorizer *au)
275{
276 int maxlen;
277 struct ceph_x_authorize_a *msg_a;
278 struct ceph_x_authorize_b msg_b;
279 void *p, *end;
280 int ret;
281 int ticket_blob_len =
282 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
283
284 dout("build_authorizer for %s %p\n",
285 ceph_entity_type_name(th->service), au);
286
287 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
288 ceph_x_encrypt_buflen(ticket_blob_len);
289 dout(" need len %d\n", maxlen);
290 if (au->buf && au->buf->alloc_len < maxlen) {
291 ceph_buffer_put(au->buf);
292 au->buf = NULL;
293 }
294 if (!au->buf) {
295 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
296 if (!au->buf)
297 return -ENOMEM;
298 }
299 au->service = th->service;
300
301 msg_a = au->buf->vec.iov_base;
302 msg_a->struct_v = 1;
303 msg_a->global_id = cpu_to_le64(ac->global_id);
304 msg_a->service_id = cpu_to_le32(th->service);
305 msg_a->ticket_blob.struct_v = 1;
306 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
307 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
308 if (ticket_blob_len) {
309 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
310 th->ticket_blob->vec.iov_len);
311 }
312 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
313 le64_to_cpu(msg_a->ticket_blob.secret_id));
314
315 p = msg_a + 1;
316 p += ticket_blob_len;
317 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
318
319 get_random_bytes(&au->nonce, sizeof(au->nonce));
320 msg_b.struct_v = 1;
321 msg_b.nonce = cpu_to_le64(au->nonce);
322 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
323 p, end - p);
324 if (ret < 0)
325 goto out_buf;
326 p += ret;
327 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
328 dout(" built authorizer nonce %llx len %d\n", au->nonce,
329 (int)au->buf->vec.iov_len);
330 BUG_ON(au->buf->vec.iov_len > maxlen);
331 return 0;
332
333out_buf:
334 ceph_buffer_put(au->buf);
335 au->buf = NULL;
336 return ret;
337}
338
339static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
340 void **p, void *end)
341{
342 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
343 ceph_encode_8(p, 1);
344 ceph_encode_64(p, th->secret_id);
345 if (th->ticket_blob) {
346 const char *buf = th->ticket_blob->vec.iov_base;
347 u32 len = th->ticket_blob->vec.iov_len;
348
349 ceph_encode_32_safe(p, end, len, bad);
350 ceph_encode_copy_safe(p, end, buf, len, bad);
351 } else {
352 ceph_encode_32_safe(p, end, 0, bad);
353 }
354
355 return 0;
356bad:
357 return -ERANGE;
358}
359
360static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
361{
362 int want = ac->want_keys;
363 struct ceph_x_info *xi = ac->private;
364 int service;
365
366 *pneed = ac->want_keys & ~(xi->have_keys);
367
368 for (service = 1; service <= want; service <<= 1) {
369 struct ceph_x_ticket_handler *th;
370
371 if (!(ac->want_keys & service))
372 continue;
373
374 if (*pneed & service)
375 continue;
376
377 th = get_ticket_handler(ac, service);
378
379 if (IS_ERR(th)) {
380 *pneed |= service;
381 continue;
382 }
383
384 if (get_seconds() >= th->renew_after)
385 *pneed |= service;
386 if (get_seconds() >= th->expires)
387 xi->have_keys &= ~service;
388 }
389}
390
391
392static int ceph_x_build_request(struct ceph_auth_client *ac,
393 void *buf, void *end)
394{
395 struct ceph_x_info *xi = ac->private;
396 int need;
397 struct ceph_x_request_header *head = buf;
398 int ret;
399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
405 ceph_x_validate_tickets(ac, &need);
406
407 dout("build_request want %x have %x need %x\n",
408 ac->want_keys, xi->have_keys, need);
409
410 if (need & CEPH_ENTITY_TYPE_AUTH) {
411 struct ceph_x_authenticate *auth = (void *)(head + 1);
412 void *p = auth + 1;
413 struct ceph_x_challenge_blob tmp;
414 char tmp_enc[40];
415 u64 *u;
416
417 if (p > end)
418 return -ERANGE;
419
420 dout(" get_auth_session_key\n");
421 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
422
423 /* encrypt and hash */
424 get_random_bytes(&auth->client_challenge, sizeof(u64));
425 tmp.client_challenge = auth->client_challenge;
426 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
427 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
428 tmp_enc, sizeof(tmp_enc));
429 if (ret < 0)
430 return ret;
431
432 auth->struct_v = 1;
433 auth->key = 0;
434 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
435 auth->key ^= *(__le64 *)u;
436 dout(" server_challenge %llx client_challenge %llx key %llx\n",
437 xi->server_challenge, le64_to_cpu(auth->client_challenge),
438 le64_to_cpu(auth->key));
439
440 /* now encode the old ticket if it exists */
441 ret = ceph_x_encode_ticket(th, &p, end);
442 if (ret < 0)
443 return ret;
444
445 return p - buf;
446 }
447
448 if (need) {
449 void *p = head + 1;
450 struct ceph_x_service_ticket_request *req;
451
452 if (p > end)
453 return -ERANGE;
454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
455
456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
457 if (ret)
458 return ret;
459 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
460 xi->auth_authorizer.buf->vec.iov_len);
461
462 req = p;
463 req->keys = cpu_to_le32(need);
464 p += sizeof(*req);
465 return p - buf;
466 }
467
468 return 0;
469}
470
471static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
472 void *buf, void *end)
473{
474 struct ceph_x_info *xi = ac->private;
475 struct ceph_x_reply_header *head = buf;
476 struct ceph_x_ticket_handler *th;
477 int len = end - buf;
478 int op;
479 int ret;
480
481 if (result)
482 return result; /* XXX hmm? */
483
484 if (xi->starting) {
485 /* it's a hello */
486 struct ceph_x_server_challenge *sc = buf;
487
488 if (len != sizeof(*sc))
489 return -EINVAL;
490 xi->server_challenge = le64_to_cpu(sc->server_challenge);
491 dout("handle_reply got server challenge %llx\n",
492 xi->server_challenge);
493 xi->starting = false;
494 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
495 return -EAGAIN;
496 }
497
498 op = le16_to_cpu(head->op);
499 result = le32_to_cpu(head->result);
500 dout("handle_reply op %d result %d\n", op, result);
501 switch (op) {
502 case CEPHX_GET_AUTH_SESSION_KEY:
503 /* verify auth key */
504 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
505 buf + sizeof(*head), end);
506 break;
507
508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
510 if (IS_ERR(th))
511 return PTR_ERR(th);
512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
513 buf + sizeof(*head), end);
514 break;
515
516 default:
517 return -EINVAL;
518 }
519 if (ret)
520 return ret;
521 if (ac->want_keys == xi->have_keys)
522 return 0;
523 return -EAGAIN;
524}
525
526static int ceph_x_create_authorizer(
527 struct ceph_auth_client *ac, int peer_type,
528 struct ceph_authorizer **a,
529 void **buf, size_t *len,
530 void **reply_buf, size_t *reply_len)
531{
532 struct ceph_x_authorizer *au;
533 struct ceph_x_ticket_handler *th;
534 int ret;
535
536 th = get_ticket_handler(ac, peer_type);
537 if (IS_ERR(th))
538 return PTR_ERR(th);
539
540 au = kzalloc(sizeof(*au), GFP_NOFS);
541 if (!au)
542 return -ENOMEM;
543
544 ret = ceph_x_build_authorizer(ac, th, au);
545 if (ret) {
546 kfree(au);
547 return ret;
548 }
549
550 *a = (struct ceph_authorizer *)au;
551 *buf = au->buf->vec.iov_base;
552 *len = au->buf->vec.iov_len;
553 *reply_buf = au->reply_buf;
554 *reply_len = sizeof(au->reply_buf);
555 return 0;
556}
557
558static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
559 struct ceph_authorizer *a, size_t len)
560{
561 struct ceph_x_authorizer *au = (void *)a;
562 struct ceph_x_ticket_handler *th;
563 int ret = 0;
564 struct ceph_x_authorize_reply reply;
565 void *p = au->reply_buf;
566 void *end = p + sizeof(au->reply_buf);
567
568 th = get_ticket_handler(ac, au->service);
569 if (IS_ERR(th))
570 return PTR_ERR(th);
571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
572 if (ret < 0)
573 return ret;
574 if (ret != sizeof(reply))
575 return -EPERM;
576
577 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
578 ret = -EPERM;
579 else
580 ret = 0;
581 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
582 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
583 return ret;
584}
585
586static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
587 struct ceph_authorizer *a)
588{
589 struct ceph_x_authorizer *au = (void *)a;
590
591 ceph_buffer_put(au->buf);
592 kfree(au);
593}
594
595
596static void ceph_x_reset(struct ceph_auth_client *ac)
597{
598 struct ceph_x_info *xi = ac->private;
599
600 dout("reset\n");
601 xi->starting = true;
602 xi->server_challenge = 0;
603}
604
605static void ceph_x_destroy(struct ceph_auth_client *ac)
606{
607 struct ceph_x_info *xi = ac->private;
608 struct rb_node *p;
609
610 dout("ceph_x_destroy %p\n", ac);
611 ceph_crypto_key_destroy(&xi->secret);
612
613 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
614 struct ceph_x_ticket_handler *th =
615 rb_entry(p, struct ceph_x_ticket_handler, node);
616 remove_ticket_handler(ac, th);
617 }
618
619 if (xi->auth_authorizer.buf)
620 ceph_buffer_put(xi->auth_authorizer.buf);
621
622 kfree(ac->private);
623 ac->private = NULL;
624}
625
626static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
627 int peer_type)
628{
629 struct ceph_x_ticket_handler *th;
630
631 th = get_ticket_handler(ac, peer_type);
632 if (!IS_ERR(th))
633 remove_ticket_handler(ac, th);
634}
635
636
637static const struct ceph_auth_client_ops ceph_x_ops = {
638 .name = "x",
639 .is_authenticated = ceph_x_is_authenticated,
640 .should_authenticate = ceph_x_should_authenticate,
641 .build_request = ceph_x_build_request,
642 .handle_reply = ceph_x_handle_reply,
643 .create_authorizer = ceph_x_create_authorizer,
644 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
645 .destroy_authorizer = ceph_x_destroy_authorizer,
646 .invalidate_authorizer = ceph_x_invalidate_authorizer,
647 .reset = ceph_x_reset,
648 .destroy = ceph_x_destroy,
649};
650
651
652int ceph_x_init(struct ceph_auth_client *ac)
653{
654 struct ceph_x_info *xi;
655 int ret;
656
657 dout("ceph_x_init %p\n", ac);
658 ret = -ENOMEM;
659 xi = kzalloc(sizeof(*xi), GFP_NOFS);
660 if (!xi)
661 goto out;
662
663 ret = -EINVAL;
664 if (!ac->secret) {
665 pr_err("no secret set (for auth_x protocol)\n");
666 goto out_nomem;
667 }
668
669 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
670 if (ret)
671 goto out_nomem;
672
673 xi->starting = true;
674 xi->ticket_handlers = RB_ROOT;
675
676 ac->protocol = CEPH_AUTH_CEPHX;
677 ac->private = xi;
678 ac->ops = &ceph_x_ops;
679 return 0;
680
681out_nomem:
682 kfree(xi);
683out:
684 return ret;
685}
686
687
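
Review note: the authorizer handshake above hinges on a simple freshness check -- ceph_x_verify_authorizer_reply() accepts the server only if it echoes back the client's nonce incremented by one, which proves it could decrypt the authorizer with the shared session key. A minimal userspace sketch of just that check (the decryption step and kernel types are elided; plain uint64_t stands in for __le64):

    #include <stdint.h>
    #include <stdio.h>

    struct reply { uint64_t nonce_plus_one; };

    static int verify_nonce(uint64_t sent_nonce, const struct reply *r)
    {
            /* server must prove it decrypted our authorizer: it returns
             * our nonce incremented by one (re-encrypted with the
             * session key in the real code) */
            return (sent_nonce + 1 == r->nonce_plus_one) ? 0 : -1 /* -EPERM */;
    }

    int main(void)
    {
            struct reply good = { .nonce_plus_one = 43 };
            struct reply bad  = { .nonce_plus_one = 99 };

            printf("good: %d\n", verify_nonce(42, &good));  /* 0 */
            printf("bad:  %d\n", verify_nonce(42, &bad));   /* -1 */
            return 0;
    }
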
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
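
Review note: ticket_handlers above is an rb_root keyed by the service id, so each peer type (mon/mds/osd) gets its own cached ticket. A hypothetical sketch of the lookup shape, with a plain binary search tree standing in for the kernel's rb_node machinery (the real lookup lives in get_ticket_handler() in auth_x.c):

    #include <stddef.h>

    struct ticket_handler {
            struct ticket_handler *left, *right;
            unsigned service;               /* key: peer service type */
    };

    static struct ticket_handler *find_handler(struct ticket_handler *root,
                                               unsigned service)
    {
            while (root) {
                    if (service < root->service)
                            root = root->left;
                    else if (service > root->service)
                            root = root->right;
                    else
                            return root;    /* cached ticket for this service */
            }
            return NULL;                    /* caller allocates a new handler */
    }
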
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
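
Review note: every wire structure in this header carries __attribute__((packed)), and it matters -- without it the compiler would pad the one-byte struct_v out to the alignment of the following __le64, changing the on-wire layout. A standalone sketch of the difference (plain uint types stand in for __u8/__le64; the padded size is what a typical LP64 ABI produces):

    #include <stdint.h>
    #include <stdio.h>

    struct authorize_b_packed {
            uint8_t  struct_v;
            uint64_t nonce;
    } __attribute__((packed));

    struct authorize_b_padded {
            uint8_t  struct_v;
            uint64_t nonce;
    };

    int main(void)
    {
            /* 9 bytes on the wire vs (typically) 16 with natural padding */
            printf("packed: %zu\n", sizeof(struct authorize_b_packed));
            printf("padded: %zu\n", sizeof(struct authorize_b_padded));
            return 0;
    }
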
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
51{
52 size_t len;
53
54 ceph_decode_need(p, end, sizeof(u32), bad);
55 len = ceph_decode_32(p);
56 dout("decode_buffer len %d\n", (int)len);
57 ceph_decode_need(p, end, len, bad);
58 *b = ceph_buffer_new(len, GFP_NOFS);
59 if (!*b)
60 return -ENOMEM;
61 ceph_decode_copy(p, (*b)->vec.iov_base, len);
62 return 0;
63bad:
64 return -EINVAL;
65}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
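
Review note: ceph_buffer combines two patterns worth seeing in isolation -- an opportunistic kmalloc with a vmalloc fallback for large sizes, and kref-based lifetime management where the last put frees the data. A userspace sketch under those assumptions (malloc stands in for both allocators, and a plain counter stands in for kref, so unlike the kernel version this is not thread-safe):

    #include <stdlib.h>

    struct buf {
            int    refs;
            void  *data;
            size_t len;
    };

    static struct buf *buf_new(size_t len)
    {
            struct buf *b = malloc(sizeof(*b));
            if (!b)
                    return NULL;
            b->data = malloc(len);          /* stands in for kmalloc/vmalloc */
            if (!b->data) {
                    free(b);
                    return NULL;
            }
            b->refs = 1;
            b->len = len;
            return b;
    }

    static struct buf *buf_get(struct buf *b) { b->refs++; return b; }

    static void buf_put(struct buf *b)
    {
            if (--b->refs == 0) {           /* last reference frees the data */
                    free(b->data);
                    free(b);
            }
    }
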
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a151..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
@@ -9,8 +9,9 @@
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10 10
11#include "super.h" 11#include "super.h"
12#include "decode.h" 12#include "mds_client.h"
13#include "messenger.h" 13#include <linux/ceph/decode.h>
14#include <linux/ceph/messenger.h>
14 15
15/* 16/*
16 * Capability management 17 * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
287 spin_unlock(&mdsc->caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
288} 289}
289 290
290void ceph_reservation_status(struct ceph_client *client, 291void ceph_reservation_status(struct ceph_fs_client *fsc,
291 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
292 int *min) 293 int *min)
293{ 294{
294 struct ceph_mds_client *mdsc = &client->mdsc; 295 struct ceph_mds_client *mdsc = fsc->mdsc;
295 296
296 if (total) 297 if (total)
297 *total = mdsc->caps_total_count; 298 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
399static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 400static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
400 struct ceph_inode_info *ci) 401 struct ceph_inode_info *ci)
401{ 402{
402 struct ceph_mount_args *ma = mdsc->client->mount_args; 403 struct ceph_mount_options *ma = mdsc->fsc->mount_options;
403 404
404 ci->i_hold_caps_min = round_jiffies(jiffies + 405 ci->i_hold_caps_min = round_jiffies(jiffies +
405 ma->caps_wanted_delay_min * HZ); 406 ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
515 unsigned seq, unsigned mseq, u64 realmino, int flags, 516 unsigned seq, unsigned mseq, u64 realmino, int flags,
516 struct ceph_cap_reservation *caps_reservation) 517 struct ceph_cap_reservation *caps_reservation)
517{ 518{
518 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 519 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
519 struct ceph_inode_info *ci = ceph_inode(inode); 520 struct ceph_inode_info *ci = ceph_inode(inode);
520 struct ceph_cap *new_cap = NULL; 521 struct ceph_cap *new_cap = NULL;
521 struct ceph_cap *cap; 522 struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
873 struct ceph_mds_session *session = cap->session; 874 struct ceph_mds_session *session = cap->session;
874 struct ceph_inode_info *ci = cap->ci; 875 struct ceph_inode_info *ci = cap->ci;
875 struct ceph_mds_client *mdsc = 876 struct ceph_mds_client *mdsc =
876 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
877 int removed = 0; 878 int removed = 0;
878 879
879 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
1210 int mds; 1211 int mds;
1211 struct ceph_cap_snap *capsnap; 1212 struct ceph_cap_snap *capsnap;
1212 u32 mseq; 1213 u32 mseq;
1213 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1214 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1215 session->s_mutex */ 1216 session->s_mutex */
1216 u64 next_follows = 0; /* keep track of how far we've gotten through the 1217 u64 next_follows = 0; /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1336void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1337void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1337{ 1338{
1338 struct ceph_mds_client *mdsc = 1339 struct ceph_mds_client *mdsc =
1339 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 1340 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1340 struct inode *inode = &ci->vfs_inode; 1341 struct inode *inode = &ci->vfs_inode;
1341 int was = ci->i_dirty_caps; 1342 int was = ci->i_dirty_caps;
1342 int dirty = 0; 1343 int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1378static int __mark_caps_flushing(struct inode *inode, 1379static int __mark_caps_flushing(struct inode *inode,
1379 struct ceph_mds_session *session) 1380 struct ceph_mds_session *session)
1380{ 1381{
1381 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1382 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1382 struct ceph_inode_info *ci = ceph_inode(inode); 1383 struct ceph_inode_info *ci = ceph_inode(inode);
1383 int flushing; 1384 int flushing;
1384 1385
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
1416/* 1417/*
1417 * try to invalidate mapping pages without blocking. 1418 * try to invalidate mapping pages without blocking.
1418 */ 1419 */
1419static int mapping_is_empty(struct address_space *mapping)
1420{
1421 struct page *page = find_get_page(mapping, 0);
1422
1423 if (!page)
1424 return 1;
1425
1426 put_page(page);
1427 return 0;
1428}
1429
1430static int try_nonblocking_invalidate(struct inode *inode) 1420static int try_nonblocking_invalidate(struct inode *inode)
1431{ 1421{
1432 struct ceph_inode_info *ci = ceph_inode(inode); 1422 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,12 +1426,12 @@ static int try_nonblocking_invalidate(struct inode *inode)
1436 invalidate_mapping_pages(&inode->i_data, 0, -1); 1426 invalidate_mapping_pages(&inode->i_data, 0, -1);
1437 spin_lock(&inode->i_lock); 1427 spin_lock(&inode->i_lock);
1438 1428
1439 if (mapping_is_empty(&inode->i_data) && 1429 if (inode->i_data.nrpages == 0 &&
1440 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1441 /* success. */ 1431 /* success. */
1442 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
1443 ci->i_rdcache_gen = 0; 1433 /* save any racing async invalidate some trouble */
1444 ci->i_rdcache_revoking = 0; 1434 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1445 return 0; 1435 return 0;
1446 } 1436 }
1447 dout("try_nonblocking_invalidate %p failed\n", inode); 1437 dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1462void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1452void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1463 struct ceph_mds_session *session) 1453 struct ceph_mds_session *session)
1464{ 1454{
1465 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1455 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1466 struct ceph_mds_client *mdsc = &client->mdsc; 1456 struct ceph_mds_client *mdsc = fsc->mdsc;
1467 struct inode *inode = &ci->vfs_inode; 1457 struct inode *inode = &ci->vfs_inode;
1468 struct ceph_cap *cap; 1458 struct ceph_cap *cap;
1469 int file_wanted, used; 1459 int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
1533 */ 1523 */
1534 if ((!is_delayed || mdsc->stopping) && 1524 if ((!is_delayed || mdsc->stopping) &&
1535 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1525 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1536 ci->i_rdcache_gen && /* may have cached pages */ 1526 inode->i_data.nrpages && /* have cached pages */
1537 (file_wanted == 0 || /* no open files */ 1527 (file_wanted == 0 || /* no open files */
1538 (revoking & (CEPH_CAP_FILE_CACHE| 1528 (revoking & (CEPH_CAP_FILE_CACHE|
1539 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ 1529 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1570,9 +1560,10 @@ retry_locked:
1570 /* NOTE: no side-effects allowed, until we take s_mutex */ 1560 /* NOTE: no side-effects allowed, until we take s_mutex */
1571 1561
1572 revoking = cap->implemented & ~cap->issued; 1562 revoking = cap->implemented & ~cap->issued;
1573 if (revoking) 1563 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1574 dout(" mds%d revoking %s\n", cap->mds, 1564 cap->mds, cap, ceph_cap_string(cap->issued),
1575 ceph_cap_string(revoking)); 1565 ceph_cap_string(cap->implemented),
1566 ceph_cap_string(revoking));
1576 1567
1577 if (cap == ci->i_auth_cap && 1568 if (cap == ci->i_auth_cap &&
1578 (cap->issued & CEPH_CAP_FILE_WR)) { 1569 (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1668,6 +1659,8 @@ ack:
1668 1659
1669 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1660 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1670 flushing = __mark_caps_flushing(inode, session); 1661 flushing = __mark_caps_flushing(inode, session);
1662 else
1663 flushing = 0;
1671 1664
1672 mds = cap->mds; /* remember mds, so we don't repeat */ 1665 mds = cap->mds; /* remember mds, so we don't repeat */
1673 sent++; 1666 sent++;
@@ -1706,7 +1699,7 @@ ack:
1706static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1699static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1707 unsigned *flush_tid) 1700 unsigned *flush_tid)
1708{ 1701{
1709 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1702 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1710 struct ceph_inode_info *ci = ceph_inode(inode); 1703 struct ceph_inode_info *ci = ceph_inode(inode);
1711 int unlock_session = session ? 0 : 1; 1704 int unlock_session = session ? 0 : 1;
1712 int flushing = 0; 1705 int flushing = 0;
@@ -1872,7 +1865,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1872 caps_are_flushed(inode, flush_tid)); 1865 caps_are_flushed(inode, flush_tid));
1873 } else { 1866 } else {
1874 struct ceph_mds_client *mdsc = 1867 struct ceph_mds_client *mdsc =
1875 &ceph_sb_to_client(inode->i_sb)->mdsc; 1868 ceph_sb_to_client(inode->i_sb)->mdsc;
1876 1869
1877 spin_lock(&inode->i_lock); 1870 spin_lock(&inode->i_lock);
1878 if (__ceph_caps_dirty(ci)) 1871 if (__ceph_caps_dirty(ci))
@@ -1950,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1950 } 1943 }
1951} 1944}
1952 1945
1946static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session,
1948 struct inode *inode)
1949{
1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap;
1952 int delayed = 0;
1953
1954 spin_lock(&inode->i_lock);
1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL);
1965 if (delayed) {
1966 spin_lock(&inode->i_lock);
1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&inode->i_lock);
1969 }
1970 } else {
1971 spin_unlock(&inode->i_lock);
1972 }
1973}
1974
1953 1975
1954/* 1976/*
1955 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
@@ -2283,8 +2305,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2283{ 2305{
2284 struct ceph_inode_info *ci = ceph_inode(inode); 2306 struct ceph_inode_info *ci = ceph_inode(inode);
2285 int mds = session->s_mds; 2307 int mds = session->s_mds;
2286 unsigned seq = le32_to_cpu(grant->seq); 2308 int seq = le32_to_cpu(grant->seq);
2287 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2288 int newcaps = le32_to_cpu(grant->caps); 2309 int newcaps = le32_to_cpu(grant->caps);
2289 int issued, implemented, used, wanted, dirty; 2310 int issued, implemented, used, wanted, dirty;
2290 u64 size = le64_to_cpu(grant->size); 2311 u64 size = le64_to_cpu(grant->size);
@@ -2296,8 +2317,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2296 int revoked_rdcache = 0; 2317 int revoked_rdcache = 0;
2297 int queue_invalidate = 0; 2318 int queue_invalidate = 0;
2298 2319
2299 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", 2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2300 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); 2321 inode, cap, mds, seq, ceph_cap_string(newcaps));
2301 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2302 inode->i_size); 2323 inode->i_size);
2303 2324
@@ -2393,7 +2414,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2393 } 2414 }
2394 2415
2395 cap->seq = seq; 2416 cap->seq = seq;
2396 cap->issue_seq = issue_seq;
2397 2417
2398 /* file layout may have changed */ 2418 /* file layout may have changed */
2399 ci->i_layout = grant->layout; 2419 ci->i_layout = grant->layout;
@@ -2465,7 +2485,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2465 __releases(inode->i_lock) 2485 __releases(inode->i_lock)
2466{ 2486{
2467 struct ceph_inode_info *ci = ceph_inode(inode); 2487 struct ceph_inode_info *ci = ceph_inode(inode);
2468 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2469 unsigned seq = le32_to_cpu(m->seq); 2489 unsigned seq = le32_to_cpu(m->seq);
2470 int dirty = le32_to_cpu(m->dirty); 2490 int dirty = le32_to_cpu(m->dirty);
2471 int cleaned = 0; 2491 int cleaned = 0;
@@ -2699,8 +2719,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2699 ceph_add_cap(inode, session, cap_id, -1, 2719 ceph_add_cap(inode, session, cap_id, -1,
2700 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2720 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2701 NULL /* no caps context */); 2721 NULL /* no caps context */);
2702 try_flush_caps(inode, session, NULL); 2722 kick_flushing_inode_caps(mdsc, session, inode);
2703 up_read(&mdsc->snap_rwsem); 2723 up_read(&mdsc->snap_rwsem);
2724
2725 /* make sure we re-request max_size, if necessary */
2726 spin_lock(&inode->i_lock);
2727 ci->i_requested_max_size = 0;
2728 spin_unlock(&inode->i_lock);
2704} 2729}
2705 2730
2706/* 2731/*
@@ -2713,7 +2738,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2713 struct ceph_msg *msg) 2738 struct ceph_msg *msg)
2714{ 2739{
2715 struct ceph_mds_client *mdsc = session->s_mdsc; 2740 struct ceph_mds_client *mdsc = session->s_mdsc;
2716 struct super_block *sb = mdsc->client->sb; 2741 struct super_block *sb = mdsc->fsc->sb;
2717 struct inode *inode; 2742 struct inode *inode;
2718 struct ceph_cap *cap; 2743 struct ceph_cap *cap;
2719 struct ceph_mds_caps *h; 2744 struct ceph_mds_caps *h;
@@ -2792,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2792 case CEPH_CAP_OP_IMPORT: 2817 case CEPH_CAP_OP_IMPORT:
2793 handle_cap_import(mdsc, inode, h, session, 2818 handle_cap_import(mdsc, inode, h, session,
2794 snaptrace, snaptrace_len); 2819 snaptrace, snaptrace_len);
2795 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2820 ceph_check_caps(ceph_inode(inode), 0, session);
2796 session);
2797 goto done_unlocked; 2821 goto done_unlocked;
2798 } 2822 }
2799 2823
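
Review note: the hunk in try_nonblocking_invalidate() replaces the old mapping_is_empty() probe of page 0 with a direct nrpages == 0 test, which also catches mappings whose first page is gone but later pages remain; success still requires that the rdcache generation did not change underneath us. A standalone sketch of that success condition (struct address_space is stubbed down to the one field used):

    #include <stdio.h>

    struct address_space { unsigned long nrpages; };  /* stub of kernel type */

    static int invalidate_succeeded(const struct address_space *m,
                                    unsigned gen_before, unsigned gen_now)
    {
            /* empty mapping *and* no racing read faulted pages back in */
            return m->nrpages == 0 && gen_before == gen_now;
    }

    int main(void)
    {
            struct address_space m = { .nrpages = 0 };
            printf("%d\n", invalidate_succeeded(&m, 7, 7));  /* 1: success */
            printf("%d\n", invalidate_succeeded(&m, 7, 8));  /* 0: raced */
            return 0;
    }
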
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
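
Review note: the pretty-debug variant of dout() above buys a filename:lineno prefix at the cost of an extra function call per call site. A userspace sketch of the same trick (printf stands in for pr_debug, and file_part() approximates ceph_file_part() by keeping at most the last 12 characters of the name):

    #include <stdio.h>

    static const char *file_part(const char *s, int len)
    {
            /* len includes the NUL, so len - 13 leaves 12 characters */
            return len > 13 ? s + len - 13 : s;
    }

    #define dout(fmt, ...) \
            printf(" %12.12s:%-4d : " fmt, \
                   file_part(__FILE__, sizeof(__FILE__)), \
                   __LINE__, ##__VA_ARGS__)

    int main(void)
    {
            dout("mounted, root ino %d\n", 1);
            return 0;
    }
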
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * Ceph 'frag' type 2 * Ceph 'frag' type
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6int ceph_frag_compare(__u32 a, __u32 b) 7int ceph_frag_compare(__u32 a, __u32 b)
7{ 8{
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
1#ifndef FS_CEPH_FRAG_H
2#define FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
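
Review note: the frag encoding above is easiest to see with a worked example. With bits=1 the 24-bit value space splits in two: the lower half encodes as 0x1000000 and the upper half as 0x1800000, and containment is just a masked compare. A standalone sketch with the relevant helpers copied in:

    #include <stdio.h>

    static unsigned frag_make(unsigned b, unsigned v)
    {
            return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
    }
    static unsigned frag_mask(unsigned f)
    {
            return (0xffffffu << (24 - (f >> 24))) & 0xffffffu;
    }
    static int frag_contains(unsigned f, unsigned v)
    {
            return (v & frag_mask(f)) == (f & 0xffffffu);
    }

    int main(void)
    {
            unsigned lower = frag_make(1, 0x000000);   /* 0x1000000 */
            unsigned upper = frag_make(1, 0x800000);   /* 0x1800000 */

            printf("lower=%#x upper=%#x\n", lower, upper);
            printf("0x400000 in lower? %d\n", frag_contains(lower, 0x400000)); /* 1 */
            printf("0xc00000 in lower? %d\n", frag_contains(lower, 0xc00000)); /* 0 */
            return 0;
    }
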
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32 int mode;
33
34#ifdef O_DIRECTORY /* fixme */
35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
36 return CEPH_FILE_MODE_PIN;
37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
48#ifdef O_LAZY
49 if (flags & O_LAZY)
50 mode |= CEPH_FILE_MODE_LAZY;
51#endif
52
53 return mode;
54}
55
56int ceph_caps_for_mode(int mode)
57{
58 int caps = CEPH_CAP_PIN;
59
60 if (mode & CEPH_FILE_MODE_RD)
61 caps |= CEPH_CAP_FILE_SHARED |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
63 if (mode & CEPH_FILE_MODE_WR)
64 caps |= CEPH_CAP_FILE_EXCL |
65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
68 if (mode & CEPH_FILE_MODE_LAZY)
69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
72}
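
Review note: the core of ceph_flags_to_mode() above is the O_ACCMODE mapping plus the rule that O_APPEND implies write intent. A standalone illustration of just that mapping (the O_DIRECTORY and O_LAZY special cases are omitted):

    #include <stdio.h>
    #include <fcntl.h>

    #define MODE_RD   1
    #define MODE_WR   2
    #define MODE_RDWR 3

    static int flags_to_mode(int flags)
    {
            if ((flags & O_APPEND) == O_APPEND)
                    flags |= O_WRONLY;      /* append implies write intent */

            if ((flags & O_ACCMODE) == O_RDWR)
                    return MODE_RDWR;
            if ((flags & O_ACCMODE) == O_WRONLY)
                    return MODE_WR;
            return MODE_RD;
    }

    int main(void)
    {
            printf("O_RDONLY          -> %d\n", flags_to_mode(O_RDONLY));            /* 1 */
            printf("O_WRONLY|O_APPEND -> %d\n", flags_to_mode(O_WRONLY | O_APPEND)); /* 2 */
            printf("O_RDWR            -> %d\n", flags_to_mode(O_RDWR));              /* 3 */
            return 0;
    }
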
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef CEPH_FS_H
13#define CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * subprotocol versions. when specific message types or high-level
20 * protocols change, bump the affected components. we rev
21 * internal cluster protocols separately from the public,
22 * client-facing protocol.
23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */
30
31
32#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
34
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31
37
38
39/*
40 * feature bits
41 */
42#define CEPH_FEATURE_UID (1<<0)
43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
46
47
48/*
49 * ceph_file_layout - describe data layout for a file/inode
50 */
51struct ceph_file_layout {
52 /* file -> object mapping */
53 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
54 of page size. */
55 __le32 fl_stripe_count; /* over this many objects */
56 __le32 fl_object_size; /* until objects are this big, then move to
57 new objects */
58 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
59
60 /* pg -> disk layout */
61 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
62
63 /* object -> pg layout */
64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
65 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
66} __attribute__ ((packed));
67
68#define CEPH_MIN_STRIPE_UNIT 65536
69
70int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
71
72
73/* crypto algorithms */
74#define CEPH_CRYPTO_NONE 0x0
75#define CEPH_CRYPTO_AES 0x1
76
77#define CEPH_AES_IV "cephsageyudagreg"
78
79/* security/authentication protocols */
80#define CEPH_AUTH_UNKNOWN 0x0
81#define CEPH_AUTH_NONE 0x1
82#define CEPH_AUTH_CEPHX 0x2
83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
86
87/*********************************************
88 * message layer
89 */
90
91/*
92 * message types
93 */
94
95/* misc */
96#define CEPH_MSG_SHUTDOWN 1
97#define CEPH_MSG_PING 2
98
99/* client <-> monitor */
100#define CEPH_MSG_MON_MAP 4
101#define CEPH_MSG_MON_GET_MAP 5
102#define CEPH_MSG_STATFS 13
103#define CEPH_MSG_STATFS_REPLY 14
104#define CEPH_MSG_MON_SUBSCRIBE 15
105#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
106#define CEPH_MSG_AUTH 17
107#define CEPH_MSG_AUTH_REPLY 18
108
109/* client <-> mds */
110#define CEPH_MSG_MDS_MAP 21
111
112#define CEPH_MSG_CLIENT_SESSION 22
113#define CEPH_MSG_CLIENT_RECONNECT 23
114
115#define CEPH_MSG_CLIENT_REQUEST 24
116#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
117#define CEPH_MSG_CLIENT_REPLY 26
118#define CEPH_MSG_CLIENT_CAPS 0x310
119#define CEPH_MSG_CLIENT_LEASE 0x311
120#define CEPH_MSG_CLIENT_SNAP 0x312
121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
128/* osd */
129#define CEPH_MSG_OSD_MAP 41
130#define CEPH_MSG_OSD_OP 42
131#define CEPH_MSG_OSD_OPREPLY 43
132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
144struct ceph_mon_request_header {
145 __le64 have_version;
146 __le16 session_mon;
147 __le64 session_mon_tid;
148} __attribute__ ((packed));
149
150struct ceph_mon_statfs {
151 struct ceph_mon_request_header monhdr;
152 struct ceph_fsid fsid;
153} __attribute__ ((packed));
154
155struct ceph_statfs {
156 __le64 kb, kb_used, kb_avail;
157 __le64 num_objects;
158} __attribute__ ((packed));
159
160struct ceph_mon_statfs_reply {
161 struct ceph_fsid fsid;
162 __le64 version;
163 struct ceph_statfs st;
164} __attribute__ ((packed));
165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
191struct ceph_osd_getmap {
192 struct ceph_mon_request_header monhdr;
193 struct ceph_fsid fsid;
194 __le32 start;
195} __attribute__ ((packed));
196
197struct ceph_mds_getmap {
198 struct ceph_mon_request_header monhdr;
199 struct ceph_fsid fsid;
200} __attribute__ ((packed));
201
202struct ceph_client_mount {
203 struct ceph_mon_request_header monhdr;
204} __attribute__ ((packed));
205
206struct ceph_mon_subscribe_item {
207 __le64 have_version; __le64 have;
208 __u8 onetime;
209} __attribute__ ((packed));
210
211struct ceph_mon_subscribe_ack {
212 __le32 duration; /* seconds */
213 struct ceph_fsid fsid;
214} __attribute__ ((packed));
215
216/*
217 * mds states
218 * > 0 -> in
219 * <= 0 -> out
220 */
221#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
222#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
223 empty log. */
224#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
225#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
226#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
227#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
228#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
229
230#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
231#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
232 operations (import, rename, etc.) */
233#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
234#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
235#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
236#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
237#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
238
239extern const char *ceph_mds_state_name(int s);
240
241
242/*
243 * metadata lock types.
244 * - these are bitmasks.. we can compose them
245 * - they also define the lock ordering by the MDS
246 * - a few of these are internal to the mds
247 */
248#define CEPH_LOCK_DVERSION 1
249#define CEPH_LOCK_DN 2
250#define CEPH_LOCK_ISNAP 16
251#define CEPH_LOCK_IVERSION 32 /* mds internal */
252#define CEPH_LOCK_IFILE 64
253#define CEPH_LOCK_IAUTH 128
254#define CEPH_LOCK_ILINK 256
255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
256#define CEPH_LOCK_INEST 1024 /* mds internal */
257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
260
261/* client_session ops */
262enum {
263 CEPH_SESSION_REQUEST_OPEN,
264 CEPH_SESSION_OPEN,
265 CEPH_SESSION_REQUEST_CLOSE,
266 CEPH_SESSION_CLOSE,
267 CEPH_SESSION_REQUEST_RENEWCAPS,
268 CEPH_SESSION_RENEWCAPS,
269 CEPH_SESSION_STALE,
270 CEPH_SESSION_RECALL_STATE,
271};
272
273extern const char *ceph_session_op_name(int op);
274
275struct ceph_mds_session_head {
276 __le32 op;
277 __le64 seq;
278 struct ceph_timespec stamp;
279 __le32 max_caps, max_leases;
280} __attribute__ ((packed));
281
282/* client_request */
283/*
284 * metadata ops.
285 * & 0x001000 -> write op
286 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
287 * & 0x100000 -> use weird ino/path trace
288 */
289#define CEPH_MDS_OP_WRITE 0x001000
290enum {
291 CEPH_MDS_OP_LOOKUP = 0x00100,
292 CEPH_MDS_OP_GETATTR = 0x00101,
293 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
294 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
295
296 CEPH_MDS_OP_SETXATTR = 0x01105,
297 CEPH_MDS_OP_RMXATTR = 0x01106,
298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302
303 CEPH_MDS_OP_MKNOD = 0x01201,
304 CEPH_MDS_OP_LINK = 0x01202,
305 CEPH_MDS_OP_UNLINK = 0x01203,
306 CEPH_MDS_OP_RENAME = 0x01204,
307 CEPH_MDS_OP_MKDIR = 0x01220,
308 CEPH_MDS_OP_RMDIR = 0x01221,
309 CEPH_MDS_OP_SYMLINK = 0x01222,
310
311 CEPH_MDS_OP_CREATE = 0x01301,
312 CEPH_MDS_OP_OPEN = 0x00302,
313 CEPH_MDS_OP_READDIR = 0x00305,
314
315 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
316 CEPH_MDS_OP_MKSNAP = 0x01400,
317 CEPH_MDS_OP_RMSNAP = 0x01401,
318 CEPH_MDS_OP_LSSNAP = 0x00402,
319};
320
321extern const char *ceph_mds_op_name(int op);
322
323
324#define CEPH_SETATTR_MODE 1
325#define CEPH_SETATTR_UID 2
326#define CEPH_SETATTR_GID 4
327#define CEPH_SETATTR_MTIME 8
328#define CEPH_SETATTR_ATIME 16
329#define CEPH_SETATTR_SIZE 32
330#define CEPH_SETATTR_CTIME 64
331
332union ceph_mds_request_args {
333 struct {
334 __le32 mask; /* CEPH_CAP_* */
335 } __attribute__ ((packed)) getattr;
336 struct {
337 __le32 mode;
338 __le32 uid;
339 __le32 gid;
340 struct ceph_timespec mtime;
341 struct ceph_timespec atime;
342 __le64 size, old_size; /* old_size needed by truncate */
343 __le32 mask; /* CEPH_SETATTR_* */
344 } __attribute__ ((packed)) setattr;
345 struct {
346 __le32 frag; /* which dir fragment */
347 __le32 max_entries; /* how many dentries to grab */
348 __le32 max_bytes;
349 } __attribute__ ((packed)) readdir;
350 struct {
351 __le32 mode;
352 __le32 rdev;
353 } __attribute__ ((packed)) mknod;
354 struct {
355 __le32 mode;
356 } __attribute__ ((packed)) mkdir;
357 struct {
358 __le32 flags;
359 __le32 mode;
360 __le32 stripe_unit; /* layout for newly created file */
361 __le32 stripe_count; /* ... */
362 __le32 object_size;
363 __le32 file_replication;
364 __le32 preferred;
365 } __attribute__ ((packed)) open;
366 struct {
367 __le32 flags;
368 } __attribute__ ((packed)) setxattr;
369 struct {
370 struct ceph_file_layout layout;
371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
374 __u8 type; /* shared, exclusive, remove */
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
381} __attribute__ ((packed));
382
383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
384#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
385
386struct ceph_mds_request_head {
387 __le64 oldest_client_tid;
388 __le32 mdsmap_epoch; /* on client */
389 __le32 flags; /* CEPH_MDS_FLAG_* */
390 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
391 __le16 num_releases; /* # of included cap/lease release records */
392 __le32 op; /* mds op code */
393 __le32 caller_uid, caller_gid;
394 __le64 ino; /* use this ino for openc, mkdir, mknod,
395 etc. (if replaying) */
396 union ceph_mds_request_args args;
397} __attribute__ ((packed));
398
399/* cap/lease release record */
400struct ceph_mds_request_release {
401 __le64 ino, cap_id; /* ino and unique cap id */
402 __le32 caps, wanted; /* new issued, wanted */
403 __le32 seq, issue_seq, mseq;
404 __le32 dname_seq; /* if releasing a dentry lease, a */
405 __le32 dname_len; /* string follows. */
406} __attribute__ ((packed));
407
408/* client reply */
409struct ceph_mds_reply_head {
410 __le32 op;
411 __le32 result;
412 __le32 mdsmap_epoch;
413 __u8 safe; /* true if committed to disk */
414 __u8 is_dentry, is_target; /* true if dentry, target inode records
415 are included with reply */
416} __attribute__ ((packed));
417
418/* one for each node split */
419struct ceph_frag_tree_split {
420 __le32 frag; /* this frag splits... */
421 __le32 by; /* ...by this many bits */
422} __attribute__ ((packed));
423
424struct ceph_frag_tree_head {
425 __le32 nsplits; /* num ceph_frag_tree_split records */
426 struct ceph_frag_tree_split splits[];
427} __attribute__ ((packed));
428
429/* capability issue, for bundling with mds reply */
430struct ceph_mds_reply_cap {
431 __le32 caps, wanted; /* caps issued, wanted */
432 __le64 cap_id;
433 __le32 seq, mseq;
434 __le64 realm; /* snap realm */
435 __u8 flags; /* CEPH_CAP_FLAG_* */
436} __attribute__ ((packed));
437
438#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
439
440/* inode record, for bundling with mds reply */
441struct ceph_mds_reply_inode {
442 __le64 ino;
443 __le64 snapid;
444 __le32 rdev;
445 __le64 version; /* inode version */
446 __le64 xattr_version; /* version for xattr blob */
447 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
448 struct ceph_file_layout layout;
449 struct ceph_timespec ctime, mtime, atime;
450 __le32 time_warp_seq;
451 __le64 size, max_size, truncate_size;
452 __le32 truncate_seq;
453 __le32 mode, uid, gid;
454 __le32 nlink;
455 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
456 struct ceph_timespec rctime;
457 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
458} __attribute__ ((packed));
459/* followed by frag array, then symlink string, then xattr blob */
460
461/* reply_lease follows dname, and reply_inode */
462struct ceph_mds_reply_lease {
463 __le16 mask; /* lease type(s) */
464 __le32 duration_ms; /* lease duration */
465 __le32 seq;
466} __attribute__ ((packed));
467
468struct ceph_mds_reply_dirfrag {
469 __le32 frag; /* fragment */
470 __le32 auth; /* auth mds, if this is a delegation point */
471 __le32 ndist; /* number of mds' this is replicated on */
472 __le32 dist[];
473} __attribute__ ((packed));
474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
483 __le64 start;/* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
492/* file access modes */
493#define CEPH_FILE_MODE_PIN 0
494#define CEPH_FILE_MODE_RD 1
495#define CEPH_FILE_MODE_WR 2
496#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
497#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
498#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields.. mostly */
499
500int ceph_flags_to_mode(int flags);
501
502
503/* capability bits */
504#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
505
506/* generic cap bits */
507#define CEPH_CAP_GSHARED 1 /* client can read */
508#define CEPH_CAP_GEXCL 2 /* client can read and update */
509#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
510#define CEPH_CAP_GRD 8 /* (file) client can read */
511#define CEPH_CAP_GWR 16 /* (file) client can write */
512#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
513#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
514#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
515
516/* per-lock shift */
517#define CEPH_CAP_SAUTH 2
518#define CEPH_CAP_SLINK 4
519#define CEPH_CAP_SXATTR 6
520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
522
523#define CEPH_CAP_BITS 22
524
525/* composed values */
526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
527#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
528#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
529#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
530#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
531#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
532#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
533#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
534#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
535#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
536#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
537#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
544
545/* cap masks (for getattr) */
546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
547#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
548#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
549#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
550#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
551#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
552#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
553#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
554#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
555#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
556#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
557#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
558#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
559 CEPH_CAP_AUTH_SHARED | \
560 CEPH_CAP_LINK_SHARED | \
561 CEPH_CAP_FILE_SHARED | \
562 CEPH_CAP_XATTR_SHARED)
563
564#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
565 CEPH_CAP_LINK_SHARED | \
566 CEPH_CAP_XATTR_SHARED | \
567 CEPH_CAP_FILE_SHARED)
568#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
569 CEPH_CAP_FILE_CACHE)
570
571#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
572 CEPH_CAP_LINK_EXCL | \
573 CEPH_CAP_XATTR_EXCL | \
574 CEPH_CAP_FILE_EXCL)
575#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
576 CEPH_CAP_FILE_EXCL)
577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
581
582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
583 CEPH_LOCK_IXATTR)
584
585int ceph_caps_for_mode(int mode);
586
587enum {
588 CEPH_CAP_OP_GRANT, /* mds->client grant */
589 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
590 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
591 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
592 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
593 CEPH_CAP_OP_UPDATE, /* client->mds update */
594 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
595 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
596 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
597 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
598 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
599 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
600 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
601};
602
603extern const char *ceph_cap_op_name(int op);
604
605/*
606 * caps message, used for capability callbacks, acks, requests, etc.
607 */
608struct ceph_mds_caps {
609 __le32 op; /* CEPH_CAP_OP_* */
610 __le64 ino, realm;
611 __le64 cap_id;
612 __le32 seq, issue_seq;
613 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
614 __le32 migrate_seq;
615 __le64 snap_follows;
616 __le32 snap_trace_len;
617
618 /* authlock */
619 __le32 uid, gid, mode;
620
621 /* linklock */
622 __le32 nlink;
623
624 /* xattrlock */
625 __le32 xattr_len;
626 __le64 xattr_version;
627
628 /* filelock */
629 __le64 size, max_size, truncate_size;
630 __le32 truncate_seq;
631 struct ceph_timespec mtime, atime, ctime;
632 struct ceph_file_layout layout;
633 __le32 time_warp_seq;
634} __attribute__ ((packed));
635
636/* cap release msg head */
637struct ceph_mds_cap_release {
638 __le32 num; /* number of cap_items that follow */
639} __attribute__ ((packed));
640
641struct ceph_mds_cap_item {
642 __le64 ino;
643 __le64 cap_id;
644 __le32 migrate_seq, seq;
645} __attribute__ ((packed));
646
647#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
648#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
649#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
650#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
651
652extern const char *ceph_lease_op_name(int o);
653
654/* lease msg header */
655struct ceph_mds_lease {
656 __u8 action; /* CEPH_MDS_LEASE_* */
657 __le16 mask; /* which lease */
658 __le64 ino;
659 __le64 first, last; /* snap range */
660 __le32 seq;
661 __le32 duration_ms; /* duration of renewal */
662} __attribute__ ((packed));
663/* followed by a __le32+string for dname */
664
665/* client reconnect */
666struct ceph_mds_cap_reconnect {
667 __le64 cap_id;
668 __le32 wanted;
669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
680 __le64 size;
681 struct ceph_timespec mtime, atime;
682 __le64 snaprealm;
683 __le64 pathbase; /* base ino for our path to this ino */
684} __attribute__ ((packed));
685
686struct ceph_mds_snaprealm_reconnect {
687 __le64 ino; /* snap realm base */
688 __le64 seq; /* snap seq for this snap realm */
689 __le64 parent; /* parent realm */
690} __attribute__ ((packed));
691
692/*
693 * snaps
694 */
695enum {
696 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
697 CEPH_SNAP_OP_CREATE,
698 CEPH_SNAP_OP_DESTROY,
699 CEPH_SNAP_OP_SPLIT,
700};
701
702extern const char *ceph_snap_op_name(int o);
703
704/* snap msg header */
705struct ceph_mds_snap_head {
706 __le32 op; /* CEPH_SNAP_OP_* */
707 __le64 split; /* ino to split off, if any */
708 __le32 num_split_inos; /* # inos belonging to new child realm */
709 __le32 num_split_realms; /* # child realms under new child realm */
710 __le32 trace_len; /* size of snap trace blob */
711} __attribute__ ((packed));
712/* followed by split ino list, then split realms, then the trace blob */
713
714/*
715 * encode info about a snaprealm, as viewed by a client
716 */
717struct ceph_mds_snap_realm {
718 __le64 ino; /* ino */
719 __le64 created; /* snap: when created */
720 __le64 parent; /* ino: parent realm */
721 __le64 parent_since; /* snap: same parent since */
722 __le64 seq; /* snap: version */
723 __le32 num_snaps;
724 __le32 num_prior_parent_snaps;
725} __attribute__ ((packed));
726/* followed by my snap list, then prior parent snap list */
727
728#endif
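
Review note: the capability bit layout above composes a small set of generic bits with a per-lock shift, e.g. CEPH_CAP_FILE_WR is GWR (16) shifted by SFILE (8), i.e. 0x1000. A tiny worked example using the constants from this header:

    #include <stdio.h>

    #define GSHARED 1
    #define GWR     16
    #define SAUTH   2
    #define SFILE   8

    int main(void)
    {
            printf("AUTH_SHARED = %#x\n", GSHARED << SAUTH);  /* 0x4 */
            printf("FILE_SHARED = %#x\n", GSHARED << SFILE);  /* 0x100 */
            printf("FILE_WR     = %#x\n", GWR << SFILE);      /* 0x1000 */
            return 0;
    }
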
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
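
Review note: ceph_str_hash() dispatches between the two hash flavors above; the Linux dcache variant is short enough to exercise standalone. A quick userspace check with the function copied verbatim:

    #include <stdio.h>

    static unsigned str_hash_linux(const char *str, unsigned length)
    {
            unsigned long hash = 0;
            unsigned char c;

            while (length--) {
                    c = *str++;
                    hash = (hash + (c << 4) + (c >> 4)) * 11;
            }
            return hash;
    }

    int main(void)
    {
            printf("%u\n", str_hash_linux("foo", 3));
            return 0;
    }
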
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef FS_CEPH_HASH_H
2#define FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
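crush_calc_parents() above leans on the CRUSH convention that buckets carry negative ids while devices are non-negative, so -1-id maps bucket ids onto array slots 0..max_buckets-1. A small illustrative sketch of that mapping (bucket_index is a hypothetical helper):

/* id -1 -> slot 0, id -2 -> slot 1, ..., -max_buckets -> last */
static inline int bucket_index(int id)
{
	return -1 - id;
}

/* recording the parent of item c found inside bucket b:
 *   c >= 0 (device): device_parents[c] = b->id;
 *   c <  0 (bucket): bucket_parents[bucket_index(c)] = b->id;
 */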
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
1#ifndef CEPH_CRUSH_CRUSH_H
2#define CEPH_CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
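
Because struct crush_rule ends in a zero-length steps[] array, a rule and its steps live in one allocation sized by crush_rule_size(). A hedged sketch of building a simple take/choose/emit rule (the GFP flag and step values are assumptions, not from this header):

/* illustrative: allocate a 3-step take/choose/emit rule */
static struct crush_rule *make_simple_rule(void)
{
	struct crush_rule *rule = kmalloc(crush_rule_size(3), GFP_NOFS);

	if (!rule)
		return NULL;
	rule->len = 3;
	rule->steps[0].op = CRUSH_RULE_TAKE;		/* arg1: root bucket id */
	rule->steps[1].op = CRUSH_RULE_CHOOSE_FIRSTN;	/* arg1: n, arg2: type */
	rule->steps[2].op = CRUSH_RULE_EMIT;
	return rule;
}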
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
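
Every crush_hash32_N variant seeds its state with the XOR of the inputs and the fixed seed, then stirs with crush_hashmix(); the point is a deterministic, well-mixed 32-bit value. An illustrative call (the input values are arbitrary):

/* illustrative: the draw is deterministic and order-sensitive */
static void hash_demo(void)
{
	__u32 d1 = crush_hash32_2(CRUSH_HASH_RJENKINS1, 0x12345678, 42);
	__u32 d2 = crush_hash32_2(CRUSH_HASH_RJENKINS1, 42, 0x12345678);

	/* d1 is identical on every host and every run, so all clients
	 * compute the same placements; d1 != d2 in general because the
	 * mixing rounds are order-sensitive */
	(void)d1;
	(void)d2;
}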
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef CEPH_CRUSH_HASH_H
2#define CEPH_CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
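bucket_perm_choose() is effectively an incremental Fisher-Yates shuffle: perm[] is only finalized up to the position actually requested, and the r=0 fast path stashes a single element with perm_n set to the 0xffff sentinel. A compact sketch of just the incremental shuffle, with the CRUSH hash abstracted into a callback (all names here are hypothetical):

/* finalize positions 0..pr of a partial permutation of 0..size-1 */
static void perm_extend(__u32 *perm, __u32 *perm_n, unsigned size,
			unsigned pr, __u32 (*h)(unsigned p))
{
	while (*perm_n <= pr) {
		unsigned p = *perm_n;

		if (p < size - 1) {
			unsigned i = h(p) % (size - p);

			if (i) {	/* swap in a later element */
				__u32 t = perm[p + i];

				perm[p + i] = perm[p];
				perm[p] = t;
			}
		}
		(*perm_n)++;
	}
}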
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
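The tree bucket stores its nodes as an implicit binary tree in which odd indices are leaves and a node's height is the number of trailing zero bits in its index. A worked example for an 8-slot node array (num_nodes = 8, so the root is node 4):

/*            4            <- root, height(4) == 2
 *          /   \
 *         2     6          <- height 1
 *        / \   / \
 *       1   3 5   7        <- odd indices: leaves
 *
 * left(4) == 2, right(4) == 6, terminal(5) != 0,
 * and leaf n holds items[n >> 1], so leaves 1,3,5,7
 * carry items 0..3.
 */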
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
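
Weights here are 16.16 fixed point: 0x10000 means fully in, 0 fully out, and intermediate values keep a matching fraction of inputs, decided per-x by a 16-bit hash draw. A sketch of that probability check with assumed values:

/* illustrative: would an item with weight 0x8000 keep input x? */
static int keeps_input(int x, int item)
{
	__u32 w = 0x8000;	/* 0.5 in 16.16 fixed point */
	__u32 draw = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff;

	return draw < w;	/* true for roughly half of all x */
}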
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see the CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap t and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
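A hedged end-to-end sketch of driving the mapper, assuming a decoded crush_map and a per-device weight vector already exist (both would normally come from the osdmap code; map_object is a hypothetical wrapper):

/* illustrative: map input x to up to 3 devices */
static int map_object(struct crush_map *map, int x, __u32 *weights,
		      int *osds)
{
	/* ruleset 0, replicated type 1, want 3 replicas */
	int ruleno = crush_find_rule(map, 0, 1, 3);

	if (ruleno < 0)
		return -1;	/* no rule matches that mask */
	/* on success, osds[0..n-1] hold the chosen devices for x */
	return crush_do_rule(map, ruleno, x, osds, 3, -1, weights);
}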
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef CEPH_CRUSH_MAPPER_H
2#define CEPH_CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int ruleset, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_crypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127 return ret;
128}
129
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_crypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181 return ret;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
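
The CBC helpers above always pad: 0x10 - (len & 0x0f) is in the range 1..16, so even block-aligned input grows by a full block, and the pad byte's value doubles as the pad length that decrypt strips. A quick check of the arithmetic (cbc_dst_len is a hypothetical helper):

/* padded CBC output length for a given source length */
static size_t cbc_dst_len(size_t src_len)
{
	size_t zero_padding = 0x10 - (src_len & 0x0f);	/* always 1..16 */

	return src_len + zero_padding;	/* 13 -> 16, 16 -> 32 */
}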
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
@@ -7,143 +7,49 @@
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9 9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
10#include "super.h" 15#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14 16
15#ifdef CONFIG_DEBUG_FS 17#ifdef CONFIG_DEBUG_FS
16 18
17/* 19#include "mds_client.h"
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53 20
54static int mdsmap_show(struct seq_file *s, void *p) 21static int mdsmap_show(struct seq_file *s, void *p)
55{ 22{
56 int i; 23 int i;
57 struct ceph_client *client = s->private; 24 struct ceph_fs_client *fsc = s->private;
58 25
59 if (client->mdsc.mdsmap == NULL) 26 if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
60 return 0; 27 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); 28 seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); 29 seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n", 30 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout); 31 fsc->mdsc->mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n", 32 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose); 33 fsc->mdsc->mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { 34 for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr = 35 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr; 36 &fsc->mdsc->mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state; 37 int state = fsc->mdsc->mdsmap->m_info[i].state;
71 38
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), 39 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
40 ceph_pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state)); 41 ceph_mds_state_name(state));
74 } 42 }
75 return 0; 43 return 0;
76} 44}
77 45
78static int osdmap_show(struct seq_file *s, void *p) 46/*
79{ 47 * mdsc debugfs
80 int i; 48 */
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 __u16 op;
131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
137 }
138
139 mutex_unlock(&monc->mutex);
140 return 0;
141}
142
143static int mdsc_show(struct seq_file *s, void *p) 49static int mdsc_show(struct seq_file *s, void *p)
144{ 50{
145 struct ceph_client *client = s->private; 51 struct ceph_fs_client *fsc = s->private;
146 struct ceph_mds_client *mdsc = &client->mdsc; 52 struct ceph_mds_client *mdsc = fsc->mdsc;
147 struct ceph_mds_request *req; 53 struct ceph_mds_request *req;
148 struct rb_node *rp; 54 struct rb_node *rp;
149 int pathlen; 55 int pathlen;
@@ -154,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
154 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { 60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
155 req = rb_entry(rp, struct ceph_mds_request, r_node); 61 req = rb_entry(rp, struct ceph_mds_request, r_node);
156 62
157 if (req->r_request) 63 if (req->r_request && req->r_session)
158 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); 64 seq_printf(s, "%lld\tmds%d\t", req->r_tid,
159 else 65 req->r_session->s_mds);
66 else if (!req->r_request)
160 seq_printf(s, "%lld\t(no request)\t", req->r_tid); 67 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
68 else
69 seq_printf(s, "%lld\t(no session)\t", req->r_tid);
161 70
162 seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); 71 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
163 72
@@ -214,61 +123,12 @@ static int mdsc_show(struct seq_file *s, void *p)
214 return 0; 123 return 0;
215} 124}
216 125
217static int osdc_show(struct seq_file *s, void *pp)
218{
219 struct ceph_client *client = s->private;
220 struct ceph_osd_client *osdc = &client->osdc;
221 struct rb_node *p;
222
223 mutex_lock(&osdc->request_mutex);
224 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
225 struct ceph_osd_request *req;
226 struct ceph_osd_request_head *head;
227 struct ceph_osd_op *op;
228 int num_ops;
229 int opcode, olen;
230 int i;
231
232 req = rb_entry(p, struct ceph_osd_request, r_node);
233
234 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
235 req->r_osd ? req->r_osd->o_osd : -1,
236 le32_to_cpu(req->r_pgid.pool),
237 le16_to_cpu(req->r_pgid.ps));
238
239 head = req->r_request->front.iov_base;
240 op = (void *)(head + 1);
241
242 num_ops = le16_to_cpu(head->num_ops);
243 olen = le32_to_cpu(head->object_len);
244 seq_printf(s, "%.*s", olen,
245 (const char *)(head->ops + num_ops));
246
247 if (req->r_reassert_version.epoch)
248 seq_printf(s, "\t%u'%llu",
249 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
250 le64_to_cpu(req->r_reassert_version.version));
251 else
252 seq_printf(s, "\t");
253
254 for (i = 0; i < num_ops; i++) {
255 opcode = le16_to_cpu(op->op);
256 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
257 op++;
258 }
259
260 seq_printf(s, "\n");
261 }
262 mutex_unlock(&osdc->request_mutex);
263 return 0;
264}
265
266static int caps_show(struct seq_file *s, void *p) 126static int caps_show(struct seq_file *s, void *p)
267{ 127{
268 struct ceph_client *client = s->private; 128 struct ceph_fs_client *fsc = s->private;
269 int total, avail, used, reserved, min; 129 int total, avail, used, reserved, min;
270 130
271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 131 ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
272 seq_printf(s, "total\t\t%d\n" 132 seq_printf(s, "total\t\t%d\n"
273 "avail\t\t%d\n" 133 "avail\t\t%d\n"
274 "used\t\t%d\n" 134 "used\t\t%d\n"
@@ -280,8 +140,8 @@ static int caps_show(struct seq_file *s, void *p)
280 140
281static int dentry_lru_show(struct seq_file *s, void *ptr) 141static int dentry_lru_show(struct seq_file *s, void *ptr)
282{ 142{
283 struct ceph_client *client = s->private; 143 struct ceph_fs_client *fsc = s->private;
284 struct ceph_mds_client *mdsc = &client->mdsc; 144 struct ceph_mds_client *mdsc = fsc->mdsc;
285 struct ceph_dentry_info *di; 145 struct ceph_dentry_info *di;
286 146
287 spin_lock(&mdsc->dentry_lru_lock); 147 spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +155,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
295 return 0; 155 return 0;
296} 156}
297 157
298#define DEFINE_SHOW_FUNC(name) \ 158CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
299static int name##_open(struct inode *inode, struct file *file) \ 159CEPH_DEFINE_SHOW_FUNC(mdsc_show)
300{ \ 160CEPH_DEFINE_SHOW_FUNC(caps_show)
301 struct seq_file *sf; \ 161CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
302 int ret; \ 162
303 \
304 ret = single_open(file, name, NULL); \
305 sf = file->private_data; \
306 sf->private = inode->i_private; \
307 return ret; \
308} \
309 \
310static const struct file_operations name##_fops = { \
311 .open = name##_open, \
312 .read = seq_read, \
313 .llseek = seq_lseek, \
314 .release = single_release, \
315};
316
317DEFINE_SHOW_FUNC(monmap_show)
318DEFINE_SHOW_FUNC(mdsmap_show)
319DEFINE_SHOW_FUNC(osdmap_show)
320DEFINE_SHOW_FUNC(monc_show)
321DEFINE_SHOW_FUNC(mdsc_show)
322DEFINE_SHOW_FUNC(osdc_show)
323DEFINE_SHOW_FUNC(dentry_lru_show)
324DEFINE_SHOW_FUNC(caps_show)
325 163
164/*
165 * debugfs
166 */
326static int congestion_kb_set(void *data, u64 val) 167static int congestion_kb_set(void *data, u64 val)
327{ 168{
328 struct ceph_client *client = (struct ceph_client *)data; 169 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
329
330 if (client)
331 client->mount_args->congestion_kb = (int)val;
332 170
171 fsc->mount_options->congestion_kb = (int)val;
333 return 0; 172 return 0;
334} 173}
335 174
336static int congestion_kb_get(void *data, u64 *val) 175static int congestion_kb_get(void *data, u64 *val)
337{ 176{
338 struct ceph_client *client = (struct ceph_client *)data; 177 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
339
340 if (client)
341 *val = (u64)client->mount_args->congestion_kb;
342 178
179 *val = (u64)fsc->mount_options->congestion_kb;
343 return 0; 180 return 0;
344} 181}
345 182
346
347DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 183DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
348 congestion_kb_set, "%llu\n"); 184 congestion_kb_set, "%llu\n");
349 185
350int __init ceph_debugfs_init(void)
351{
352 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
353 if (!ceph_debugfs_dir)
354 return -ENOMEM;
355 return 0;
356}
357 186
358void ceph_debugfs_cleanup(void) 187void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
359{ 188{
360 debugfs_remove(ceph_debugfs_dir); 189 dout("ceph_fs_debugfs_cleanup\n");
190 debugfs_remove(fsc->debugfs_bdi);
191 debugfs_remove(fsc->debugfs_congestion_kb);
192 debugfs_remove(fsc->debugfs_mdsmap);
193 debugfs_remove(fsc->debugfs_caps);
194 debugfs_remove(fsc->debugfs_mdsc);
195 debugfs_remove(fsc->debugfs_dentry_lru);
361} 196}
362 197
363int ceph_debugfs_client_init(struct ceph_client *client) 198int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
364{ 199{
365 int ret = 0; 200 char name[100];
366 char name[80]; 201 int err = -ENOMEM;
367
368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
369 client->monc.auth->global_id);
370
371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
372 if (!client->debugfs_dir)
373 goto out;
374 202
375 client->monc.debugfs_file = debugfs_create_file("monc", 203 dout("ceph_fs_debugfs_init\n");
376 0600, 204 fsc->debugfs_congestion_kb =
377 client->debugfs_dir, 205 debugfs_create_file("writeback_congestion_kb",
378 client, 206 0600,
379 &monc_show_fops); 207 fsc->client->debugfs_dir,
380 if (!client->monc.debugfs_file) 208 fsc,
381 goto out; 209 &congestion_kb_fops);
382 210 if (!fsc->debugfs_congestion_kb)
383 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
384 0600,
385 client->debugfs_dir,
386 client,
387 &mdsc_show_fops);
388 if (!client->mdsc.debugfs_file)
389 goto out; 211 goto out;
390 212
391 client->osdc.debugfs_file = debugfs_create_file("osdc", 213 dout("a\n");
392 0600,
393 client->debugfs_dir,
394 client,
395 &osdc_show_fops);
396 if (!client->osdc.debugfs_file)
-		goto out;
 
-	client->debugfs_monmap = debugfs_create_file("monmap",
-						     0600,
-						     client->debugfs_dir,
-						     client,
-						     &monmap_show_fops);
-	if (!client->debugfs_monmap)
+	snprintf(name, sizeof(name), "../../bdi/%s",
+		 dev_name(fsc->backing_dev_info.dev));
+	fsc->debugfs_bdi =
+		debugfs_create_symlink("bdi",
+				       fsc->client->debugfs_dir,
+				       name);
+	if (!fsc->debugfs_bdi)
 		goto out;
 
-	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+	dout("b\n");
+	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
-					client->debugfs_dir,
-					client,
+					fsc->client->debugfs_dir,
+					fsc,
 					&mdsmap_show_fops);
-	if (!client->debugfs_mdsmap)
+	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
+	dout("ca\n");
+	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&mdsc_show_fops);
+	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0600,
-					client->debugfs_dir,
-					client,
-					&dentry_lru_show_fops);
-	if (!client->debugfs_dentry_lru)
-		goto out;
-
-	client->debugfs_caps = debugfs_create_file("caps",
+	dout("da\n");
+	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
-						   client->debugfs_dir,
-						   client,
+						   fsc->client->debugfs_dir,
+						   fsc,
 						   &caps_show_fops);
-	if (!client->debugfs_caps)
+	if (!fsc->debugfs_caps)
 		goto out;
 
-	client->debugfs_congestion_kb =
-		debugfs_create_file("writeback_congestion_kb",
+	dout("ea\n");
+	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
 					0600,
-					client->debugfs_dir,
-					client,
-					&congestion_kb_fops);
-	if (!client->debugfs_congestion_kb)
+					fsc->client->debugfs_dir,
+					fsc,
+					&dentry_lru_show_fops);
+	if (!fsc->debugfs_dentry_lru)
 		goto out;
 
-	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-						     name);
-
 	return 0;
 
 out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
+	ceph_fs_debugfs_cleanup(fsc);
+	return err;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-	debugfs_remove(client->debugfs_bdi);
-	debugfs_remove(client->debugfs_caps);
-	debugfs_remove(client->debugfs_dentry_lru);
-	debugfs_remove(client->debugfs_osdmap);
-	debugfs_remove(client->debugfs_mdsmap);
-	debugfs_remove(client->debugfs_monmap);
-	debugfs_remove(client->osdc.debugfs_file);
-	debugfs_remove(client->mdsc.debugfs_file);
-	debugfs_remove(client->monc.debugfs_file);
-	debugfs_remove(client->debugfs_congestion_kb);
-	debugfs_remove(client->debugfs_dir);
-}
 
 #else  /* CONFIG_DEBUG_FS */
 
-int __init ceph_debugfs_init(void)
-{
-	return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	return 0;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 }
 
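
With the libceph split, the monmap/osdmap dumps and the mon/osd/mds client
entries move to the shared ceph_client, so ceph_fs_debugfs_init() above
registers only the filesystem-level files under fsc->client->debugfs_dir.
A sketch of the matching teardown, assuming the standard debugfs behavior
that debugfs_remove() is a no-op on a NULL dentry (so an init that failed
part-way can be torn down unconditionally):

	void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
	{
		/* safe even after partial init: debugfs_remove()
		 * silently ignores NULL dentries */
		debugfs_remove(fsc->debugfs_bdi);
		debugfs_remove(fsc->debugfs_congestion_kb);
		debugfs_remove(fsc->debugfs_mdsmap);
		debugfs_remove(fsc->debugfs_mdsc);
		debugfs_remove(fsc->debugfs_caps);
		debugfs_remove(fsc->debugfs_dentry_lru);
	}
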
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 3d25415afe63..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-#include <linux/time.h>
-
-#include "types.h"
-
-/*
- * in all cases,
- *   void **p     pointer to position pointer
- *   void *end    pointer to end of buffer (last byte + 1)
- */
-
-static inline u64 ceph_decode_64(void **p)
-{
-	u64 v = get_unaligned_le64(*p);
-	*p += sizeof(u64);
-	return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
-	u32 v = get_unaligned_le32(*p);
-	*p += sizeof(u32);
-	return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
-	u16 v = get_unaligned_le16(*p);
-	*p += sizeof(u16);
-	return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
-	u8 v = *(u8 *)*p;
-	(*p)++;
-	return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
-	memcpy(pv, *p, n);
-	*p += n;
-}
-
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end)))	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_decode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u64), bad);	\
-		v = ceph_decode_64(p);				\
-	} while (0)
-#define ceph_decode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u32), bad);	\
-		v = ceph_decode_32(p);				\
-	} while (0)
-#define ceph_decode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u16), bad);	\
-		v = ceph_decode_16(p);				\
-	} while (0)
-#define ceph_decode_8_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u8), bad);	\
-		v = ceph_decode_8(p);				\
-	} while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad)	\
-	do {						\
-		ceph_decode_need(p, end, n, bad);	\
-		ceph_decode_copy(p, pv, n);		\
-	} while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
-					const struct ceph_timespec *tv)
-{
-	ts->tv_sec = le32_to_cpu(tv->tv_sec);
-	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-					const struct timespec *ts)
-{
-	tv->tv_sec = cpu_to_le32(ts->tv_sec);
-	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
-	__be16 ss_family = htons(a->in_addr.ss_family);
-	a->in_addr.ss_family = *(__u16 *)&ss_family;
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
-	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
-	a->in_addr.ss_family = ntohs(ss_family);
-	WARN_ON(a->in_addr.ss_family == 512);
-}
-
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
-	put_unaligned_le64(v, (__le64 *)*p);
-	*p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
-	put_unaligned_le32(v, (__le32 *)*p);
-	*p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
-	put_unaligned_le16(v, (__le16 *)*p);
-	*p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
-	*(u8 *)*p = v;
-	(*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
-	memcpy(*p, s, len);
-	*p += len;
-}
-
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-					u64 ino, const char *path)
-{
-	u32 len = path ? strlen(path) : 0;
-	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-	ceph_encode_8(p, 1);
-	ceph_encode_64(p, ino);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, path, len);
-	*p += len;
-}
-
-static inline void ceph_encode_string(void **p, void *end,
-				      const char *s, u32 len)
-{
-	BUG_ON(*p + sizeof(len) + len > end);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, s, len);
-	*p += len;
-}
-
-#define ceph_encode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end)))	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_encode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u64), bad);	\
-		ceph_encode_64(p, v);				\
-	} while (0)
-#define ceph_encode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u32), bad);	\
-		ceph_encode_32(p, v);				\
-	} while (0)
-#define ceph_encode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u16), bad);	\
-		ceph_encode_16(p, v);				\
-	} while (0)
-
-#define ceph_encode_copy_safe(p, end, pv, n, bad)	\
-	do {						\
-		ceph_encode_need(p, end, n, bad);	\
-		ceph_encode_copy(p, pv, n);		\
-	} while (0)
-
-
-#endif
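
The header above is removed because these helpers move, essentially
verbatim, to include/linux/ceph/decode.h; callers keep the same
cursor-and-bounds pattern. A minimal sketch of a caller, assuming a
hypothetical wire format of a u32 count followed by that many u64 values:

	static int parse_u64s(void *buf, size_t buflen, u64 *out, u32 max)
	{
		void *p = buf;
		void *end = buf + buflen;	/* last byte + 1 */
		u32 i, n;

		ceph_decode_32_safe(&p, end, n, bad);	/* bounds-checked */
		if (n > max)
			goto bad;
		ceph_decode_need(&p, end, n * sizeof(u64), bad);
		for (i = 0; i < n; i++)
			out[i] = ceph_decode_64(&p);	/* checked above */
		return 0;

	bad:
		return -ERANGE;		/* would run past end of buffer */
	}
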
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * Directory operations: readdir, lookup, create, link, unlink,
@@ -39,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
 	if (dentry->d_fsdata)
 		return 0;
 
-	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		dentry->d_op = &ceph_dentry_ops;
+	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
+	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		d_set_d_op(dentry, &ceph_dentry_ops);
 	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		dentry->d_op = &ceph_snapdir_dentry_ops;
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
 	else
-		dentry->d_op = &ceph_snap_dentry_ops;
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
 
 	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
@@ -94,10 +96,7 @@ static unsigned fpos_off(loff_t p)
  */
 static int __dcache_readdir(struct file *filp,
 			    void *dirent, filldir_t filldir)
-	__releases(inode->i_lock)
-	__acquires(inode->i_lock)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_file_info *fi = filp->private_data;
 	struct dentry *parent = filp->f_dentry;
 	struct inode *dir = parent->d_inode;
@@ -113,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
 	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
 	     last);
 
-	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
-	if (filp->f_pos == 2 || (last &&
-				 filp->f_pos < ceph_dentry(last)->offset)) {
+	if (filp->f_pos == 2 || last == NULL ||
+	    filp->f_pos < ceph_dentry(last)->offset) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
@@ -137,6 +136,7 @@ more:
 			fi->at_end = 1;
 			goto out_unlock;
 		}
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -146,14 +146,15 @@ more:
 		     dentry->d_name.len, dentry->d_name.name, di->offset,
 		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
 		     !dentry->d_inode ? " null" : "");
+		spin_unlock(&dentry->d_lock);
 		p = p->prev;
 		dentry = list_entry(p, struct dentry, d_u.d_child);
 		di = ceph_dentry(dentry);
 	}
 
-	atomic_inc(&dentry->d_count);
-	spin_unlock(&dcache_lock);
-	spin_unlock(&inode->i_lock);
+	dget_dlock(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +172,30 @@ more:
 		} else {
 			dput(last);
 		}
-		last = NULL;
 	}
-
-	spin_lock(&inode->i_lock);
-	spin_lock(&dcache_lock);
-
 	last = dentry;
 
 	if (err < 0)
-		goto out_unlock;
+		goto out;
 
-	p = p->prev;
 	filp->f_pos++;
 
-	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-		goto more;
-	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-	err = -EAGAIN;
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
+	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+		err = -EAGAIN;
+		goto out;
+	}
 
-out_unlock:
-	spin_unlock(&dcache_lock);
+	spin_lock(&parent->d_lock);
+	p = p->prev;	/* advance to next dentry */
+	goto more;
 
-	if (last) {
-		spin_unlock(&inode->i_lock);
+out_unlock:
+	spin_unlock(&parent->d_lock);
+out:
+	if (last)
 		dput(last);
-		spin_lock(&inode->i_lock);
-	}
-
 	return err;
 }
 
@@ -227,15 +223,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct ceph_file_info *fi = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned frag = fpos_frag(filp->f_pos);
 	int off = fpos_off(filp->f_pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = client->mount_args->max_readdir;
-	const int max_bytes = client->mount_args->max_readdir_bytes;
+	const int max_entries = fsc->mount_options->max_readdir;
+	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -267,17 +263,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* can we use the dcache? */
 	spin_lock(&inode->i_lock);
 	if ((filp->f_pos == 2 || fi->dentry) &&
-	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		spin_unlock(&inode->i_lock);
 		err = __dcache_readdir(filp, dirent, filldir);
-		if (err != -EAGAIN) {
-			spin_unlock(&inode->i_lock);
+		if (err != -EAGAIN)
 			return err;
-		}
+	} else {
+		spin_unlock(&inode->i_lock);
 	}
-	spin_unlock(&inode->i_lock);
 	if (fi->dentry) {
 		err = note_last_dentry(fi, fi->dentry->d_name.name,
 				       fi->dentry->d_name.len);
@@ -344,7 +340,10 @@ more:
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
-			fi->next_offset = 2;
+			if (ceph_frag_is_rightmost(frag))
+				fi->next_offset = 2;
+			else
+				fi->next_offset = 0;
 		} else {
 			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,
@@ -363,18 +362,22 @@ more:
 		u64 pos = ceph_make_fpos(frag, off);
 		struct ceph_mds_reply_inode *in =
 			rinfo->dir_in[off - fi->offset].in;
+		struct ceph_vino vino;
+		ino_t ino;
+
 		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
 		     off, off - fi->offset, rinfo->dir_nr, pos,
 		     rinfo->dir_dname_len[off - fi->offset],
 		     rinfo->dir_dname[off - fi->offset], in);
 		BUG_ON(!in);
 		ftype = le32_to_cpu(in->mode) >> 12;
+		vino.ino = le64_to_cpu(in->ino);
+		vino.snap = le64_to_cpu(in->snapid);
+		ino = ceph_vino_to_ino(vino);
 		if (filldir(dirent,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos,
-			    le64_to_cpu(in->ino),
-			    ftype) < 0) {
+			    pos, ino, ftype) < 0) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -422,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
 		fi->last_readdir = NULL;
 	}
 	kfree(fi->last_name);
+	fi->last_name = NULL;
 	fi->next_offset = 2;  /* compensate for . and .. */
 	if (fi->dentry) {
 		dput(fi->dentry);
@@ -487,14 +491,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
-	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
 	    strcmp(dentry->d_name.name,
-		   client->mount_args->snapdir_name) == 0) {
+		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +542,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int op;
 	int err;
@@ -572,7 +575,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	spin_lock(&dir->i_lock);
 	dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
 	if (strncmp(dentry->d_name.name,
-		    client->mount_args->snapdir_name,
+		    fsc->mount_options->snapdir_name,
 		    dentry->d_name.len) &&
 	    !is_root_ceph_dentry(dir, dentry) &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +632,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 		      int mode, dev_t rdev)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -685,8 +688,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 			const char *dest)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -716,8 +719,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
 	int op;
@@ -758,8 +761,8 @@ out:
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 		     struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -813,8 +816,8 @@ static int drop_caps_for_unlink(struct inode *inode)
  */
 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
@@ -854,8 +857,8 @@ out:
 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -987,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
  */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *dir;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	dir = dentry->d_parent->d_inode;
 
 	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
 	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
@@ -1076,7 +1084,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1177,7 +1185,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1193,7 +1201,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
 	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1216,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
@@ -1216,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	}
 }
 
+/*
+ * Return name hash for a given dentry.  This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct dentry *dn)
+{
+	struct inode *dir = dn->d_parent->d_inode;
+	struct ceph_inode_info *dci = ceph_inode(dir);
+
+	switch (dci->i_dir_layout.dl_dir_hash) {
+	case 0:	/* for backward compat */
+	case CEPH_STR_HASH_LINUX:
+		return dn->d_name.hash;
+
+	default:
+		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+				     dn->d_name.name, dn->d_name.len);
+	}
+}
+
 const struct file_operations ceph_dir_fops = {
 	.read = ceph_read_dir,
 	.readdir = ceph_readdir,
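
A note on the readdir hunks above: f_pos on a ceph directory packs the
fragment in the high 32 bits and the offset within that fragment in the
low 32 bits, which is why fi->next_offset is reset to 2 (to account for
. and ..) only on the rightmost fragment. A sketch of the encoding
helpers these hunks rely on, assuming they keep their usual fs/ceph
definitions:

	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
	{
		return ((loff_t)frag << 32) | (loff_t)off;
	}

	static unsigned fpos_frag(loff_t p)
	{
		return p >> 32;		/* which directory fragment */
	}

	static unsigned fpos_off(loff_t p)
	{
		return p & 0xffffffff;	/* position within the fragment */
	}
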
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2e..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * NFS export support
@@ -58,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = parent->d_name.hash;
+		cfh->parent_name_hash = ceph_dentry_hash(parent);
 		*max_len = connected_handle_length;
 		type = 2;
 	} else if (*max_len >= handle_length) {
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
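
The encode_fh change matters because the parent name hash baked into a
connectable handle must be reproducible across reboots and clients:
parent->d_name.hash is an in-memory dcache value, while ceph_dentry_hash()
(added in dir.c above) derives the hash from the directory's advertised
hash function. For reference, the connectable handle layout is roughly as
follows (a sketch; the authoritative struct lives in export.c):

	/* connectable file handle: enough to find the inode and,
	 * failing that, to look it up by (parent_ino, name hash) */
	struct ceph_nfs_confh {
		u64 ino, parent_ino;
		u32 parent_name_hash;
	} __attribute__ ((packed));
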
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -38,8 +39,8 @@
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int want_auth = USE_ANY_MDS;
 	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 int ceph_open(struct inode *inode, struct file *file)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -153,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
 	}
 
 	/*
-	 * No need to block if we have any caps.  Update wanted set
+	 * No need to block if we have caps on the auth MDS (for
+	 * write) or any MDS (for read).  Update wanted set
 	 * asynchronously.
 	 */
 	spin_lock(&inode->i_lock);
-	if (__ceph_is_any_real_caps(ci)) {
+	if (__ceph_is_any_real_caps(ci) &&
+	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
 		int mds_wanted = __ceph_caps_mds_wanted(ci);
 		int issued = __ceph_caps_issued(ci, NULL);
 
@@ -216,8 +219,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 			     struct nameidata *nd, int mode,
 			     int locked_dir)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct file *file = nd->intent.open.file;
 	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
 	struct ceph_mds_request *req;
@@ -270,163 +273,6 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 
 /*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-					    int num_pages,
-					    loff_t off, size_t len)
-{
-	struct page **pages;
-	int rc;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, 0, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	kfree(pages);
-	return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		put_page(pages[i]);
-	kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		__free_pages(pages[i], 0);
-	kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-	struct page **pages;
-	int i;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, flags);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < num_pages; i++) {
-		pages[i] = __page_cache_alloc(flags);
-		if (pages[i] == NULL) {
-			ceph_release_page_vector(pages, i);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-				    const char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, left, PAGE_CACHE_SIZE-po);
-		bad = copy_to_user(data, page_address(pages[i]) + po, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		if (po) {
-			po += l - bad;
-			if (po == PAGE_CACHE_SIZE)
-				po = 0;
-		}
-		i++;
-	}
-	return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-	int i = off >> PAGE_CACHE_SHIFT;
-
-	off &= ~PAGE_CACHE_MASK;
-
-	dout("zero_page_vector_page %u~%u\n", off, len);
-
-	/* leading partial page? */
-	if (off) {
-		int end = min((int)PAGE_CACHE_SIZE, off + len);
-		dout("zeroing %d %p head from %d\n", i, pages[i],
-		     (int)off);
-		zero_user_segment(pages[i], off, end);
-		len -= (end - off);
-		i++;
-	}
-	while (len >= PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p len=%d\n", i, pages[i], len);
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		i++;
-	}
-	/* trailing partial page? */
-	if (len) {
-		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-		zero_user_segment(pages[i], 0, len);
-	}
-}
-
-
-/*
  * Read a range of bytes striped over one or more objects.  Iterate over
  * objects we stripe over.  (That's not atomic, but good enough for now.)
  *
@@ -436,11 +282,13 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
 static int striped_read(struct inode *inode,
 			u64 off, u64 len,
 			struct page **pages, int num_pages,
-			int *checkeof)
+			int *checkeof, bool align_to_pages,
+			unsigned long buf_align)
 {
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len;
+	int io_align, page_align;
 	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
 	int left, pages_left;
 	int read;
@@ -456,14 +304,19 @@ static int striped_read(struct inode *inode,
 	page_pos = pages;
 	pages_left = num_pages;
 	read = 0;
+	io_align = off & ~PAGE_MASK;
 
 more:
+	if (align_to_pages)
+		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+	else
+		page_align = pos & ~PAGE_MASK;
 	this_len = left;
-	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
 				  &ci->i_layout, pos, &this_len,
 				  ci->i_truncate_seq,
 				  ci->i_truncate_size,
-				  page_pos, pages_left);
+				  page_pos, pages_left, page_align);
 	hit_stripe = this_len < left;
 	was_short = ret >= 0 && ret < this_len;
 	if (ret == -ENOENT)
@@ -477,8 +330,8 @@ more:
 
 		if (read < pos - off) {
 			dout(" zero gap %llu to %llu\n", off + read, pos);
-			zero_page_vector_range(page_off + read,
-					       pos - off - read, pages);
+			ceph_zero_page_vector_range(page_off + read,
+						    pos - off - read, pages);
 		}
 		pos += ret;
 		read = pos - off;
@@ -495,8 +348,8 @@ more:
 		/* was original extent fully inside i_size? */
 		if (pos + left <= inode->i_size) {
 			dout("zero tail\n");
-			zero_page_vector_range(page_off + read, len - read,
-					       pages);
+			ceph_zero_page_vector_range(page_off + read, len - read,
						    pages);
 			read = len;
 			goto out;
 		}
@@ -524,41 +377,43 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	struct inode *inode = file->f_dentry->d_inode;
 	struct page **pages;
 	u64 off = *poff;
-	int num_pages = calc_pages_for(off, len);
-	int ret;
+	int num_pages, ret;
 
 	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, off, len);
-
-		/*
-		 * flush any page cache pages in this range.  this
-		 * will make concurrent normal and O_DIRECT io slow,
-		 * but it will at least behave sensibly when they are
-		 * in sequence.
-		 */
+		num_pages = calc_pages_for((unsigned long)data, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, true);
 	} else {
+		num_pages = calc_pages_for(off, len);
 		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 	}
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	/*
+	 * flush any page cache pages in this range.  this
+	 * will make concurrent normal and sync io slow,
+	 * but it will at least behave sensibly when they are
+	 * in sequence.
+	 */
 	ret = filemap_write_and_wait(inode->i_mapping);
 	if (ret < 0)
 		goto done;
 
-	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+	ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+			   file->f_flags & O_DIRECT,
+			   (unsigned long)data & ~PAGE_MASK);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = copy_page_vector_to_user(pages, data, off, ret);
+		ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
 	if (ret >= 0)
 		*poff = off + ret;
 
done:
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages, true);
 	else
 		ceph_release_page_vector(pages, num_pages);
 	dout("sync_read result %d\n", ret);
@@ -594,7 +449,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_osd_request *req;
 	struct page **pages;
 	int num_pages;
@@ -604,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	int flags;
 	int do_sync = 0;
 	int check_caps = 0;
+	int page_align, io_align;
+	unsigned long buf_align;
 	int ret;
 	struct timespec mtime = CURRENT_TIME;
 
@@ -618,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	else
 		pos = *offset;
 
+	io_align = pos & ~PAGE_MASK;
+	buf_align = (unsigned long)data & ~PAGE_MASK;
+
 	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
 	if (ret < 0)
 		return ret;
@@ -642,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	 */
more:
 	len = left;
-	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+	if (file->f_flags & O_DIRECT) {
+		/* write from beginning of first page, regardless of
+		   io alignment */
+		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+		num_pages = calc_pages_for((unsigned long)data, len);
+	} else {
+		page_align = pos & ~PAGE_MASK;
+		num_pages = calc_pages_for(pos, len);
+	}
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 				    ceph_vino(inode), pos, &len,
 				    CEPH_OSD_OP_WRITE, flags,
 				    ci->i_snap_realm->cached_context,
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2);
+				    &mtime, false, 2, page_align);
 	if (!req)
 		return -ENOMEM;
 
-	num_pages = calc_pages_for(pos, len);
-
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, pos, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, false);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -673,7 +540,7 @@ more:
 			ret = PTR_ERR(pages);
 			goto out;
 		}
-		ret = copy_user_to_page_vector(pages, data, pos, len);
+		ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
 		if (ret < 0) {
 			ceph_release_page_vector(pages, num_pages);
 			goto out;
@@ -689,7 +556,7 @@ more:
 	req->r_num_pages = num_pages;
 	req->r_inode = inode;
 
-	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		if (req->r_safe_callback) {
 			/*
@@ -701,11 +568,11 @@ more:
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
-		ret = ceph_osdc_wait_request(&client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	}
 
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages, false);
 	else if (file->f_flags & O_SYNC)
 		ceph_release_page_vector(pages, num_pages);
 
@@ -814,7 +681,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int want, got = 0;
 	int ret, err;
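
The alignment plumbing added above is the subtle part of this patch: the
OSD layer now needs to know where within the first page the payload
starts. For buffered I/O that is simply pos & ~PAGE_MASK, but for
O_DIRECT the pages are pinned straight out of the user buffer, so the
in-page offset follows the buffer address rather than the file position.
A worked example, assuming 4 KiB pages (so ~PAGE_MASK == 0xfff):

	/* O_DIRECT read/write at file pos 0x1234 from a user buffer
	 * whose address ends in 0x678:
	 *
	 *   io_align  = pos  & ~PAGE_MASK = 0x234  (in-page offset of pos)
	 *   buf_align = data & ~PAGE_MASK = 0x678  (in-page offset of buf)
	 *
	 *   page_align = (pos - io_align + buf_align) & ~PAGE_MASK
	 *              = (0x1234 - 0x234 + 0x678) & 0xfff = 0x678
	 *
	 * i.e. the payload starts 0x678 bytes into the first pinned page.
	 * As a striped request advances pos to later objects, the same
	 * formula keeps tracking the user buffer's alignment. */
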
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..5625463aa479 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,8 +1,7 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/uaccess.h> 7#include <linux/uaccess.h>
@@ -13,7 +12,8 @@
13#include <linux/pagevec.h> 12#include <linux/pagevec.h>
14 13
15#include "super.h" 14#include "super.h"
16#include "decode.h" 15#include "mds_client.h"
16#include <linux/ceph/decode.h>
17 17
18/* 18/*
19 * Ceph inode operations 19 * Ceph inode operations
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
297 ci->i_release_count = 0; 297 ci->i_release_count = 0;
298 ci->i_symlink = NULL; 298 ci->i_symlink = NULL;
299 299
300 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
301
300 ci->i_fragtree = RB_ROOT; 302 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex); 303 mutex_init(&ci->i_fragtree_mutex);
302 304
@@ -368,6 +370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
368 return &ci->vfs_inode; 370 return &ci->vfs_inode;
369} 371}
370 372
373static void ceph_i_callback(struct rcu_head *head)
374{
375 struct inode *inode = container_of(head, struct inode, i_rcu);
376 struct ceph_inode_info *ci = ceph_inode(inode);
377
378 INIT_LIST_HEAD(&inode->i_dentry);
379 kmem_cache_free(ceph_inode_cachep, ci);
380}
381
371void ceph_destroy_inode(struct inode *inode) 382void ceph_destroy_inode(struct inode *inode)
372{ 383{
373 struct ceph_inode_info *ci = ceph_inode(inode); 384 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -384,7 +395,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 395 */
385 if (ci->i_snap_realm) { 396 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 397 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 398 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 399 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 400
390 dout(" dropping residual ref to snap realm %p\n", realm); 401 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -407,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode)
407 if (ci->i_xattrs.prealloc_blob) 418 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 419 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409 420
410 kmem_cache_free(ceph_inode_cachep, ci); 421 call_rcu(&inode->i_rcu, ceph_i_callback);
411} 422}
412 423
413 424
@@ -470,7 +481,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
470 481
471 if (issued & (CEPH_CAP_FILE_EXCL| 482 if (issued & (CEPH_CAP_FILE_EXCL|
472 CEPH_CAP_FILE_WR| 483 CEPH_CAP_FILE_WR|
473 CEPH_CAP_FILE_BUFFER)) { 484 CEPH_CAP_FILE_BUFFER|
485 CEPH_CAP_AUTH_EXCL|
486 CEPH_CAP_XATTR_EXCL)) {
474 if (timespec_compare(ctime, &inode->i_ctime) > 0) { 487 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
475 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", 488 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
476 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 489 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -510,7 +523,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
510 warn = 1; 523 warn = 1;
511 } 524 }
512 } else { 525 } else {
513 /* we have no write caps; whatever the MDS says is true */ 526 /* we have no write|excl caps; whatever the MDS says is true */
514 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 527 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
515 inode->i_ctime = *ctime; 528 inode->i_ctime = *ctime;
516 inode->i_mtime = *mtime; 529 inode->i_mtime = *mtime;
@@ -566,12 +579,17 @@ static int fill_inode(struct inode *inode,
566 579
567 /* 580 /*
568 * provided version will be odd if inode value is projected, 581 * provided version will be odd if inode value is projected,
569 * even if stable. skip the update if we have a newer info 582 * even if stable. skip the update if we have newer stable
570 * (e.g., due to inode info racing form multiple MDSs), or if 583 * info (ours>=theirs, e.g. due to racing mds replies), unless
571 * we are getting projected (unstable) inode info. 584 * we are getting projected (unstable) info (in which case the
585 * version is odd, and we want ours>theirs).
586 * us them
587 * 2 2 skip
588 * 3 2 skip
589 * 3 3 update
572 */ 590 */
573 if (le64_to_cpu(info->version) > 0 && 591 if (le64_to_cpu(info->version) > 0 &&
574 (ci->i_version & ~1) > le64_to_cpu(info->version)) 592 (ci->i_version & ~1) >= le64_to_cpu(info->version))
575 goto no_change; 593 goto no_change;
576 594
577 issued = __ceph_caps_issued(ci, &implemented); 595 issued = __ceph_caps_issued(ci, &implemented);
@@ -605,7 +623,14 @@ static int fill_inode(struct inode *inode,
605 le32_to_cpu(info->time_warp_seq), 623 le32_to_cpu(info->time_warp_seq),
606 &ctime, &mtime, &atime); 624 &ctime, &mtime, &atime);
607 625
608 ci->i_max_size = le64_to_cpu(info->max_size); 626 /* only update max_size on auth cap */
627 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
628 ci->i_max_size != le64_to_cpu(info->max_size)) {
629 dout("max_size %lld -> %llu\n", ci->i_max_size,
630 le64_to_cpu(info->max_size));
631 ci->i_max_size = le64_to_cpu(info->max_size);
632 }
633
609 ci->i_layout = info->layout; 634 ci->i_layout = info->layout;
610 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 635 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
611 636
@@ -666,6 +691,8 @@ static int fill_inode(struct inode *inode,
666 inode->i_op = &ceph_dir_iops; 691 inode->i_op = &ceph_dir_iops;
667 inode->i_fop = &ceph_dir_fops; 692 inode->i_fop = &ceph_dir_fops;
668 693
694 ci->i_dir_layout = iinfo->dir_layout;
695
669 ci->i_files = le64_to_cpu(info->files); 696 ci->i_files = le64_to_cpu(info->files);
670 ci->i_subdirs = le64_to_cpu(info->subdirs); 697 ci->i_subdirs = le64_to_cpu(info->subdirs);
671 ci->i_rbytes = le64_to_cpu(info->rbytes); 698 ci->i_rbytes = le64_to_cpu(info->rbytes);
@@ -683,10 +710,6 @@ static int fill_inode(struct inode *inode,
683 ci->i_ceph_flags |= CEPH_I_COMPLETE; 710 ci->i_ceph_flags |= CEPH_I_COMPLETE;
684 ci->i_max_offset = 2; 711 ci->i_max_offset = 2;
685 } 712 }
686
687 /* it may be better to set st_size in getattr instead? */
688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
689 inode->i_size = ci->i_rbytes;
690 break; 713 break;
691 default: 714 default:
692 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", 715 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -827,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
827 di->offset = ceph_inode(inode)->i_max_offset++; 850 di->offset = ceph_inode(inode)->i_max_offset++;
828 spin_unlock(&inode->i_lock); 851 spin_unlock(&inode->i_lock);
829 852
830 spin_lock(&dcache_lock); 853 spin_lock(&dir->d_lock);
831 spin_lock(&dn->d_lock); 854 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
832 list_move(&dn->d_u.d_child, &dir->d_subdirs); 855 list_move(&dn->d_u.d_child, &dir->d_subdirs);
833 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, 856 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
834 dn->d_u.d_child.prev, dn->d_u.d_child.next); 857 dn->d_u.d_child.prev, dn->d_u.d_child.next);
835 spin_unlock(&dn->d_lock); 858 spin_unlock(&dn->d_lock);
836 spin_unlock(&dcache_lock); 859 spin_unlock(&dir->d_lock);
837} 860}
838 861
839/* 862/*
@@ -865,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
865 } else if (realdn) { 888 } else if (realdn) {
866 dout("dn %p (%d) spliced with %p (%d) " 889 dout("dn %p (%d) spliced with %p (%d) "
867 "inode %p ino %llx.%llx\n", 890 "inode %p ino %llx.%llx\n",
868 dn, atomic_read(&dn->d_count), 891 dn, dn->d_count,
869 realdn, atomic_read(&realdn->d_count), 892 realdn, realdn->d_count,
870 realdn->d_inode, ceph_vinop(realdn->d_inode)); 893 realdn->d_inode, ceph_vinop(realdn->d_inode));
871 dput(dn); 894 dput(dn);
872 dn = realdn; 895 dn = realdn;
@@ -901,7 +924,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
901 struct inode *in = NULL; 924 struct inode *in = NULL;
902 struct ceph_mds_reply_inode *ininfo; 925 struct ceph_mds_reply_inode *ininfo;
903 struct ceph_vino vino; 926 struct ceph_vino vino;
904 struct ceph_client *client = ceph_sb_to_client(sb); 927 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
905 int i = 0; 928 int i = 0;
906 int err = 0; 929 int err = 0;
907 930
@@ -965,7 +988,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
965 */ 988 */
966 if (rinfo->head->is_dentry && !req->r_aborted && 989 if (rinfo->head->is_dentry && !req->r_aborted &&
967 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 990 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
968 client->mount_args->snapdir_name, 991 fsc->mount_options->snapdir_name,
969 req->r_dentry->d_name.len))) { 992 req->r_dentry->d_name.len))) {
970 /* 993 /*
971 * lookup link rename : null -> possibly existing inode 994 * lookup link rename : null -> possibly existing inode
@@ -1054,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1054 ininfo = rinfo->targeti.in; 1077 ininfo = rinfo->targeti.in;
1055 vino.ino = le64_to_cpu(ininfo->ino); 1078 vino.ino = le64_to_cpu(ininfo->ino);
1056 vino.snap = le64_to_cpu(ininfo->snapid); 1079 vino.snap = le64_to_cpu(ininfo->snapid);
1057 if (!dn->d_inode) { 1080 in = dn->d_inode;
1081 if (!in) {
1058 in = ceph_get_inode(sb, vino); 1082 in = ceph_get_inode(sb, vino);
1059 if (IS_ERR(in)) { 1083 if (IS_ERR(in)) {
1060 pr_err("fill_trace bad get_inode " 1084 pr_err("fill_trace bad get_inode "
@@ -1216,11 +1240,11 @@ retry_lookup:
1216 goto retry_lookup; 1240 goto retry_lookup;
1217 } else { 1241 } else {
1218 /* reorder parent's d_subdirs */ 1242 /* reorder parent's d_subdirs */
1219 spin_lock(&dcache_lock); 1243 spin_lock(&parent->d_lock);
1220 spin_lock(&dn->d_lock); 1244 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
1221 list_move(&dn->d_u.d_child, &parent->d_subdirs); 1245 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1222 spin_unlock(&dn->d_lock); 1246 spin_unlock(&dn->d_lock);
1223 spin_unlock(&dcache_lock); 1247 spin_unlock(&parent->d_lock);
1224 } 1248 }
1225 1249
1226 di = dn->d_fsdata; 1250 di = dn->d_fsdata;
@@ -1385,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1385 spin_lock(&inode->i_lock); 1409 spin_lock(&inode->i_lock);
1386 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1410 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1387 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1411 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1388 if (ci->i_rdcache_gen == 0 || 1412 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1389 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1390 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1391 /* nevermind! */ 1413 /* nevermind! */
1392 ci->i_rdcache_revoking = 0;
1393 spin_unlock(&inode->i_lock); 1414 spin_unlock(&inode->i_lock);
1394 goto out; 1415 goto out;
1395 } 1416 }
@@ -1399,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
1399 ceph_invalidate_nondirty_pages(inode->i_mapping); 1420 ceph_invalidate_nondirty_pages(inode->i_mapping);
1400 1421
1401 spin_lock(&inode->i_lock); 1422 spin_lock(&inode->i_lock);
1402 if (orig_gen == ci->i_rdcache_gen) { 1423 if (orig_gen == ci->i_rdcache_gen &&
1424 orig_gen == ci->i_rdcache_revoking) {
1403 dout("invalidate_pages %p gen %d successful\n", inode, 1425 dout("invalidate_pages %p gen %d successful\n", inode,
1404 ci->i_rdcache_gen); 1426 ci->i_rdcache_gen);
1405 ci->i_rdcache_gen = 0; 1427 ci->i_rdcache_revoking--;
1406 ci->i_rdcache_revoking = 0;
1407 check = 1; 1428 check = 1;
1408 } else { 1429 } else {
1409 dout("invalidate_pages %p gen %d raced, gen now %d\n", 1430 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1410 inode, orig_gen, ci->i_rdcache_gen); 1431 inode, orig_gen, ci->i_rdcache_gen,
1432 ci->i_rdcache_revoking);
1411 } 1433 }
1412 spin_unlock(&inode->i_lock); 1434 spin_unlock(&inode->i_lock);
1413 1435
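
The reworked check above ties the invalidation to a matching (gen, revoking) pair and decrements i_rdcache_revoking on success instead of zeroing both counters, so a revoke queued while the invalidation ran is not lost. A minimal standalone sketch of that pattern (plain ints stand in for the ceph_inode_info fields; the inode spinlock is elided):

	#include <stdio.h>

	struct cache_state {
		int gen;       /* bumped whenever new pages may be cached */
		int revoking;  /* set to gen when an invalidate is queued */
	};

	/* returns 1 if this invalidation "won", 0 if it raced and must
	 * leave the pending-revoke state alone */
	static int finish_invalidate(struct cache_state *s, int orig_gen)
	{
		if (orig_gen == s->gen && orig_gen == s->revoking) {
			s->revoking--;   /* mark this revoke satisfied */
			return 1;
		}
		return 0;                /* a newer gen appeared meanwhile */
	}

	int main(void)
	{
		struct cache_state s = { .gen = 3, .revoking = 3 };
		int orig = s.gen;

		s.gen++;                                      /* simulate a racing read */
		printf("%d\n", finish_invalidate(&s, orig));  /* 0: raced */
		return 0;
	}
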
@@ -1533,7 +1555,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1533 struct inode *parent_inode = dentry->d_parent->d_inode; 1555 struct inode *parent_inode = dentry->d_parent->d_inode;
1534 const unsigned int ia_valid = attr->ia_valid; 1556 const unsigned int ia_valid = attr->ia_valid;
1535 struct ceph_mds_request *req; 1557 struct ceph_mds_request *req;
1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1558 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1537 int issued; 1559 int issued;
1538 int release = 0, dirtied = 0; 1560 int release = 0, dirtied = 0;
1539 int mask = 0; 1561 int mask = 0;
@@ -1728,8 +1750,8 @@ out:
1728 */ 1750 */
1729int ceph_do_getattr(struct inode *inode, int mask) 1751int ceph_do_getattr(struct inode *inode, int mask)
1730{ 1752{
1731 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1753 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1732 struct ceph_mds_client *mdsc = &client->mdsc; 1754 struct ceph_mds_client *mdsc = fsc->mdsc;
1733 struct ceph_mds_request *req; 1755 struct ceph_mds_request *req;
1734 int err; 1756 int err;
1735 1757
@@ -1738,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1738 return 0; 1760 return 0;
1739 } 1761 }
1740 1762
1741 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1763 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1742 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1764 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1743 return 0; 1765 return 0;
1744 1766
@@ -1759,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
1759 * Check inode permissions. We verify we have a valid value for 1781 * Check inode permissions. We verify we have a valid value for
1760 * the AUTH cap, then call the generic handler. 1782 * the AUTH cap, then call the generic handler.
1761 */ 1783 */
1762int ceph_permission(struct inode *inode, int mask) 1784int ceph_permission(struct inode *inode, int mask, unsigned int flags)
1763{ 1785{
1764 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); 1786 int err;
1787
1788 if (flags & IPERM_FLAG_RCU)
1789 return -ECHILD;
1790
1791 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1765 1792
1766 if (!err) 1793 if (!err)
1767 err = generic_permission(inode, mask, NULL); 1794 err = generic_permission(inode, mask, flags, NULL);
1768 return err; 1795 return err;
1769} 1796}
1770 1797
@@ -1788,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1788 else 1815 else
1789 stat->dev = 0; 1816 stat->dev = 0;
1790 if (S_ISDIR(inode->i_mode)) { 1817 if (S_ISDIR(inode->i_mode)) {
1791 stat->size = ci->i_rbytes; 1818 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1819 RBYTES))
1820 stat->size = ci->i_rbytes;
1821 else
1822 stat->size = ci->i_files + ci->i_subdirs;
1792 stat->blocks = 0; 1823 stat->blocks = 0;
1793 stat->blksize = 65536; 1824 stat->blksize = 65536;
1794 } 1825 }
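
With the new mount-option test, st_size of a directory reports either the recursive byte count or the plain entry count. A trivial standalone sketch of that choice (the struct is a stand-in for the ceph_inode_info counters):

	#include <stdint.h>
	#include <stdio.h>

	struct dir_stats {
		uint64_t rbytes;    /* recursive size of everything below */
		uint64_t files;
		uint64_t subdirs;
	};

	static uint64_t dir_stat_size(const struct dir_stats *d, int opt_rbytes)
	{
		/* mirrors the hunk above: rbytes if the mount option is
		 * set, otherwise the number of entries */
		return opt_rbytes ? d->rbytes : d->files + d->subdirs;
	}

	int main(void)
	{
		struct dir_stats d = { .rbytes = 1 << 20, .files = 10, .subdirs = 2 };

		printf("%llu %llu\n",
		       (unsigned long long)dir_stat_size(&d, 1),
		       (unsigned long long)dir_stat_size(&d, 0));
		return 0;
	}
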
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
95 * Set a layout policy on a directory inode. All items in the tree
 96 * rooted at this inode will inherit this layout on creation
 97 * (it doesn't apply retroactively),
 98 * unless a subdirectory has its own layout policy.
99 */
100static long ceph_ioctl_set_layout_policy(struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
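
For reference, a hedged sketch of how the new ioctl might be driven from userspace. The struct mirrors the fields used in the hunk above (the authoritative types live in fs/ceph/ioctl.h), and the 0x97 magic is an assumption taken from that header:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* assumed to mirror struct ceph_ioctl_layout */
	struct ceph_ioctl_layout {
		uint64_t stripe_unit, stripe_count, object_size;
		uint64_t data_pool;
		int64_t  preferred_osd;
	};

	#define CEPH_IOCTL_MAGIC 0x97   /* assumed from fs/ceph/ioctl.h */
	#define CEPH_IOC_SET_LAYOUT_POLICY \
		_IOW(CEPH_IOCTL_MAGIC, 5, struct ceph_ioctl_layout)

	int main(int argc, char **argv)
	{
		struct ceph_ioctl_layout l = {
			.stripe_unit  = 4 << 20,  /* must be a page multiple */
			.stripe_count = 1,
			.object_size  = 4 << 20,  /* multiple of stripe_unit */
			.data_pool    = 0,        /* <= 0: skip pool check */
		};
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY | O_DIRECTORY);
		if (fd < 0 || ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l) < 0) {
			perror("set layout policy");
			return 1;
		}
		close(fd);
		return 0;
	}
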
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
174 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
175 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
176 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
177 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
178 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179 247
180 case CEPH_IOC_LAZYIO: 248 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file); 249 return ceph_ioctl_lazyio(file);
182 } 250 }
251
183 return -ENOTTY; 252 return -ENOTTY;
184} 253}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,50 +1,78 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "mds_client.h" 7#include "mds_client.h"
8#include "pagelist.h" 8#include <linux/ceph/pagelist.h>
9 9
10/** 10/**
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
12 */ 12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, 13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns, 14 int cmd, u8 wait, struct file_lock *fl)
15 int cmd, u64 start, u64 length, u8 wait)
16{ 15{
17 struct inode *inode = file->f_dentry->d_inode; 16 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 17 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc; 18 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 19 struct ceph_mds_request *req;
21 int err; 20 int err;
21 u64 length = 0;
22 22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 24 if (IS_ERR(req))
25 return PTR_ERR(req); 25 return PTR_ERR(req);
26 req->r_inode = igrab(inode); 26 req->r_inode = igrab(inode);
27 27
28 /* mds requires start and length rather than start and end */
29 if (LLONG_MAX == fl->fl_end)
30 length = 0;
31 else
32 length = fl->fl_end - fl->fl_start + 1;
33
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 34 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type, 35 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd); 36 (int)operation, (u64)fl->fl_pid, fl->fl_start,
37 length, wait, fl->fl_type);
38
31 39
32 req->r_args.filelock_change.rule = lock_type; 40 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd; 41 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid); 42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
35 /* This should be adjusted, but I'm not sure if 43 /* This should be adjusted, but I'm not sure if
 36 namespaces actually get id numbers */ 44 namespaces actually get id numbers */
37 req->r_args.filelock_change.pid_namespace = 45 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns); 46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
39 req->r_args.filelock_change.start = cpu_to_le64(start); 47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
40 req->r_args.filelock_change.length = cpu_to_le64(length); 48 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait; 49 req->r_args.filelock_change.wait = wait;
42 50
43 err = ceph_mdsc_do_request(mdsc, inode, req); 51 err = ceph_mdsc_do_request(mdsc, inode, req);
52
 53 if (operation == CEPH_MDS_OP_GETFILELOCK) {
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK;
57 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
58 fl->fl_type = F_WRLCK;
59 else
60 fl->fl_type = F_UNLCK;
61
62 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
63 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
64 le64_to_cpu(req->r_reply_info.filelock_reply->length);
65 if (length >= 1)
 66 fl->fl_end = length - 1;
67 else
68 fl->fl_end = 0;
69
70 }
44 ceph_mdsc_put_request(req); 71 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 72 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, 73 "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err); 74 (int)operation, (u64)fl->fl_pid, fl->fl_start,
75 length, wait, fl->fl_type, err);
48 return err; 76 return err;
49} 77}
50 78
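
The refactor moves the range conversion into ceph_lock_message(): VFS file_lock ranges are inclusive [start, end], with end == LLONG_MAX meaning "to EOF", while the MDS wants (start, length), with length 0 meaning unbounded. A standalone sketch of both directions, including the clamp used on the reply path:

	#include <limits.h>
	#include <stdio.h>

	static long long range_to_length(long long start, long long end)
	{
		return (end == LLONG_MAX) ? 0 : end - start + 1;
	}

	static long long length_to_end(long long start, long long length)
	{
		long long end = start + length;

		return (end >= 1) ? end - 1 : 0;  /* mirrors the reply path */
	}

	int main(void)
	{
		printf("%lld\n", range_to_length(100, LLONG_MAX)); /* 0 */
		printf("%lld\n", range_to_length(100, 199));       /* 100 */
		printf("%lld\n", length_to_end(100, 100));         /* 199 */
		return 0;
	}
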
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
54 */ 82 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl) 83int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{ 84{
57 u64 length;
58 u8 lock_cmd; 85 u8 lock_cmd;
59 int err; 86 int err;
60 u8 wait = 0; 87 u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
76 else 103 else
77 lock_cmd = CEPH_LOCK_UNLOCK; 104 lock_cmd = CEPH_LOCK_UNLOCK;
78 105
79 if (LLONG_MAX == fl->fl_end) 106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) { 107 if (!err) {
90 dout("mds locked, locking locally"); 108 if ( op != CEPH_MDS_OP_GETFILELOCK ){
91 err = posix_lock_file(file, fl, NULL); 109 dout("mds locked, locking locally");
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 110 err = posix_lock_file(file, fl, NULL);
93 /* undo! This should only happen if the kernel detects 111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
94 * local deadlock. */ 112 /* undo! This should only happen if the kernel detects
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 113 * local deadlock. */
96 (u64)fl->fl_pid, 114 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
97 (u64)(unsigned long)fl->fl_nspid, 115 CEPH_LOCK_UNLOCK, 0, fl);
98 CEPH_LOCK_UNLOCK, fl->fl_start, 116 dout("got %d on posix_lock_file, undid lock", err);
99 length, 0); 117 }
100 dout("got %d on posix_lock_file, undid lock", err);
101 } 118 }
119
102 } else { 120 } else {
103 dout("mds returned error code %d", err); 121 dout("mds returned error code %d", err);
104 } 122 }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
107 125
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl) 126int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{ 127{
110 u64 length;
111 u8 lock_cmd; 128 u8 lock_cmd;
112 int err; 129 int err;
113 u8 wait = 1; 130 u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
127 lock_cmd = CEPH_LOCK_EXCL; 144 lock_cmd = CEPH_LOCK_EXCL;
128 else 145 else
129 lock_cmd = CEPH_LOCK_UNLOCK; 146 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135 147
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 148 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid, 149 file, lock_cmd, wait, fl);
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) { 150 if (!err) {
142 err = flock_lock_file_wait(file, fl); 151 err = flock_lock_file_wait(file, fl);
143 if (err) { 152 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK, 153 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK, 154 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid, 155 file, CEPH_LOCK_UNLOCK, 0, fl);
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err); 156 dout("got %d on flock_lock_file_wait, undid lock", err);
151 } 157 }
152 } else { 158 } else {
@@ -181,8 +187,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
181 * Encode the flock and fcntl locks for the given inode into the pagelist. 187 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks, 188 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks. 189 * sequential flock locks.
184 * Must be called with BLK already held, and the lock numbers should have 190 * Must be called with lock_flocks() already held.
185 * been gathered under the same lock holding window. 191 * If we encounter more of a specific lock type than expected,
192 * we return -ENOSPC.
186 */ 193 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, 194int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks) 195 int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +197,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
190 struct file_lock *lock; 197 struct file_lock *lock;
191 struct ceph_filelock cephlock; 198 struct ceph_filelock cephlock;
192 int err = 0; 199 int err = 0;
200 int seen_fcntl = 0;
201 int seen_flock = 0;
193 202
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 203 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks); 204 num_fcntl_locks);
@@ -198,6 +207,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
198 goto fail; 207 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 208 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) { 209 if (lock->fl_flags & FL_POSIX) {
210 ++seen_fcntl;
211 if (seen_fcntl > num_fcntl_locks) {
212 err = -ENOSPC;
213 goto fail;
214 }
201 err = lock_to_ceph_filelock(lock, &cephlock); 215 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err) 216 if (err)
203 goto fail; 217 goto fail;
@@ -213,6 +227,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
213 goto fail; 227 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 228 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) { 229 if (lock->fl_flags & FL_FLOCK) {
230 ++seen_flock;
231 if (seen_flock > num_flock_locks) {
232 err = -ENOSPC;
233 goto fail;
234 }
216 err = lock_to_ceph_filelock(lock, &cephlock); 235 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err) 236 if (err)
218 goto fail; 237 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,20 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
6#include <linux/smp_lock.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
7 9
8#include "mds_client.h"
9#include "mon_client.h"
10#include "super.h" 10#include "super.h"
11#include "messenger.h" 11#include "mds_client.h"
12#include "decode.h" 12
13#include "auth.h" 13#include <linux/ceph/messenger.h>
14#include "pagelist.h" 14#include <linux/ceph/decode.h>
15#include <linux/ceph/pagelist.h>
16#include <linux/ceph/auth.h>
17#include <linux/ceph/debugfs.h>
15 18
16/* 19/*
17 * A cluster of MDS (metadata server) daemons is responsible for 20 * A cluster of MDS (metadata server) daemons is responsible for
@@ -57,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
57 * parse individual inode info 60 * parse individual inode info
58 */ 61 */
59static int parse_reply_info_in(void **p, void *end, 62static int parse_reply_info_in(void **p, void *end,
60 struct ceph_mds_reply_info_in *info) 63 struct ceph_mds_reply_info_in *info,
64 int features)
61{ 65{
62 int err = -EIO; 66 int err = -EIO;
63 67
@@ -71,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
71 info->symlink = *p; 75 info->symlink = *p;
72 *p += info->symlink_len; 76 *p += info->symlink_len;
73 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad);
81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83
74 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
75 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
76 info->xattr_data = *p; 86 info->xattr_data = *p;
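
parse_reply_info_in() now decodes dir_layout only when the peer advertises CEPH_FEATURE_DIRLAYOUTHASH and zeroes it otherwise, so callers never need feature checks of their own. A standalone sketch of that decode pattern (the feature bit and payload type here are stand-ins):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define FEAT_DIRLAYOUT (1 << 0)          /* hypothetical feature bit */

	struct dir_layout { uint32_t hash; };    /* stand-in payload */

	/* decode the field only if the sender is new enough to have
	 * encoded it; otherwise leave a zeroed default */
	static int decode_in(const uint8_t **p, const uint8_t *end,
			     int features, struct dir_layout *dl)
	{
		if (features & FEAT_DIRLAYOUT) {
			if (*p + sizeof(*dl) > end)
				return -1;           /* truncated: EIO */
			memcpy(dl, *p, sizeof(*dl));
			*p += sizeof(*dl);
		} else {
			memset(dl, 0, sizeof(*dl));
		}
		return 0;
	}

	int main(void)
	{
		uint8_t buf[4] = { 1, 0, 0, 0 };
		const uint8_t *p = buf;
		struct dir_layout dl;

		decode_in(&p, buf + sizeof(buf), FEAT_DIRLAYOUT, &dl);
		printf("%u\n", dl.hash);   /* 1 */
		return 0;
	}
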
@@ -85,12 +95,13 @@ bad:
85 * target inode. 95 * target inode.
86 */ 96 */
87static int parse_reply_info_trace(void **p, void *end, 97static int parse_reply_info_trace(void **p, void *end,
88 struct ceph_mds_reply_info_parsed *info) 98 struct ceph_mds_reply_info_parsed *info,
99 int features)
89{ 100{
90 int err; 101 int err;
91 102
92 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
93 err = parse_reply_info_in(p, end, &info->diri); 104 err = parse_reply_info_in(p, end, &info->diri, features);
94 if (err < 0) 105 if (err < 0)
95 goto out_bad; 106 goto out_bad;
96 107
@@ -111,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
111 } 122 }
112 123
113 if (info->head->is_target) { 124 if (info->head->is_target) {
114 err = parse_reply_info_in(p, end, &info->targeti); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
115 if (err < 0) 126 if (err < 0)
116 goto out_bad; 127 goto out_bad;
117 } 128 }
@@ -131,7 +142,8 @@ out_bad:
131 * parse readdir results 142 * parse readdir results
132 */ 143 */
133static int parse_reply_info_dir(void **p, void *end, 144static int parse_reply_info_dir(void **p, void *end,
134 struct ceph_mds_reply_info_parsed *info) 145 struct ceph_mds_reply_info_parsed *info,
146 int features)
135{ 147{
136 u32 num, i = 0; 148 u32 num, i = 0;
137 int err; 149 int err;
@@ -179,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
179 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
180 192
181 /* inode */ 193 /* inode */
182 err = parse_reply_info_in(p, end, &info->dir_in[i]); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
183 if (err < 0) 195 if (err < 0)
184 goto out_bad; 196 goto out_bad;
185 i++; 197 i++;
@@ -199,10 +211,45 @@ out_bad:
199} 211}
200 212
201/* 213/*
214 * parse fcntl F_GETLK results
215 */
216static int parse_reply_info_filelock(void **p, void *end,
217 struct ceph_mds_reply_info_parsed *info,
218 int features)
219{
220 if (*p + sizeof(*info->filelock_reply) > end)
221 goto bad;
222
223 info->filelock_reply = *p;
224 *p += sizeof(*info->filelock_reply);
225
226 if (unlikely(*p != end))
227 goto bad;
228 return 0;
229
230bad:
231 return -EIO;
232}
233
234/*
235 * parse extra results
236 */
237static int parse_reply_info_extra(void **p, void *end,
238 struct ceph_mds_reply_info_parsed *info,
239 int features)
240{
241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
242 return parse_reply_info_filelock(p, end, info, features);
243 else
244 return parse_reply_info_dir(p, end, info, features);
245}
246
247/*
202 * parse entire mds reply 248 * parse entire mds reply
203 */ 249 */
204static int parse_reply_info(struct ceph_msg *msg, 250static int parse_reply_info(struct ceph_msg *msg,
205 struct ceph_mds_reply_info_parsed *info) 251 struct ceph_mds_reply_info_parsed *info,
252 int features)
206{ 253{
207 void *p, *end; 254 void *p, *end;
208 u32 len; 255 u32 len;
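
The new "extra" section holds either a filelock reply or readdir data, dispatched on the op code in the reply head, and the filelock blob must be consumed exactly (*p != end is an error). A standalone sketch of that shape (op values and the payload layout are stand-ins):

	#include <stdint.h>
	#include <stdio.h>

	enum { OP_GETFILELOCK = 1, OP_READDIR = 2 };

	struct filelock_reply { uint64_t start, length; };

	static int parse_filelock(const uint8_t **p, const uint8_t *end)
	{
		if (*p + sizeof(struct filelock_reply) > end)
			return -1;              /* short blob */
		*p += sizeof(struct filelock_reply);
		return (*p == end) ? 0 : -1;    /* must consume it exactly */
	}

	static int parse_extra(int op, const uint8_t **p, const uint8_t *end)
	{
		if (op == OP_GETFILELOCK)
			return parse_filelock(p, end);
		return 0;                       /* readdir parsing elided */
	}

	int main(void)
	{
		uint8_t blob[sizeof(struct filelock_reply)] = { 0 };
		const uint8_t *p = blob;

		printf("%d\n", parse_extra(OP_GETFILELOCK, &p, blob + sizeof(blob)));
		return 0;
	}
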
@@ -215,15 +262,15 @@ static int parse_reply_info(struct ceph_msg *msg,
215 /* trace */ 262 /* trace */
216 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
217 if (len > 0) { 264 if (len > 0) {
218 err = parse_reply_info_trace(&p, p+len, info); 265 err = parse_reply_info_trace(&p, p+len, info, features);
219 if (err < 0) 266 if (err < 0)
220 goto out_bad; 267 goto out_bad;
221 } 268 }
222 269
223 /* dir content */ 270 /* extra */
224 ceph_decode_32_safe(&p, end, len, bad); 271 ceph_decode_32_safe(&p, end, len, bad);
225 if (len > 0) { 272 if (len > 0) {
226 err = parse_reply_info_dir(&p, p+len, info); 273 err = parse_reply_info_extra(&p, p+len, info, features);
227 if (err < 0) 274 if (err < 0)
228 goto out_bad; 275 goto out_bad;
229 } 276 }
@@ -286,8 +333,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
286 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 333 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
287 if (atomic_dec_and_test(&s->s_ref)) { 334 if (atomic_dec_and_test(&s->s_ref)) {
288 if (s->s_authorizer) 335 if (s->s_authorizer)
289 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 336 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
290 s->s_mdsc->client->monc.auth, s->s_authorizer); 337 s->s_mdsc->fsc->client->monc.auth,
338 s->s_authorizer);
291 kfree(s); 339 kfree(s);
292 } 340 }
293} 341}
@@ -344,7 +392,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
344 s->s_seq = 0; 392 s->s_seq = 0;
345 mutex_init(&s->s_mutex); 393 mutex_init(&s->s_mutex);
346 394
347 ceph_con_init(mdsc->client->msgr, &s->s_con); 395 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
348 s->s_con.private = s; 396 s->s_con.private = s;
349 s->s_con.ops = &mds_con_ops; 397 s->s_con.ops = &mds_con_ops;
350 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 398 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -524,6 +572,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
524 ceph_mdsc_get_request(req); 572 ceph_mdsc_get_request(req);
525 __insert_request(mdsc, req); 573 __insert_request(mdsc, req);
526 574
575 req->r_uid = current_fsuid();
576 req->r_gid = current_fsgid();
577
527 if (dir) { 578 if (dir) {
528 struct ceph_inode_info *ci = ceph_inode(dir); 579 struct ceph_inode_info *ci = ceph_inode(dir);
529 580
@@ -599,7 +650,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
599 } else if (req->r_dentry) { 650 } else if (req->r_dentry) {
600 struct inode *dir = req->r_dentry->d_parent->d_inode; 651 struct inode *dir = req->r_dentry->d_parent->d_inode;
601 652
602 if (dir->i_sb != mdsc->client->sb) { 653 if (dir->i_sb != mdsc->fsc->sb) {
603 /* not this fs! */ 654 /* not this fs! */
604 inode = req->r_dentry->d_inode; 655 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 656 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -615,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
615 } else { 666 } else {
616 /* dir + name */ 667 /* dir + name */
617 inode = dir; 668 inode = dir;
618 hash = req->r_dentry->d_name.hash; 669 hash = ceph_dentry_hash(req->r_dentry);
619 is_hash = true; 670 is_hash = true;
620 } 671 }
621 } 672 }
@@ -642,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
642 dout("choose_mds %p %llx.%llx " 693 dout("choose_mds %p %llx.%llx "
643 "frag %u mds%d (%d/%d)\n", 694 "frag %u mds%d (%d/%d)\n",
644 inode, ceph_vinop(inode), 695 inode, ceph_vinop(inode),
645 frag.frag, frag.mds, 696 frag.frag, mds,
646 (int)r, frag.ndist); 697 (int)r, frag.ndist);
647 return mds; 698 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
699 CEPH_MDS_STATE_ACTIVE)
700 return mds;
648 } 701 }
649 702
650 /* since this file/dir wasn't known to be 703 /* since this file/dir wasn't known to be
@@ -657,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
657 dout("choose_mds %p %llx.%llx " 710 dout("choose_mds %p %llx.%llx "
658 "frag %u mds%d (auth)\n", 711 "frag %u mds%d (auth)\n",
659 inode, ceph_vinop(inode), frag.frag, mds); 712 inode, ceph_vinop(inode), frag.frag, mds);
660 return mds; 713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE)
715 return mds;
661 } 716 }
662 } 717 }
663 } 718 }
@@ -884,7 +939,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
884 __ceph_remove_cap(cap); 939 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) { 940 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc = 941 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc; 942 ceph_sb_to_client(inode->i_sb)->mdsc;
888 943
889 spin_lock(&mdsc->cap_dirty_lock); 944 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) { 945 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1201,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1146 struct ceph_msg *msg, *partial = NULL; 1201 struct ceph_msg *msg, *partial = NULL;
1147 struct ceph_mds_cap_release *head; 1202 struct ceph_mds_cap_release *head;
1148 int err = -ENOMEM; 1203 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety; 1204 int extra = mdsc->fsc->mount_options->cap_release_safety;
1150 int num; 1205 int num;
1151 1206
1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, 1207 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -1447,7 +1502,7 @@ retry:
1447 *base = ceph_ino(temp->d_inode); 1502 *base = ceph_ino(temp->d_inode);
1448 *plen = len; 1503 *plen = len;
1449 dout("build_path on %p %d built %llx '%.*s'\n", 1504 dout("build_path on %p %d built %llx '%.*s'\n",
1450 dentry, atomic_read(&dentry->d_count), *base, len, path); 1505 dentry, dentry->d_count, *base, len, path);
1451 return path; 1506 return path;
1452} 1507}
1453 1508
@@ -1583,8 +1638,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1583 1638
1584 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); 1639 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1585 head->op = cpu_to_le32(req->r_op); 1640 head->op = cpu_to_le32(req->r_op);
1586 head->caller_uid = cpu_to_le32(current_fsuid()); 1641 head->caller_uid = cpu_to_le32(req->r_uid);
1587 head->caller_gid = cpu_to_le32(current_fsgid()); 1642 head->caller_gid = cpu_to_le32(req->r_gid);
1588 head->args = req->r_args; 1643 head->args = req->r_args;
1589 1644
1590 ceph_encode_filepath(&p, end, ino1, path1); 1645 ceph_encode_filepath(&p, end, ino1, path1);
@@ -1654,7 +1709,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1654 struct ceph_msg *msg; 1709 struct ceph_msg *msg;
1655 int flags = 0; 1710 int flags = 0;
1656 1711
1657 req->r_mds = mds;
1658 req->r_attempts++; 1712 req->r_attempts++;
1659 if (req->r_inode) { 1713 if (req->r_inode) {
1660 struct ceph_cap *cap = 1714 struct ceph_cap *cap =
@@ -1741,6 +1795,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1741 goto finish; 1795 goto finish;
1742 } 1796 }
1743 1797
1798 put_request_session(req);
1799
1744 mds = __choose_mds(mdsc, req); 1800 mds = __choose_mds(mdsc, req);
1745 if (mds < 0 || 1801 if (mds < 0 ||
1746 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 1802 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1758,6 +1814,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1758 goto finish; 1814 goto finish;
1759 } 1815 }
1760 } 1816 }
1817 req->r_session = get_session(session);
1818
1761 dout("do_request mds%d session %p state %s\n", mds, session, 1819 dout("do_request mds%d session %p state %s\n", mds, session,
1762 session_state_name(session->s_state)); 1820 session_state_name(session->s_state));
1763 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1821 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1770,7 +1828,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
1770 } 1828 }
1771 1829
1772 /* send request */ 1830 /* send request */
1773 req->r_session = get_session(session);
1774 req->r_resend_mds = -1; /* forget any previous mds hint */ 1831 req->r_resend_mds = -1; /* forget any previous mds hint */
1775 1832
1776 if (req->r_request_started == 0) /* note request start time */ 1833 if (req->r_request_started == 0) /* note request start time */
@@ -1824,7 +1881,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1824 if (req->r_session && 1881 if (req->r_session &&
1825 req->r_session->s_mds == mds) { 1882 req->r_session->s_mds == mds) {
1826 dout(" kicking tid %llu\n", req->r_tid); 1883 dout(" kicking tid %llu\n", req->r_tid);
1827 put_request_session(req);
1828 __do_request(mdsc, req); 1884 __do_request(mdsc, req);
1829 } 1885 }
1830 } 1886 }
@@ -2017,8 +2073,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2017 goto out; 2073 goto out;
2018 } else { 2074 } else {
2019 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2075 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2020 struct ceph_cap *cap = 2076 struct ceph_cap *cap = NULL;
2021 ceph_get_cap_for_mds(ci, req->r_mds);; 2077
2078 if (req->r_session)
2079 cap = ceph_get_cap_for_mds(ci,
2080 req->r_session->s_mds);
2022 2081
2023 dout("already using auth"); 2082 dout("already using auth");
2024 if ((!cap || cap != ci->i_auth_cap) || 2083 if ((!cap || cap != ci->i_auth_cap) ||
@@ -2062,12 +2121,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2062 2121
2063 dout("handle_reply tid %lld result %d\n", tid, result); 2122 dout("handle_reply tid %lld result %d\n", tid, result);
2064 rinfo = &req->r_reply_info; 2123 rinfo = &req->r_reply_info;
2065 err = parse_reply_info(msg, rinfo); 2124 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2066 mutex_unlock(&mdsc->mutex); 2125 mutex_unlock(&mdsc->mutex);
2067 2126
2068 mutex_lock(&session->s_mutex); 2127 mutex_lock(&session->s_mutex);
2069 if (err < 0) { 2128 if (err < 0) {
2070 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); 2129 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2071 ceph_msg_dump(msg); 2130 ceph_msg_dump(msg);
2072 goto out_err; 2131 goto out_err;
2073 } 2132 }
@@ -2085,9 +2144,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2085 2144
2086 /* insert trace into our cache */ 2145 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex); 2146 mutex_lock(&req->r_fill_mutex);
2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2147 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2089 if (err == 0) { 2148 if (err == 0) {
2090 if (result == 0 && rinfo->dir_nr) 2149 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2150 rinfo->dir_nr)
2091 ceph_readdir_prepopulate(req, req->r_session); 2151 ceph_readdir_prepopulate(req, req->r_session);
2092 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2152 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2093 } 2153 }
@@ -2361,19 +2421,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2361 2421
2362 if (recon_state->flock) { 2422 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks; 2423 int num_fcntl_locks, num_flock_locks;
2364 2424 struct ceph_pagelist_cursor trunc_point;
2365 lock_kernel(); 2425
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2426 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2367 rec.v2.flock_len = (2*sizeof(u32) + 2427 do {
2368 (num_fcntl_locks+num_flock_locks) * 2428 lock_flocks();
2369 sizeof(struct ceph_filelock)); 2429 ceph_count_locks(inode, &num_fcntl_locks,
2370 2430 &num_flock_locks);
2371 err = ceph_pagelist_append(pagelist, &rec, reclen); 2431 rec.v2.flock_len = (2*sizeof(u32) +
2372 if (!err) 2432 (num_fcntl_locks+num_flock_locks) *
2373 err = ceph_encode_locks(inode, pagelist, 2433 sizeof(struct ceph_filelock));
2374 num_fcntl_locks, 2434 unlock_flocks();
2375 num_flock_locks); 2435
2376 unlock_kernel(); 2436 /* pre-alloc pagelist */
2437 ceph_pagelist_truncate(pagelist, &trunc_point);
2438 err = ceph_pagelist_append(pagelist, &rec, reclen);
2439 if (!err)
2440 err = ceph_pagelist_reserve(pagelist,
2441 rec.v2.flock_len);
2442
2443 /* encode locks */
2444 if (!err) {
2445 lock_flocks();
2446 err = ceph_encode_locks(inode,
2447 pagelist,
2448 num_fcntl_locks,
2449 num_flock_locks);
2450 unlock_flocks();
2451 }
2452 } while (err == -ENOSPC);
2377 } else { 2453 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen); 2454 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 } 2455 }
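
The reconnect path can no longer hold the file-lock lock across a blocking pagelist append, so it counts locks, preallocates, then re-takes the lock to encode, looping while ceph_encode_locks() reports -ENOSPC because locks were added in between. A standalone sketch of that count/reserve/encode/retry shape (the pagelist is reduced to a reserved count, and the locking is shown as comments):

	#include <stdio.h>

	static int nlocks = 3;   /* stand-in for the inode's lock list */
	static int racing = 1;   /* simulate one lock added mid-sequence */

	static int encode_locks(int reserved)
	{
		return (nlocks > reserved) ? -1 : 0;  /* -ENOSPC analogue */
	}

	int main(void)
	{
		int err, expected;

		do {
			expected = nlocks;  /* counted under lock_flocks() */

			/* lock dropped here for blocking preallocation;
			 * a new lock can sneak in: */
			if (racing--)
				nlocks++;

			err = encode_locks(expected);  /* re-locked encode */
		} while (err == -1);                   /* retry on overflow */

		printf("encoded %d locks after retry\n", expected);
		return 0;
	}
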
@@ -2613,7 +2689,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2613 struct ceph_mds_session *session, 2689 struct ceph_mds_session *session,
2614 struct ceph_msg *msg) 2690 struct ceph_msg *msg)
2615{ 2691{
2616 struct super_block *sb = mdsc->client->sb; 2692 struct super_block *sb = mdsc->fsc->sb;
2617 struct inode *inode; 2693 struct inode *inode;
2618 struct ceph_inode_info *ci; 2694 struct ceph_inode_info *ci;
2619 struct dentry *parent, *dentry; 2695 struct dentry *parent, *dentry;
@@ -2891,10 +2967,16 @@ static void delayed_work(struct work_struct *work)
2891 schedule_delayed(mdsc); 2967 schedule_delayed(mdsc);
2892} 2968}
2893 2969
2970int ceph_mdsc_init(struct ceph_fs_client *fsc)
2894 2971
2895int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2896{ 2972{
2897 mdsc->client = client; 2973 struct ceph_mds_client *mdsc;
2974
2975 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2976 if (!mdsc)
2977 return -ENOMEM;
2978 mdsc->fsc = fsc;
2979 fsc->mdsc = mdsc;
2898 mutex_init(&mdsc->mutex); 2980 mutex_init(&mdsc->mutex);
2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2981 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL) 2982 if (mdsc->mdsmap == NULL)
@@ -2927,7 +3009,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2927 INIT_LIST_HEAD(&mdsc->dentry_lru); 3009 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928 3010
2929 ceph_caps_init(mdsc); 3011 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps); 3012 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2931 3013
2932 return 0; 3014 return 0;
2933} 3015}
@@ -2939,7 +3021,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2939static void wait_requests(struct ceph_mds_client *mdsc) 3021static void wait_requests(struct ceph_mds_client *mdsc)
2940{ 3022{
2941 struct ceph_mds_request *req; 3023 struct ceph_mds_request *req;
2942 struct ceph_client *client = mdsc->client; 3024 struct ceph_fs_client *fsc = mdsc->fsc;
2943 3025
2944 mutex_lock(&mdsc->mutex); 3026 mutex_lock(&mdsc->mutex);
2945 if (__get_oldest_req(mdsc)) { 3027 if (__get_oldest_req(mdsc)) {
@@ -2947,7 +3029,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2947 3029
2948 dout("wait_requests waiting for requests\n"); 3030 dout("wait_requests waiting for requests\n");
2949 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 3031 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2950 client->mount_args->mount_timeout * HZ); 3032 fsc->client->options->mount_timeout * HZ);
2951 3033
2952 /* tear down remaining requests */ 3034 /* tear down remaining requests */
2953 mutex_lock(&mdsc->mutex); 3035 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3112,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3030{ 3112{
3031 u64 want_tid, want_flush; 3113 u64 want_tid, want_flush;
3032 3114
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3115 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return; 3116 return;
3035 3117
3036 dout("sync\n"); 3118 dout("sync\n");
@@ -3053,7 +3135,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{ 3135{
3054 int i, n = 0; 3136 int i, n = 0;
3055 3137
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3138 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true; 3139 return true;
3058 3140
3059 mutex_lock(&mdsc->mutex); 3141 mutex_lock(&mdsc->mutex);
@@ -3071,8 +3153,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3071{ 3153{
3072 struct ceph_mds_session *session; 3154 struct ceph_mds_session *session;
3073 int i; 3155 int i;
3074 struct ceph_client *client = mdsc->client; 3156 struct ceph_fs_client *fsc = mdsc->fsc;
3075 unsigned long timeout = client->mount_args->mount_timeout * HZ; 3157 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3076 3158
3077 dout("close_sessions\n"); 3159 dout("close_sessions\n");
3078 3160
@@ -3119,7 +3201,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3119 dout("stopped\n"); 3201 dout("stopped\n");
3120} 3202}
3121 3203
3122void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3204static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3123{ 3205{
3124 dout("stop\n"); 3206 dout("stop\n");
3125 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3207 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3211,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3129 ceph_caps_finalize(mdsc); 3211 ceph_caps_finalize(mdsc);
3130} 3212}
3131 3213
3214void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3215{
3216 struct ceph_mds_client *mdsc = fsc->mdsc;
3217
3218 ceph_mdsc_stop(mdsc);
3219 fsc->mdsc = NULL;
3220 kfree(mdsc);
3221}
3222
3132 3223
3133/* 3224/*
3134 * handle mds map update. 3225 * handle mds map update.
@@ -3145,14 +3236,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3145 3236
3146 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3237 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3147 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3238 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3148 if (ceph_check_fsid(mdsc->client, &fsid) < 0) 3239 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3149 return; 3240 return;
3150 epoch = ceph_decode_32(&p); 3241 epoch = ceph_decode_32(&p);
3151 maplen = ceph_decode_32(&p); 3242 maplen = ceph_decode_32(&p);
3152 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3243 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3153 3244
3154 /* do we need it? */ 3245 /* do we need it? */
3155 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); 3246 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3156 mutex_lock(&mdsc->mutex); 3247 mutex_lock(&mdsc->mutex);
3157 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3248 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3158 dout("handle_map epoch %u <= our %u\n", 3249 dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3267,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3176 } else { 3267 } else {
3177 mdsc->mdsmap = newmap; /* first mds map */ 3268 mdsc->mdsmap = newmap; /* first mds map */
3178 } 3269 }
3179 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3270 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3180 3271
3181 __wake_requests(mdsc, &mdsc->waiting_for_map); 3272 __wake_requests(mdsc, &mdsc->waiting_for_map);
3182 3273
@@ -3277,7 +3368,7 @@ static int get_authorizer(struct ceph_connection *con,
3277{ 3368{
3278 struct ceph_mds_session *s = con->private; 3369 struct ceph_mds_session *s = con->private;
3279 struct ceph_mds_client *mdsc = s->s_mdsc; 3370 struct ceph_mds_client *mdsc = s->s_mdsc;
3280 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3371 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3281 int ret = 0; 3372 int ret = 0;
3282 3373
3283 if (force_new && s->s_authorizer) { 3374 if (force_new && s->s_authorizer) {
@@ -3311,7 +3402,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3311{ 3402{
3312 struct ceph_mds_session *s = con->private; 3403 struct ceph_mds_session *s = con->private;
3313 struct ceph_mds_client *mdsc = s->s_mdsc; 3404 struct ceph_mds_client *mdsc = s->s_mdsc;
3314 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3405 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3315 3406
3316 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3407 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3317} 3408}
@@ -3320,12 +3411,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
3320{ 3411{
3321 struct ceph_mds_session *s = con->private; 3412 struct ceph_mds_session *s = con->private;
3322 struct ceph_mds_client *mdsc = s->s_mdsc; 3413 struct ceph_mds_client *mdsc = s->s_mdsc;
3323 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3414 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3324 3415
3325 if (ac->ops->invalidate_authorizer) 3416 if (ac->ops->invalidate_authorizer)
3326 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3417 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3327 3418
3328 return ceph_monc_validate_auth(&mdsc->client->monc); 3419 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3329} 3420}
3330 3421
3331static const struct ceph_connection_operations mds_con_ops = { 3422static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3429,4 @@ static const struct ceph_connection_operations mds_con_ops = {
3338 .peer_reset = peer_reset, 3429 .peer_reset = peer_reset,
3339}; 3430};
3340 3431
3341
3342
3343
3344/* eof */ 3432/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
8#include <linux/rbtree.h> 8#include <linux/rbtree.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10 10
11#include "types.h" 11#include <linux/ceph/types.h>
12#include "messenger.h" 12#include <linux/ceph/messenger.h>
13#include "mdsmap.h" 13#include <linux/ceph/mdsmap.h>
14 14
15/* 15/*
16 * Some lock dependencies: 16 * Some lock dependencies:
@@ -26,7 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29struct ceph_client; 29struct ceph_fs_client;
30struct ceph_cap; 30struct ceph_cap;
31 31
32/* 32/*
@@ -35,6 +35,7 @@ struct ceph_cap;
35 */ 35 */
36struct ceph_mds_reply_info_in { 36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout;
38 u32 symlink_len; 39 u32 symlink_len;
39 char *symlink; 40 char *symlink;
40 u32 xattr_len; 41 u32 xattr_len;
@@ -42,26 +43,37 @@ struct ceph_mds_reply_info_in {
42}; 43};
43 44
44/* 45/*
45 * parsed info about an mds reply, including information about the 46 * parsed info about an mds reply, including information about
46 * target inode and/or its parent directory and dentry, and directory 47 * either: 1) the target inode and/or its parent directory and dentry,
47 * contents (for readdir results). 48 * and directory contents (for readdir results), or
49 * 2) the file range lock info (for fcntl F_GETLK results).
48 */ 50 */
49struct ceph_mds_reply_info_parsed { 51struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head; 52 struct ceph_mds_reply_head *head;
51 53
54 /* trace */
52 struct ceph_mds_reply_info_in diri, targeti; 55 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag; 56 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname; 57 char *dname;
55 u32 dname_len; 58 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease; 59 struct ceph_mds_reply_lease *dlease;
57 60
58 struct ceph_mds_reply_dirfrag *dir_dir; 61 /* extra */
59 int dir_nr; 62 union {
60 char **dir_dname; 63 /* for fcntl F_GETLK results */
61 u32 *dir_dname_len; 64 struct ceph_filelock *filelock_reply;
62 struct ceph_mds_reply_lease **dir_dlease; 65
63 struct ceph_mds_reply_info_in *dir_in; 66 /* for readdir results */
64 u8 dir_complete, dir_end; 67 struct {
68 struct ceph_mds_reply_dirfrag *dir_dir;
69 int dir_nr;
70 char **dir_dname;
71 u32 *dir_dname_len;
72 struct ceph_mds_reply_lease **dir_dlease;
73 struct ceph_mds_reply_info_in *dir_in;
74 u8 dir_complete, dir_end;
75 };
76 };
65 77
66 /* encoded blob describing snapshot contexts for certain 78 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */ 79 operations (e.g., open) */
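
Since a reply carries either lock info or readdir results, never both, the parsed-info struct overlays them in an anonymous union keyed on the op in the reply head. A standalone sketch of the same overlay (field names are stand-ins):

	#include <stdio.h>

	struct reply_info {
		int op;                     /* discriminant (head->op) */
		union {
			struct { long start, length; } filelock;
			struct { int dir_nr; } readdir;
		};
	};

	int main(void)
	{
		struct reply_info r = { .op = 1 };   /* 1 = "getfilelock" */

		r.filelock.start = 100;
		r.filelock.length = 50;

		if (r.op == 1)
			printf("lock at %ld+%ld\n",
			       r.filelock.start, r.filelock.length);
		else
			printf("%d dir entries\n", r.readdir.dir_nr);
		return 0;
	}
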
@@ -154,7 +166,6 @@ struct ceph_mds_request {
154 struct ceph_mds_client *r_mdsc; 166 struct ceph_mds_client *r_mdsc;
155 167
156 int r_op; /* mds op code */ 168 int r_op; /* mds op code */
157 int r_mds;
158 169
159 /* operation on what? */ 170 /* operation on what? */
160 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
170 181
171 union ceph_mds_request_args r_args; 182 union ceph_mds_request_args r_args;
172 int r_fmode; /* file mode, if expecting cap */ 183 int r_fmode; /* file mode, if expecting cap */
184 uid_t r_uid;
185 gid_t r_gid;
173 186
174 /* for choosing which mds to send this request to */ 187 /* for choosing which mds to send this request to */
175 int r_direct_mode; 188 int r_direct_mode;
@@ -230,7 +243,7 @@ struct ceph_mds_request {
230 * mds client state 243 * mds client state
231 */ 244 */
232struct ceph_mds_client { 245struct ceph_mds_client {
233 struct ceph_client *client; 246 struct ceph_fs_client *fsc;
234 struct mutex mutex; /* all nested structures */ 247 struct mutex mutex; /* all nested structures */
235 248
236 struct ceph_mdsmap *mdsmap; 249 struct ceph_mdsmap *mdsmap;
@@ -289,11 +302,6 @@ struct ceph_mds_client {
289 int caps_avail_count; /* unused, unreserved */ 302 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many 303 int caps_min_count; /* keep at least this many
291 (unreserved) */ 304 (unreserved) */
292
293#ifdef CONFIG_DEBUG_FS
294 struct dentry *debugfs_file;
295#endif
296
297 spinlock_t dentry_lru_lock; 305 spinlock_t dentry_lru_lock;
298 struct list_head dentry_lru; 306 struct list_head dentry_lru;
299 int num_dentry; 307 int num_dentry;
@@ -316,10 +324,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
316extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 324extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
317 struct ceph_msg *msg, int mds); 325 struct ceph_msg *msg, int mds);
318 326
319extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, 327extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
320 struct ceph_client *client);
321extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 328extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
322extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); 329extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
323 330
324extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 331extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
325 332
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/bug.h> 3#include <linux/bug.h>
4#include <linux/err.h> 4#include <linux/err.h>
@@ -6,9 +6,9 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "mdsmap.h" 9#include <linux/ceph/mdsmap.h>
10#include "messenger.h" 10#include <linux/ceph/messenger.h>
11#include "decode.h" 11#include <linux/ceph/decode.h>
12 12
13#include "super.h" 13#include "super.h"
14 14
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
117 } 117 }
118 118
119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
120 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), 120 i+1, n, global_id, mds, inc,
121 ceph_pr_addr(&addr.in_addr),
121 ceph_mds_state_name(state)); 122 ceph_mds_state_name(state));
122 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 123 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
123 m->m_info[mds].global_id = global_id; 124 m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
 9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 bool laggy;
17 u32 *export_targets;
18};
19
20struct ceph_mdsmap {
21 u32 m_epoch, m_client_epoch, m_last_failure;
22 u32 m_root;
23 u32 m_session_timeout; /* seconds */
24 u32 m_session_autoclose; /* seconds */
25 u64 m_max_file_size;
26 u32 m_max_mds; /* size of m_addr, m_state arrays */
27 struct ceph_mds_info *m_info;
28
29 /* which object pools file data can be stored in */
30 int m_num_data_pg_pools;
31 u32 *m_data_pg_pools;
32 u32 m_cas_pg_pool;
33};
34
35static inline struct ceph_entity_addr *
36ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
37{
38 if (w >= m->m_max_mds)
39 return NULL;
40 return &m->m_info[w].addr;
41}
42
43static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
44{
45 BUG_ON(w < 0);
46 if (w >= m->m_max_mds)
47 return CEPH_MDS_STATE_DNE;
48 return m->m_info[w].state;
49}
50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
61
62#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 2502d76fcec1..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42/*
43 * nicely render a sockaddr as a string.
44 */
45#define MAX_ADDR_STR 20
46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str;
50
51const char *pr_addr(const struct sockaddr_storage *ss)
52{
53 int i;
54 char *s;
55 struct sockaddr_in *in4 = (void *)ss;
56 struct sockaddr_in6 *in6 = (void *)ss;
57
58 spin_lock(&addr_str_lock);
59 i = last_addr_str++;
60 if (last_addr_str == MAX_ADDR_STR)
61 last_addr_str = 0;
62 spin_unlock(&addr_str_lock);
63 s = addr_str[i];
64
65 switch (ss->ss_family) {
66 case AF_INET:
67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
68 (unsigned int)ntohs(in4->sin_port));
69 break;
70
71 case AF_INET6:
72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
73 (unsigned int)ntohs(in6->sin6_port));
74 break;
75
76 default:
77 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
78 }
79
80 return s;
81}
82
83static void encode_my_addr(struct ceph_messenger *msgr)
84{
85 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
86 ceph_encode_addr(&msgr->my_enc_addr);
87}
88
89/*
90 * work queue for all reading and writing to/from the socket.
91 */
92struct workqueue_struct *ceph_msgr_wq;
93
94int __init ceph_msgr_init(void)
95{
96 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (IS_ERR(ceph_msgr_wq)) {
98 int ret = PTR_ERR(ceph_msgr_wq);
99 pr_err("msgr_init failed to create workqueue: %d\n", ret);
100 ceph_msgr_wq = NULL;
101 return ret;
102 }
103 return 0;
104}
105
106void ceph_msgr_exit(void)
107{
108 destroy_workqueue(ceph_msgr_wq);
109}
110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
117/*
118 * socket callback functions
119 */
120
121/* data available on socket, or listen socket received a connect */
122static void ceph_data_ready(struct sock *sk, int count_unused)
123{
124 struct ceph_connection *con =
125 (struct ceph_connection *)sk->sk_user_data;
126 if (sk->sk_state != TCP_CLOSE_WAIT) {
127 dout("ceph_data_ready on %p state = %lu, queueing work\n",
128 con, con->state);
129 queue_con(con);
130 }
131}
132
133/* socket has buffer space for writing */
134static void ceph_write_space(struct sock *sk)
135{
136 struct ceph_connection *con =
137 (struct ceph_connection *)sk->sk_user_data;
138
139 /* only queue to workqueue if there is data we want to write. */
140 if (test_bit(WRITE_PENDING, &con->state)) {
141 dout("ceph_write_space %p queueing write work\n", con);
142 queue_con(con);
143 } else {
144 dout("ceph_write_space %p nothing to write\n", con);
145 }
146
147 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
148 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
149}
150
151/* socket's state has changed */
152static void ceph_state_change(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 dout("ceph_state_change %p state = %lu sk_state = %u\n",
158 con, con->state, sk->sk_state);
159
160 if (test_bit(CLOSED, &con->state))
161 return;
162
163 switch (sk->sk_state) {
164 case TCP_CLOSE:
165 dout("ceph_state_change TCP_CLOSE\n");
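		/* fall through */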
166 case TCP_CLOSE_WAIT:
167 dout("ceph_state_change TCP_CLOSE_WAIT\n");
168 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
169 if (test_bit(CONNECTING, &con->state))
170 con->error_msg = "connection failed";
171 else
172 con->error_msg = "socket closed";
173 queue_con(con);
174 }
175 break;
176 case TCP_ESTABLISHED:
177 dout("ceph_state_change TCP_ESTABLISHED\n");
178 queue_con(con);
179 break;
180 }
181}
182
183/*
184 * set up socket callbacks
185 */
186static void set_sock_callbacks(struct socket *sock,
187 struct ceph_connection *con)
188{
189 struct sock *sk = sock->sk;
190 sk->sk_user_data = (void *)con;
191 sk->sk_data_ready = ceph_data_ready;
192 sk->sk_write_space = ceph_write_space;
193 sk->sk_state_change = ceph_state_change;
194}
195
196
197/*
198 * socket helpers
199 */
200
201/*
202 * initiate connection to a remote socket.
203 */
204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
205{
206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
207 struct socket *sock;
208 int ret;
209
210 BUG_ON(con->sock);
211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
213 if (ret)
214 return ERR_PTR(ret);
215 con->sock = sock;
216 sock->sk->sk_allocation = GFP_NOFS;
217
218#ifdef CONFIG_LOCKDEP
219 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
220#endif
221
222 set_sock_callbacks(sock, con);
223
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
225
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
228 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state);
232 ret = 0;
233 }
234 if (ret < 0) {
235 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock);
238 con->sock = NULL;
239 con->error_msg = "connect error";
240 }
241
242 if (ret < 0)
243 return ERR_PTR(ret);
244 return sock;
245}
246
247static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
248{
249 struct kvec iov = {buf, len};
250 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
251
252 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
253}
254
255/*
256 * write something. @more is true if caller will be sending more data
257 * shortly.
258 */
259static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
260 size_t kvlen, size_t len, int more)
261{
262 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
263
264 if (more)
265 msg.msg_flags |= MSG_MORE;
266 else
267 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
268
269 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
270}
271
272
273/*
274 * Shutdown/close the socket for the given connection.
275 */
276static int con_close_socket(struct ceph_connection *con)
277{
278 int rc;
279
280 dout("con_close_socket on %p sock %p\n", con, con->sock);
281 if (!con->sock)
282 return 0;
283 set_bit(SOCK_CLOSED, &con->state);
284 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
285 sock_release(con->sock);
286 con->sock = NULL;
287 clear_bit(SOCK_CLOSED, &con->state);
288 return rc;
289}
290
291/*
292 * Reset a connection. Discard all incoming and outgoing messages
293 * and clear *_seq state.
294 */
295static void ceph_msg_remove(struct ceph_msg *msg)
296{
297 list_del_init(&msg->list_head);
298 ceph_msg_put(msg);
299}
300static void ceph_msg_remove_list(struct list_head *head)
301{
302 while (!list_empty(head)) {
303 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
304 list_head);
305 ceph_msg_remove(msg);
306 }
307}
308
309static void reset_connection(struct ceph_connection *con)
310{
311 /* reset connection, out_queue, msg_ and connect_seq */
312 /* discard existing out_queue and msg_seq */
313 ceph_msg_remove_list(&con->out_queue);
314 ceph_msg_remove_list(&con->out_sent);
315
316 if (con->in_msg) {
317 ceph_msg_put(con->in_msg);
318 con->in_msg = NULL;
319 }
320
321 con->connect_seq = 0;
322 con->out_seq = 0;
323 if (con->out_msg) {
324 ceph_msg_put(con->out_msg);
325 con->out_msg = NULL;
326 }
327 con->out_keepalive_pending = false;
328 con->in_seq = 0;
329 con->in_seq_acked = 0;
330}
331
332/*
333 * mark a peer down. drop any open connections.
334 */
335void ceph_con_close(struct ceph_connection *con)
336{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
341 clear_bit(KEEPALIVE_PENDING, &con->state);
342 clear_bit(WRITE_PENDING, &con->state);
343 mutex_lock(&con->mutex);
344 reset_connection(con);
345 con->peer_global_seq = 0;
346 cancel_delayed_work(&con->work);
347 mutex_unlock(&con->mutex);
348 queue_con(con);
349}
350
351/*
352 * Reopen a closed connection, with a new peer address.
353 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */
361 queue_con(con);
362}
363
364/*
365 * return true if this connection ever successfully opened
366 */
367bool ceph_con_opened(struct ceph_connection *con)
368{
369 return con->connect_seq > 0;
370}
371
372/*
373 * generic get/put
374 */
375struct ceph_connection *ceph_con_get(struct ceph_connection *con)
376{
377 dout("con_get %p nref = %d -> %d\n", con,
378 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
379 if (atomic_inc_not_zero(&con->nref))
380 return con;
381 return NULL;
382}
383
384void ceph_con_put(struct ceph_connection *con)
385{
386 dout("con_put %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
388 BUG_ON(atomic_read(&con->nref) == 0);
389 if (atomic_dec_and_test(&con->nref)) {
390 BUG_ON(con->sock);
391 kfree(con);
392 }
393}
394
395/*
396 * initialize a new connection.
397 */
398void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
399{
400 dout("con_init %p\n", con);
401 memset(con, 0, sizeof(*con));
402 atomic_set(&con->nref, 1);
403 con->msgr = msgr;
404 mutex_init(&con->mutex);
405 INIT_LIST_HEAD(&con->out_queue);
406 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work);
408}
409
410
411/*
412 * We maintain a global counter to order connection attempts. Get
413 * a unique seq greater than @gt.
414 */
415static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
416{
417 u32 ret;
418
419 spin_lock(&msgr->global_seq_lock);
420 if (msgr->global_seq < gt)
421 msgr->global_seq = gt;
422 ret = ++msgr->global_seq;
423 spin_unlock(&msgr->global_seq_lock);
424 return ret;
425}
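/*
 * Illustrative sketch (annotation, not original code): on a
 * CEPH_MSGR_TAG_RETRY_GLOBAL reply the peer reports its global_seq and
 * we retry with a strictly larger one:
 *
 *	u32 peer_gseq = le32_to_cpu(con->in_connect.global_seq);
 *	get_global_seq(con->msgr, peer_gseq);
 *	(the next prepare_write_connect() then picks a seq > peer_gseq)
 *
 * See the RETRY_GLOBAL case in process_connect() below.
 */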
426
427
428/*
429 * Prepare footer for currently outgoing message, and finish things
430 * off.  Assumes out_kvec* are already valid; we just add on to the end.
431 */
432static void prepare_write_message_footer(struct ceph_connection *con, int v)
433{
434 struct ceph_msg *m = con->out_msg;
435
436 dout("prepare_write_message_footer %p\n", con);
437 con->out_kvec_is_msg = true;
438 con->out_kvec[v].iov_base = &m->footer;
439 con->out_kvec[v].iov_len = sizeof(m->footer);
440 con->out_kvec_bytes += sizeof(m->footer);
441 con->out_kvec_left++;
442 con->out_more = m->more_to_follow;
443 con->out_msg_done = true;
444}
445
446/*
447 * Prepare headers for the next outgoing message.
448 */
449static void prepare_write_message(struct ceph_connection *con)
450{
451 struct ceph_msg *m;
452 int v = 0;
453
454 con->out_kvec_bytes = 0;
455 con->out_kvec_is_msg = true;
456 con->out_msg_done = false;
457
458 /* Sneak an ack in there first? If we can get it into the same
459 * TCP packet, that's a good thing. */
460 if (con->in_seq > con->in_seq_acked) {
461 con->in_seq_acked = con->in_seq;
462 con->out_kvec[v].iov_base = &tag_ack;
463 con->out_kvec[v++].iov_len = 1;
464 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
465 con->out_kvec[v].iov_base = &con->out_temp_ack;
466 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
467 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
468 }
469
470 m = list_first_entry(&con->out_queue,
471 struct ceph_msg, list_head);
472 con->out_msg = m;
473 if (test_bit(LOSSYTX, &con->state)) {
474 list_del_init(&m->list_head);
475 } else {
476 /* put message on sent list */
477 ceph_msg_get(m);
478 list_move_tail(&m->list_head, &con->out_sent);
479 }
480
481 /*
482 * only assign outgoing seq # if we haven't sent this message
483 * yet. if it is requeued, resend with its original seq.
484 */
485 if (m->needs_out_seq) {
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487 m->needs_out_seq = false;
488 }
489
490 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
491 m, con->out_seq, le16_to_cpu(m->hdr.type),
492 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
493 le32_to_cpu(m->hdr.data_len),
494 m->nr_pages);
495 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
496
497 /* tag + hdr + front + middle */
498 con->out_kvec[v].iov_base = &tag_msg;
499 con->out_kvec[v++].iov_len = 1;
500 con->out_kvec[v].iov_base = &m->hdr;
501 con->out_kvec[v++].iov_len = sizeof(m->hdr);
502 con->out_kvec[v++] = m->front;
503 if (m->middle)
504 con->out_kvec[v++] = m->middle->vec;
505 con->out_kvec_left = v;
506 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
507 (m->middle ? m->middle->vec.iov_len : 0);
508 con->out_kvec_cur = con->out_kvec;
509
510 /* fill in crc (except data pages), footer */
511 con->out_msg->hdr.crc =
512 cpu_to_le32(crc32c(0, (void *)&m->hdr,
513 sizeof(m->hdr) - sizeof(m->hdr.crc)));
514 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
515 con->out_msg->footer.front_crc =
516 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
517 if (m->middle)
518 con->out_msg->footer.middle_crc =
519 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
520 m->middle->vec.iov_len));
521 else
522 con->out_msg->footer.middle_crc = 0;
523 con->out_msg->footer.data_crc = 0;
524 dout("prepare_write_message front_crc %u data_crc %u\n",
525 le32_to_cpu(con->out_msg->footer.front_crc),
526 le32_to_cpu(con->out_msg->footer.middle_crc));
527
528 /* is there a data payload? */
529 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */
531 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos =
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
534 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */
537 } else {
538 /* no, queue up footer too and be done */
539 prepare_write_message_footer(con, v);
540 }
541
542 set_bit(WRITE_PENDING, &con->state);
543}
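/*
 * For reference (reconstructed from the code above, not in the original
 * file), the bytes queued for one outgoing message are:
 *
 *	[tag_ack + le64 ack]   piggybacked ack, if in_seq > in_seq_acked
 *	[tag_msg][ceph_msg_header][front][middle?]
 *	[data pages?][ceph_msg_footer]
 *
 * The footer is added by prepare_write_message_footer(), either
 * immediately (no data payload) or after the data pages have been sent
 * from write_partial_msg_pages().
 */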
544
545/*
546 * Prepare an ack.
547 */
548static void prepare_write_ack(struct ceph_connection *con)
549{
550 dout("prepare_write_ack %p %llu -> %llu\n", con,
551 con->in_seq_acked, con->in_seq);
552 con->in_seq_acked = con->in_seq;
553
554 con->out_kvec[0].iov_base = &tag_ack;
555 con->out_kvec[0].iov_len = 1;
556 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
557 con->out_kvec[1].iov_base = &con->out_temp_ack;
558 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
559 con->out_kvec_left = 2;
560 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
561 con->out_kvec_cur = con->out_kvec;
562 con->out_more = 1; /* more will follow.. eventually.. */
563 set_bit(WRITE_PENDING, &con->state);
564}
565
566/*
567 * Prepare to write keepalive byte.
568 */
569static void prepare_write_keepalive(struct ceph_connection *con)
570{
571 dout("prepare_write_keepalive %p\n", con);
572 con->out_kvec[0].iov_base = &tag_keepalive;
573 con->out_kvec[0].iov_len = 1;
574 con->out_kvec_left = 1;
575 con->out_kvec_bytes = 1;
576 con->out_kvec_cur = con->out_kvec;
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Connection negotiation.
582 */
583
584static void prepare_connect_authorizer(struct ceph_connection *con)
585{
586 void *auth_buf;
587 int auth_len = 0;
588 int auth_protocol = 0;
589
590 mutex_unlock(&con->mutex);
591 if (con->ops->get_authorizer)
592 con->ops->get_authorizer(con, &auth_buf, &auth_len,
593 &auth_protocol, &con->auth_reply_buf,
594 &con->auth_reply_buf_len,
595 con->auth_retry);
596 mutex_lock(&con->mutex);
597
598 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
599 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
600
601 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
602 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
603 con->out_kvec_left++;
604 con->out_kvec_bytes += auth_len;
605}
606
607/*
608 * We connected to a peer and are saying hello.
609 */
610static void prepare_write_banner(struct ceph_messenger *msgr,
611 struct ceph_connection *con)
612{
613 int len = strlen(CEPH_BANNER);
614
615 con->out_kvec[0].iov_base = CEPH_BANNER;
616 con->out_kvec[0].iov_len = len;
617 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
618 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
619 con->out_kvec_left = 2;
620 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
621 con->out_kvec_cur = con->out_kvec;
622 con->out_more = 0;
623 set_bit(WRITE_PENDING, &con->state);
624}
625
626static void prepare_write_connect(struct ceph_messenger *msgr,
627 struct ceph_connection *con,
628 int after_banner)
629{
630 unsigned global_seq = get_global_seq(con->msgr, 0);
631 int proto;
632
633 switch (con->peer_name.type) {
634 case CEPH_ENTITY_TYPE_MON:
635 proto = CEPH_MONC_PROTOCOL;
636 break;
637 case CEPH_ENTITY_TYPE_OSD:
638 proto = CEPH_OSDC_PROTOCOL;
639 break;
640 case CEPH_ENTITY_TYPE_MDS:
641 proto = CEPH_MDSC_PROTOCOL;
642 break;
643 default:
644 BUG();
645 }
646
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto);
649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq);
654 con->out_connect.protocol_version = cpu_to_le32(proto);
655 con->out_connect.flags = 0;
656
657 if (!after_banner) {
658 con->out_kvec_left = 0;
659 con->out_kvec_bytes = 0;
660 }
661 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
662 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
663 con->out_kvec_left++;
664 con->out_kvec_bytes += sizeof(con->out_connect);
665 con->out_kvec_cur = con->out_kvec;
666 con->out_more = 0;
667 set_bit(WRITE_PENDING, &con->state);
668
669 prepare_connect_authorizer(con);
670}
671
672
673/*
674 * write as much of pending kvecs to the socket as we can.
675 * 1 -> done
676 * 0 -> socket full, but more to do
677 * <0 -> error
678 */
679static int write_partial_kvec(struct ceph_connection *con)
680{
681 int ret;
682
683 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
684 while (con->out_kvec_bytes > 0) {
685 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
686 con->out_kvec_left, con->out_kvec_bytes,
687 con->out_more);
688 if (ret <= 0)
689 goto out;
690 con->out_kvec_bytes -= ret;
691 if (con->out_kvec_bytes == 0)
692 break; /* done */
693 while (ret > 0) {
694 if (ret >= con->out_kvec_cur->iov_len) {
695 ret -= con->out_kvec_cur->iov_len;
696 con->out_kvec_cur++;
697 con->out_kvec_left--;
698 } else {
699 con->out_kvec_cur->iov_len -= ret;
700 con->out_kvec_cur->iov_base += ret;
701 ret = 0;
702 break;
703 }
704 }
705 }
706 con->out_kvec_left = 0;
707 con->out_kvec_is_msg = false;
708 ret = 1;
709out:
710 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
711 con->out_kvec_bytes, con->out_kvec_left, ret);
712 return ret; /* 1 = done, 0 = socket full, <0 = error */
713}
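/*
 * Worked example (annotation, not original code): with out_kvec holding
 * 1 + 53 + 200 bytes and a sendmsg() return of 60, the inner loop
 * consumes the first two kvecs (54 bytes) and advances the third by 6,
 * leaving out_kvec_cur pointing at a 194-byte remainder for the next
 * pass (or for the next worker invocation if the socket fills).
 */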
714
715/*
716 * Write as much message data payload as we can. If we finish, queue
717 * up the footer.
718 * 1 -> done, footer is now queued in out_kvec[].
719 * 0 -> socket full, but more to do
720 * <0 -> error
721 */
722static int write_partial_msg_pages(struct ceph_connection *con)
723{
724 struct ceph_msg *msg = con->out_msg;
725 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
726 size_t len;
727 int crc = con->msgr->nocrc;
728 int ret;
729
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos);
733
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
735 struct page *page = NULL;
736 void *kaddr = NULL;
737
738 /*
739 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the
741 * zero page.
742 */
743 if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page];
745 if (crc)
746 kaddr = kmap(page);
747 } else if (msg->pagelist) {
748 page = list_first_entry(&msg->pagelist->head,
749 struct page, lru);
750 if (crc)
751 kaddr = kmap(page);
752 } else {
753 page = con->msgr->zero_page;
754 if (crc)
755 kaddr = page_address(con->msgr->zero_page);
756 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
758 (int)(data_len - con->out_msg_pos.data_pos));
759 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
762
763 BUG_ON(kaddr == NULL);
764 con->out_msg->footer.data_crc =
765 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1;
767 }
768
769 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len,
771 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE);
773
774 if (crc && (msg->pages || msg->pagelist))
775 kunmap(page);
776
777 if (ret <= 0)
778 goto out;
779
780 con->out_msg_pos.data_pos += ret;
781 con->out_msg_pos.page_pos += ret;
782 if (ret == len) {
783 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist)
787 list_move_tail(&page->lru,
788 &msg->pagelist->head);
789 }
790 }
791
792 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
793
794 /* prepare and queue up footer, too */
795 if (!crc)
796 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
797 con->out_kvec_bytes = 0;
798 con->out_kvec_left = 0;
799 con->out_kvec_cur = con->out_kvec;
800 prepare_write_message_footer(con, 0);
801 ret = 1;
802out:
803 return ret;
804}
805
806/*
807 * write some zeros
808 */
809static int write_partial_skip(struct ceph_connection *con)
810{
811 int ret;
812
813 while (con->out_skip > 0) {
814 struct kvec iov = {
815 .iov_base = page_address(con->msgr->zero_page),
816 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
817 };
818
819 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
820 if (ret <= 0)
821 goto out;
822 con->out_skip -= ret;
823 }
824 ret = 1;
825out:
826 return ret;
827}
828
829/*
830 * Prepare to read connection handshake, or an ack.
831 */
832static void prepare_read_banner(struct ceph_connection *con)
833{
834 dout("prepare_read_banner %p\n", con);
835 con->in_base_pos = 0;
836}
837
838static void prepare_read_connect(struct ceph_connection *con)
839{
840 dout("prepare_read_connect %p\n", con);
841 con->in_base_pos = 0;
842}
843
844static void prepare_read_ack(struct ceph_connection *con)
845{
846 dout("prepare_read_ack %p\n", con);
847 con->in_base_pos = 0;
848}
849
850static void prepare_read_tag(struct ceph_connection *con)
851{
852 dout("prepare_read_tag %p\n", con);
853 con->in_base_pos = 0;
854 con->in_tag = CEPH_MSGR_TAG_READY;
855}
856
857/*
858 * Prepare to read a message.
859 */
860static int prepare_read_message(struct ceph_connection *con)
861{
862 dout("prepare_read_message %p\n", con);
863 BUG_ON(con->in_msg != NULL);
864 con->in_base_pos = 0;
865 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
866 return 0;
867}
868
869
870static int read_partial(struct ceph_connection *con,
871 int *to, int size, void *object)
872{
873 *to += size;
874 while (con->in_base_pos < *to) {
875 int left = *to - con->in_base_pos;
876 int have = size - left;
877 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
878 if (ret <= 0)
879 return ret;
880 con->in_base_pos += ret;
881 }
882 return 1;
883}
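/*
 * Example (annotation, not original code): read_partial() advances the
 * caller's running offset *to by @size on every call, so chained reads
 * of adjacent objects resume correctly after a short read.  With
 * in_base_pos == 5 and a 10-byte object starting at *to == 0:
 *
 *	*to becomes 10, left = 10 - 5 = 5, have = 5,
 *	and we recv into object + 5.
 *
 * read_partial_banner() below chains three such reads.
 */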
884
885
886/*
887 * Read all or part of the connect-side handshake on a new connection
888 */
889static int read_partial_banner(struct ceph_connection *con)
890{
891 int ret, to = 0;
892
893 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
894
895 /* peer's banner */
896 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
900 &con->actual_peer_addr);
901 if (ret <= 0)
902 goto out;
903 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
904 &con->peer_addr_for_me);
905 if (ret <= 0)
906 goto out;
907out:
908 return ret;
909}
910
911static int read_partial_connect(struct ceph_connection *con)
912{
913 int ret, to = 0;
914
915 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
916
917 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
918 if (ret <= 0)
919 goto out;
920 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
921 con->auth_reply_buf);
922 if (ret <= 0)
923 goto out;
924
925 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
926 con, (int)con->in_reply.tag,
927 le32_to_cpu(con->in_reply.connect_seq),
928 le32_to_cpu(con->in_reply.global_seq));
929out:
930 return ret;
931
932}
933
934/*
935 * Verify the hello banner looks okay.
936 */
937static int verify_hello(struct ceph_connection *con)
938{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner";
943 return -1;
944 }
945 return 0;
946}
947
948static bool addr_is_blank(struct sockaddr_storage *ss)
949{
950 switch (ss->ss_family) {
951 case AF_INET:
952 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
953 case AF_INET6:
954 return
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
957 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
958 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
959 }
960 return false;
961}
962
963static int addr_port(struct sockaddr_storage *ss)
964{
965 switch (ss->ss_family) {
966 case AF_INET:
967 return ntohs(((struct sockaddr_in *)ss)->sin_port);
968 case AF_INET6:
969 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
970 }
971 return 0;
972}
973
974static void addr_set_port(struct sockaddr_storage *ss, int p)
975{
976 switch (ss->ss_family) {
977 case AF_INET:
978 ((struct sockaddr_in *)ss)->sin_port = htons(p);
    break; /* don't fall through and also set sin6_port */
979 case AF_INET6:
980 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
981 }
982}
983
984/*
985 * Parse an ip[:port] list into an addr array. Use the default
986 * monitor port if a port isn't specified.
987 */
988int ceph_parse_ips(const char *c, const char *end,
989 struct ceph_entity_addr *addr,
990 int max_count, int *count)
991{
992 int i;
993 const char *p = c;
994
995 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
996 for (i = 0; i < max_count; i++) {
997 const char *ipend;
998 struct sockaddr_storage *ss = &addr[i].in_addr;
999 struct sockaddr_in *in4 = (void *)ss;
1000 struct sockaddr_in6 *in6 = (void *)ss;
1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1008
1009 memset(ss, 0, sizeof(*ss));
1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1011 delim, &ipend))
1012 ss->ss_family = AF_INET;
1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1014 delim, &ipend))
1015 ss->ss_family = AF_INET6;
1016 else
1017 goto bad;
1018 p = ipend;
1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1028 /* port? */
1029 if (p < end && *p == ':') {
1030 port = 0;
1031 p++;
1032 while (p < end && *p >= '0' && *p <= '9') {
1033 port = (port * 10) + (*p - '0');
1034 p++;
1035 }
1036 if (port > 65535 || port == 0)
1037 goto bad;
1038 } else {
1039 port = CEPH_MON_PORT;
1040 }
1041
1042 addr_set_port(ss, port);
1043
1044 dout("parse_ips got %s\n", pr_addr(ss));
1045
1046 if (p == end)
1047 break;
1048 if (*p != ',')
1049 goto bad;
1050 p++;
1051 }
1052
1053 if (p != end)
1054 goto bad;
1055
1056 if (count)
1057 *count = i + 1;
1058 return 0;
1059
1060bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL;
1063}
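/*
 * Usage sketch (illustrative, not part of the original file): parsing a
 * monitor list such as would come from a mount option.  Bracketed IPv6
 * addresses and explicit ports are accepted; entries without a port
 * default to CEPH_MON_PORT:
 *
 *	struct ceph_entity_addr mon_addr[3];
 *	int num_mon;
 *	const char *s = "192.168.0.1:6789,[::1],10.0.0.2";
 *
 *	if (ceph_parse_ips(s, s + strlen(s), mon_addr,
 *			   ARRAY_SIZE(mon_addr), &num_mon) == 0)
 *		dout("parsed %d monitor addrs\n", num_mon);
 */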
1064
1065static int process_banner(struct ceph_connection *con)
1066{
1067 dout("process_banner on %p\n", con);
1068
1069 if (verify_hello(con) < 0)
1070 return -1;
1071
1072 ceph_decode_addr(&con->actual_peer_addr);
1073 ceph_decode_addr(&con->peer_addr_for_me);
1074
1075 /*
1076 * Make sure the other end is who we wanted. note that the other
1077 * end may not yet know their ip address, so if it's 0.0.0.0, give
1078 * them the benefit of the doubt.
1079 */
1080 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr),
1086 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address";
1090 return -1;
1091 }
1092
1093 /*
1094 * did we learn our address?
1095 */
1096 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1097 int port = addr_port(&con->msgr->inst.addr.in_addr);
1098
1099 memcpy(&con->msgr->inst.addr.in_addr,
1100 &con->peer_addr_for_me.in_addr,
1101 sizeof(con->peer_addr_for_me.in_addr));
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr));
1106 }
1107
1108 set_bit(NEGOTIATING, &con->state);
1109 prepare_read_connect(con);
1110 return 0;
1111}
1112
1113static void fail_protocol(struct ceph_connection *con)
1114{
1115 reset_connection(con);
1116 set_bit(CLOSED, &con->state); /* in case there's queued work */
1117
1118 mutex_unlock(&con->mutex);
1119 if (con->ops->bad_proto)
1120 con->ops->bad_proto(con);
1121 mutex_lock(&con->mutex);
1122}
1123
1124static int process_connect(struct ceph_connection *con)
1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1131
1132 switch (con->in_reply.tag) {
1133 case CEPH_MSGR_TAG_FEATURES:
1134 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features";
1140 fail_protocol(con);
1141 return -1;
1142
1143 case CEPH_MSGR_TAG_BADPROTOVER:
1144 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch";
1151 fail_protocol(con);
1152 return -1;
1153
1154 case CEPH_MSGR_TAG_BADAUTHORIZER:
1155 con->auth_retry++;
1156 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1157 con->auth_retry);
1158 if (con->auth_retry == 2) {
1159 con->error_msg = "connect authorization failure";
1160 reset_connection(con);
1161 set_bit(CLOSED, &con->state);
1162 return -1;
1163 }
1164 con->auth_retry = 1;
1165 prepare_write_connect(con->msgr, con, 0);
1166 prepare_read_connect(con);
1167 break;
1168
1169 case CEPH_MSGR_TAG_RESETSESSION:
1170 /*
1171 * If we connected with a large connect_seq but the peer
1172 * has no record of a session with us (no connection, or
1173 * connect_seq == 0), they will send RESETSESSION to indicate
1174 * that they must have reset their session, and may have
1175 * dropped messages.
1176 */
1177 dout("process_connect got RESET peer seq %u\n",
1178 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con);
1185
1186 /* Tell ceph about it. */
1187 mutex_unlock(&con->mutex);
1188 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1189 if (con->ops->peer_reset)
1190 con->ops->peer_reset(con);
1191 mutex_lock(&con->mutex);
1192 break;
1193
1194 case CEPH_MSGR_TAG_RETRY_SESSION:
1195 /*
1196 * If we sent a smaller connect_seq than the peer has, try
1197 * again with a larger value.
1198 */
1199 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1200 le32_to_cpu(con->out_connect.connect_seq),
1201 le32_to_cpu(con->in_connect.connect_seq));
1202 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1203 prepare_write_connect(con->msgr, con, 0);
1204 prepare_read_connect(con);
1205 break;
1206
1207 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1208 /*
1209 * If we sent a smaller global_seq than the peer has, try
1210 * again with a larger value.
1211 */
1212 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1213 con->peer_global_seq,
1214 le32_to_cpu(con->in_connect.global_seq));
1215 get_global_seq(con->msgr,
1216 le32_to_cpu(con->in_connect.global_seq));
1217 prepare_write_connect(con->msgr, con, 0);
1218 prepare_read_connect(con);
1219 break;
1220
1221 case CEPH_MSGR_TAG_READY:
1222 if (req_feat & ~server_feat) {
1223 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features";
1229 fail_protocol(con);
1230 return -1;
1231 }
1232 clear_bit(CONNECTING, &con->state);
1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257
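		/* fall through */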
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
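/*
 * Worked example (annotation, not original code): with out_sent holding
 * messages seq 4, 5, 6 and an ack of 5, the loop above drops 4 and 5
 * and stops at 6; only messages with seq <= ack are known to have been
 * received by the peer and can be discarded.
 */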
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section,
1306 unsigned int sec_len, u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr"
1362 " crc %u != expected %u\n",
1363 crc, le32_to_cpu(con->in_hdr.crc));
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1404 if (skip) {
1405 /* skip this message */
1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1408 con->in_base_pos = -front_len - middle_len - data_len -
1409 sizeof(m->footer);
1410 con->in_tag = CEPH_MSGR_TAG_READY;
1411 con->in_seq++;
1412 return 0;
1413 }
1414 if (!con->in_msg) {
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return -ENOMEM;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1439 &con->in_middle_crc);
1440 if (ret <= 0)
1441 return ret;
1442 }
1443
1444 /* (page) data */
1445 while (con->in_msg_pos.data_pos < data_len) {
1446 left = min((int)(data_len - con->in_msg_pos.data_pos),
1447 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1448 BUG_ON(m->pages == NULL);
1449 p = kmap(m->pages[con->in_msg_pos.page]);
1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1451 left);
1452 if (ret > 0 && datacrc)
1453 con->in_data_crc =
1454 crc32c(con->in_data_crc,
1455 p + con->in_msg_pos.page_pos, ret);
1456 kunmap(m->pages[con->in_msg_pos.page]);
1457 if (ret <= 0)
1458 return ret;
1459 con->in_msg_pos.data_pos += ret;
1460 con->in_msg_pos.page_pos += ret;
1461 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1462 con->in_msg_pos.page_pos = 0;
1463 con->in_msg_pos.page++;
1464 }
1465 }
1466
1467 /* footer */
1468 to = sizeof(m->hdr) + sizeof(m->footer);
1469 while (con->in_base_pos < to) {
1470 left = to - con->in_base_pos;
1471 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1472 (con->in_base_pos - sizeof(m->hdr)),
1473 left);
1474 if (ret <= 0)
1475 return ret;
1476 con->in_base_pos += ret;
1477 }
1478 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1479 m, front_len, m->footer.front_crc, middle_len,
1480 m->footer.middle_crc, data_len, m->footer.data_crc);
1481
1482 /* crc ok? */
1483 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1484 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1485 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1486 return -EBADMSG;
1487 }
1488 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1489 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1490 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1491 return -EBADMSG;
1492 }
1493 if (datacrc &&
1494 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1495 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1496 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1497 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1498 return -EBADMSG;
1499 }
1500
1501 return 1; /* done! */
1502}
1503
1504/*
1505 * Process message. This happens in the worker thread. The callback should
1506 * be careful not to do anything that waits on other incoming messages or it
1507 * may deadlock.
1508 */
1509static void process_message(struct ceph_connection *con)
1510{
1511 struct ceph_msg *msg;
1512
1513 msg = con->in_msg;
1514 con->in_msg = NULL;
1515
1516 /* if first message, set peer_name */
1517 if (con->peer_name.type == 0)
1518 con->peer_name = msg->hdr.src;
1519
1520 con->in_seq++;
1521 mutex_unlock(&con->mutex);
1522
1523 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1524 msg, le64_to_cpu(msg->hdr.seq),
1525 ENTITY_NAME(msg->hdr.src),
1526 le16_to_cpu(msg->hdr.type),
1527 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1528 le32_to_cpu(msg->hdr.front_len),
1529 le32_to_cpu(msg->hdr.data_len),
1530 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1531 con->ops->dispatch(con, msg);
1532
1533 mutex_lock(&con->mutex);
1534 prepare_read_tag(con);
1535}
1536
1537
1538/*
1539 * Write something to the socket. Called in a worker thread when the
1540 * socket appears to be writeable and we have something ready to send.
1541 */
1542static int try_write(struct ceph_connection *con)
1543{
1544 struct ceph_messenger *msgr = con->msgr;
1545 int ret = 1;
1546
1547 dout("try_write start %p state %lu nref %d\n", con, con->state,
1548 atomic_read(&con->nref));
1549
1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552
1553 /* open the socket first? */
1554 if (con->sock == NULL) {
1555 /*
1556 * if we were STANDBY and are reconnecting _this_
1557 * connection, bump connect_seq now. Always bump
1558 * global_seq.
1559 */
1560 if (test_and_clear_bit(STANDBY, &con->state))
1561 con->connect_seq++;
1562
1563 prepare_write_banner(msgr, con);
1564 prepare_write_connect(msgr, con, 1);
1565 prepare_read_banner(con);
1566 set_bit(CONNECTING, &con->state);
1567 clear_bit(NEGOTIATING, &con->state);
1568
1569 BUG_ON(con->in_msg);
1570 con->in_tag = CEPH_MSGR_TAG_READY;
1571 dout("try_write initiating connect on %p new state %lu\n",
1572 con, con->state);
1573 con->sock = ceph_tcp_connect(con);
1574 if (IS_ERR(con->sock)) {
1575 con->sock = NULL;
1576 con->error_msg = "connect error";
1577 ret = -1;
1578 goto out;
1579 }
1580 }
1581
1582more_kvec:
1583 /* kvec data queued? */
1584 if (con->out_skip) {
1585 ret = write_partial_skip(con);
1586 if (ret < 0)
1587 dout("try_write write_partial_skip err %d\n", ret);
1588 if (ret <= 0)
1589 goto done;
1590 }
1593 if (con->out_kvec_left) {
1594 ret = write_partial_kvec(con);
1595 if (ret <= 0)
1596 goto done;
1597 }
1598
1599 /* msg pages? */
1600 if (con->out_msg) {
1601 if (con->out_msg_done) {
1602 ceph_msg_put(con->out_msg);
1603 con->out_msg = NULL; /* we're done with this one */
1604 goto do_next;
1605 }
1606
1607 ret = write_partial_msg_pages(con);
1608 if (ret == 1)
1609 goto more_kvec; /* we need to send the footer, too! */
1610 if (ret == 0)
1611 goto done;
1612 if (ret < 0) {
1613 dout("try_write write_partial_msg_pages err %d\n",
1614 ret);
1615 goto done;
1616 }
1617 }
1618
1619do_next:
1620 if (!test_bit(CONNECTING, &con->state)) {
1621 /* is anything else pending? */
1622 if (!list_empty(&con->out_queue)) {
1623 prepare_write_message(con);
1624 goto more;
1625 }
1626 if (con->in_seq > con->in_seq_acked) {
1627 prepare_write_ack(con);
1628 goto more;
1629 }
1630 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1631 prepare_write_keepalive(con);
1632 goto more;
1633 }
1634 }
1635
1636 /* Nothing to do! */
1637 clear_bit(WRITE_PENDING, &con->state);
1638 dout("try_write nothing else to write.\n");
1639done:
1640 ret = 0;
1641out:
1642 dout("try_write done on %p\n", con);
1643 return ret;
1644}
1645
1646
1647
1648/*
1649 * Read what we can from the socket.
1650 */
1651static int try_read(struct ceph_connection *con)
1652{
1653 int ret = -1;
1654
1655 if (!con->sock)
1656 return 0;
1657
1658 if (test_bit(STANDBY, &con->state))
1659 return 0;
1660
1661 dout("try_read start on %p\n", con);
1662
1663more:
1664 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1665 con->in_base_pos);
1666 if (test_bit(CONNECTING, &con->state)) {
1667 if (!test_bit(NEGOTIATING, &con->state)) {
1668 dout("try_read connecting\n");
1669 ret = read_partial_banner(con);
1670 if (ret <= 0)
1671 goto done;
1672 if (process_banner(con) < 0) {
1673 ret = -1;
1674 goto out;
1675 }
1676 }
1677 ret = read_partial_connect(con);
1678 if (ret <= 0)
1679 goto done;
1680 if (process_connect(con) < 0) {
1681 ret = -1;
1682 goto out;
1683 }
1684 goto more;
1685 }
1686
1687 if (con->in_base_pos < 0) {
1688 /*
1689 * skipping + discarding content.
1690 *
1691 * FIXME: there must be a better way to do this!
1692 */
1693 static char buf[1024];
1694 int skip = min(1024, -con->in_base_pos);
1695 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1696 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1697 if (ret <= 0)
1698 goto done;
1699 con->in_base_pos += ret;
1700 if (con->in_base_pos)
1701 goto more;
1702 }
1703 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1704 /*
1705 * what's next?
1706 */
1707 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1708 if (ret <= 0)
1709 goto done;
1710 dout("try_read got tag %d\n", (int)con->in_tag);
1711 switch (con->in_tag) {
1712 case CEPH_MSGR_TAG_MSG:
1713 prepare_read_message(con);
1714 break;
1715 case CEPH_MSGR_TAG_ACK:
1716 prepare_read_ack(con);
1717 break;
1718 case CEPH_MSGR_TAG_CLOSE:
1719 set_bit(CLOSED, &con->state); /* fixme */
1720 goto done;
1721 default:
1722 goto bad_tag;
1723 }
1724 }
1725 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1726 ret = read_partial_message(con);
1727 if (ret <= 0) {
1728 switch (ret) {
1729 case -EBADMSG:
1730 con->error_msg = "bad crc";
1731 ret = -EIO;
1732 goto out;
1733 case -EIO:
1734 con->error_msg = "io error";
1735 goto out;
1736 default:
1737 goto done;
1738 }
1739 }
1740 if (con->in_tag == CEPH_MSGR_TAG_READY)
1741 goto more;
1742 process_message(con);
1743 goto more;
1744 }
1745 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1746 ret = read_partial_ack(con);
1747 if (ret <= 0)
1748 goto done;
1749 process_ack(con);
1750 goto more;
1751 }
1752
1753done:
1754 ret = 0;
1755out:
1756 dout("try_read done on %p\n", con);
1757 return ret;
1758
1759bad_tag:
1760 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1761 con->error_msg = "protocol error, garbage tag";
1762 ret = -1;
1763 goto out;
1764}
1765
1766
1767/*
1768 * Atomically queue work on a connection. Bump @con reference to
1769 * avoid races with connection teardown.
1770 *
1771 * There is some trickery going on with QUEUED and BUSY because we
1772 * only want a _single_ thread operating on each connection at any
1773 * point in time, but we want to use all available CPUs.
1774 *
1775 * The worker thread only proceeds if it can atomically set BUSY. It
1776 * clears QUEUED and does its thing. When it thinks it's done, it
1777 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1778 * (tries again to set BUSY).
1779 *
1780 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1781 * try to queue work. If that fails (the work is already queued, or
1782 * BUSY is set, meaning it is already being done), we give up but leave QUEUED
1783 * set so that the worker thread will loop if necessary.
1784 */
1785static void queue_con(struct ceph_connection *con)
1786{
1787 if (test_bit(DEAD, &con->state)) {
1788 dout("queue_con %p ignoring: DEAD\n",
1789 con);
1790 return;
1791 }
1792
1793 if (!con->ops->get(con)) {
1794 dout("queue_con %p ref count 0\n", con);
1795 return;
1796 }
1797
1798 set_bit(QUEUED, &con->state);
1799 if (test_bit(BUSY, &con->state)) {
1800 dout("queue_con %p - already BUSY\n", con);
1801 con->ops->put(con);
1802 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1803 dout("queue_con %p - already queued\n", con);
1804 con->ops->put(con);
1805 } else {
1806 dout("queue_con %p\n", con);
1807 }
1808}
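/*
 * Illustrative trace (annotation, not original code) of the QUEUED/BUSY
 * handshake described above, with two CPUs racing:
 *
 *	queue_con()                     con_work()
 *	set_bit(QUEUED)
 *	queue_work() succeeds
 *	                                test_and_set_bit(BUSY) == 0
 *	                                clear_bit(QUEUED), does work
 *	set_bit(QUEUED)
 *	test_bit(BUSY) -> give up
 *	                                clear_bit(BUSY)
 *	                                QUEUED still set -> loops
 *
 * The second wakeup is never lost: either the worker loops on QUEUED,
 * or a later queue_work() succeeds once the work item is idle.
 */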
1809
1810/*
1811 * Do some work on a connection. Drop a connection ref when we're done.
1812 */
1813static void con_work(struct work_struct *work)
1814{
1815 struct ceph_connection *con = container_of(work, struct ceph_connection,
1816 work.work);
1817 int backoff = 0;
1818
1819more:
1820 if (test_and_set_bit(BUSY, &con->state) != 0) {
1821 dout("con_work %p BUSY already set\n", con);
1822 goto out;
1823 }
1824 dout("con_work %p start, clearing QUEUED\n", con);
1825 clear_bit(QUEUED, &con->state);
1826
1827 mutex_lock(&con->mutex);
1828
1829 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1830 dout("con_work CLOSED\n");
1831 con_close_socket(con);
1832 goto done;
1833 }
1834 if (test_and_clear_bit(OPENING, &con->state)) {
1835 /* reopen w/ new peer */
1836 dout("con_work OPENING\n");
1837 con_close_socket(con);
1838 }
1839
1840 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1841 try_read(con) < 0 ||
1842 try_write(con) < 0) {
1843 mutex_unlock(&con->mutex);
1844 backoff = 1;
1845 ceph_fault(con); /* error/fault path */
1846 goto done_unlocked;
1847 }
1848
1849done:
1850 mutex_unlock(&con->mutex);
1851
1852done_unlocked:
1853 clear_bit(BUSY, &con->state);
1854 dout("con->state=%lu\n", con->state);
1855 if (test_bit(QUEUED, &con->state)) {
1856 if (!backoff || test_bit(OPENING, &con->state)) {
1857 dout("con_work %p QUEUED reset, looping\n", con);
1858 goto more;
1859 }
1860 dout("con_work %p QUEUED reset, but just faulted\n", con);
1861 clear_bit(QUEUED, &con->state);
1862 }
1863 dout("con_work %p done\n", con);
1864
1865out:
1866 con->ops->put(con);
1867}
1868
1869
1870/*
1871 * Generic error/fault handler. A retry mechanism is used with
1872 * exponential backoff
1873 */
1874static void ceph_fault(struct ceph_connection *con)
1875{
1876 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1877 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1878 dout("fault %p state %lu to peer %s\n",
1879 con, con->state, pr_addr(&con->peer_addr.in_addr));
1880
1881 if (test_bit(LOSSYTX, &con->state)) {
1882 dout("fault on LOSSYTX channel\n");
1883 goto out;
1884 }
1885
1886 mutex_lock(&con->mutex);
1887 if (test_bit(CLOSED, &con->state))
1888 goto out_unlock;
1889
1890 con_close_socket(con);
1891
1892 if (con->in_msg) {
1893 ceph_msg_put(con->in_msg);
1894 con->in_msg = NULL;
1895 }
1896
1897 /* Requeue anything that hasn't been acked */
1898 list_splice_init(&con->out_sent, &con->out_queue);
1899
1900 /* If there are no messages in the queue, place the connection
1901 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1902 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1903 dout("fault setting STANDBY\n");
1904 set_bit(STANDBY, &con->state);
1905 } else {
1906 /* retry after a delay. */
1907 if (con->delay == 0)
1908 con->delay = BASE_DELAY_INTERVAL;
1909 else if (con->delay < MAX_DELAY_INTERVAL)
1910 con->delay *= 2;
1911 dout("fault queueing %p delay %lu\n", con, con->delay);
1912 con->ops->get(con);
1913 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1914 round_jiffies_relative(con->delay)) == 0)
1915 con->ops->put(con);
1916 }
1917
1918out_unlock:
1919 mutex_unlock(&con->mutex);
1920out:
1921 /*
1922 * in case we faulted due to authentication, invalidate our
1923 * current tickets so that we can get new ones.
1924 */
1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1926 dout("calling invalidate_authorizer()\n");
1927 con->ops->invalidate_authorizer(con);
1928 }
1929
1930 if (con->ops->fault)
1931 con->ops->fault(con);
1932}
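/*
 * Note (annotation, not original code): consecutive faults double
 * con->delay from BASE_DELAY_INTERVAL up to MAX_DELAY_INTERVAL, giving
 * the exponential backoff mentioned above; ceph_con_open() resets
 * con->delay to 0, so a fresh open starts the sequence over.
 */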
1933
1934
1935
1936/*
1937 * create a new messenger instance
1938 */
1939struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1940{
1941 struct ceph_messenger *msgr;
1942
1943 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1944 if (msgr == NULL)
1945 return ERR_PTR(-ENOMEM);
1946
1947 spin_lock_init(&msgr->global_seq_lock);
1948
1949 /* the zero page is needed if a request is "canceled" while the message
1950 * is being written over the socket */
1951 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1952 if (!msgr->zero_page) {
1953 kfree(msgr);
1954 return ERR_PTR(-ENOMEM);
1955 }
1956 kmap(msgr->zero_page);
1957
1958 if (myaddr)
1959 msgr->inst.addr = *myaddr;
1960
1961 /* select a random nonce */
1962 msgr->inst.addr.type = 0;
1963 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1964 encode_my_addr(msgr);
1965
1966 dout("messenger_create %p\n", msgr);
1967 return msgr;
1968}
1969
1970void ceph_messenger_destroy(struct ceph_messenger *msgr)
1971{
1972 dout("destroy %p\n", msgr);
1973 kunmap(msgr->zero_page);
1974 __free_page(msgr->zero_page);
1975 kfree(msgr);
1976 dout("destroyed messenger %p\n", msgr);
1977}
1978
1979/*
1980 * Queue up an outgoing message on the given connection.
1981 */
1982void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1983{
1984 if (test_bit(CLOSED, &con->state)) {
1985 dout("con_send %p closed, dropping %p\n", con, msg);
1986 ceph_msg_put(msg);
1987 return;
1988 }
1989
1990 /* set src+dst */
1991 msg->hdr.src = con->msgr->inst.name;
1992
1993 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1994
1995 msg->needs_out_seq = true;
1996
1997 /* queue */
1998 mutex_lock(&con->mutex);
1999 BUG_ON(!list_empty(&msg->list_head));
2000 list_add_tail(&msg->list_head, &con->out_queue);
2001 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2002 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2003 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2004 le32_to_cpu(msg->hdr.front_len),
2005 le32_to_cpu(msg->hdr.middle_len),
2006 le32_to_cpu(msg->hdr.data_len));
2007 mutex_unlock(&con->mutex);
2008
2009 /* if there wasn't anything waiting to send before, queue
2010 * new work */
2011 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2012 queue_con(con);
2013}
2014
2015/*
2016 * Revoke a message that was previously queued for send
2017 */
2018void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2019{
2020 mutex_lock(&con->mutex);
2021 if (!list_empty(&msg->list_head)) {
2022 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2023 list_del_init(&msg->list_head);
2024 ceph_msg_put(msg);
2025 msg->hdr.seq = 0;
2026 }
2027 if (con->out_msg == msg) {
2028 dout("con_revoke %p msg %p - was sending\n", con, msg);
2029 con->out_msg = NULL;
2030 if (con->out_kvec_is_msg) {
2031 con->out_skip = con->out_kvec_bytes;
2032 con->out_kvec_is_msg = false;
2033 }
2034 ceph_msg_put(msg);
2035 msg->hdr.seq = 0;
2036 }
2037 mutex_unlock(&con->mutex);
2038}
2039
2040/*
2041 * Revoke a message that we may be reading data into
2042 */
2043void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2044{
2045 mutex_lock(&con->mutex);
2046 if (con->in_msg && con->in_msg == msg) {
2047 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2048 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2049 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2050
2051 /* skip rest of message */
2052 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2053 con->in_base_pos = con->in_base_pos -
2054 sizeof(struct ceph_msg_header) -
2055 front_len -
2056 middle_len -
2057 data_len -
2058 sizeof(struct ceph_msg_footer);
2059 ceph_msg_put(con->in_msg);
2060 con->in_msg = NULL;
2061 con->in_tag = CEPH_MSGR_TAG_READY;
2062 con->in_seq++;
2063 } else {
2064 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2065 con, con->in_msg, msg);
2066 }
2067 mutex_unlock(&con->mutex);
2068}
2069
2070/*
2071 * Queue a keepalive byte to ensure the tcp connection is alive.
2072 */
2073void ceph_con_keepalive(struct ceph_connection *con)
2074{
2075 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2076 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2077 queue_con(con);
2078}
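/*
 * Lifecycle sketch (illustrative, not part of the original file): a
 * typical caller embeds a ceph_connection and drives it with:
 *
 *	ceph_con_init(msgr, con);	refcount 1, queues initialized
 *	ceph_con_open(con, &peer_addr);	async connect via the workqueue
 *	ceph_con_send(con, msg);	queue a message (consumes msg ref)
 *	ceph_con_keepalive(con);	schedule a keepalive byte
 *	ceph_con_close(con);		drop the session
 *
 * All socket I/O happens later in con_work(); these entry points only
 * flip state bits and queue work.
 */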
2079
2080
2081/*
2082 * construct a new message with given type, size
2083 * the new msg has a ref count of 1.
2084 */
2085struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2086{
2087 struct ceph_msg *m;
2088
2089 m = kmalloc(sizeof(*m), flags);
2090 if (m == NULL)
2091 goto out;
2092 kref_init(&m->kref);
2093 INIT_LIST_HEAD(&m->list_head);
2094
2095 m->hdr.tid = 0;
2096 m->hdr.type = cpu_to_le16(type);
2097 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2098 m->hdr.version = 0;
2099 m->hdr.front_len = cpu_to_le32(front_len);
2100 m->hdr.middle_len = 0;
2101 m->hdr.data_len = 0;
2102 m->hdr.data_off = 0;
2103 m->hdr.reserved = 0;
2104 m->footer.front_crc = 0;
2105 m->footer.middle_crc = 0;
2106 m->footer.data_crc = 0;
2107 m->footer.flags = 0;
2108 m->front_max = front_len;
2109 m->front_is_vmalloc = false;
2110 m->more_to_follow = false;
2111 m->pool = NULL;
2112
2113 /* front */
2114 if (front_len) {
2115 if (front_len > PAGE_CACHE_SIZE) {
2116 m->front.iov_base = __vmalloc(front_len, flags,
2117 PAGE_KERNEL);
2118 m->front_is_vmalloc = true;
2119 } else {
2120 m->front.iov_base = kmalloc(front_len, flags);
2121 }
2122 if (m->front.iov_base == NULL) {
2123 pr_err("msg_new can't allocate %d bytes\n",
2124 front_len);
2125 goto out2;
2126 }
2127 } else {
2128 m->front.iov_base = NULL;
2129 }
2130 m->front.iov_len = front_len;
2131
2132 /* middle */
2133 m->middle = NULL;
2134
2135 /* data */
2136 m->nr_pages = 0;
2137 m->pages = NULL;
2138 m->pagelist = NULL;
2139
2140 dout("ceph_msg_new %p front %d\n", m, front_len);
2141 return m;
2142
2143out2:
2144 ceph_msg_put(m);
2145out:
2146 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2147 return NULL;
2148}
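
/*
 * Usage sketch (illustrative, not part of the original file): a
 * typical sender pairs ceph_msg_new() with ceph_con_send() above;
 * the connection consumes the caller's reference once the message is
 * queued.  The message type and payload here are placeholders.
 */
static int __maybe_unused example_send(struct ceph_connection *con,
				       int type, void *payload, int len)
{
	struct ceph_msg *msg = ceph_msg_new(type, len, GFP_NOFS);

	if (!msg)
		return -ENOMEM;
	memcpy(msg->front.iov_base, payload, len);
	ceph_con_send(con, msg);	/* con now owns our ref */
	return 0;
}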
2149
2150/*
2151 * Allocate "middle" portion of a message, if it is needed and wasn't
2152 * allocated by alloc_msg. This allows us to read a small fixed-size
2153 * per-type header in the front and then gracefully fail (i.e.,
2154 * propagate the error to the caller based on info in the front) when
2155 * the middle is too large.
2156 */
2157static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2158{
2159 int type = le16_to_cpu(msg->hdr.type);
2160 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2161
2162 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2163 ceph_msg_type_name(type), middle_len);
2164 BUG_ON(!middle_len);
2165 BUG_ON(msg->middle);
2166
2167 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2168 if (!msg->middle)
2169 return -ENOMEM;
2170 return 0;
2171}
2172
2173/*
2174 * Generic message allocator, for incoming messages.
2175 */
2176static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2177 struct ceph_msg_header *hdr,
2178 int *skip)
2179{
2180 int type = le16_to_cpu(hdr->type);
2181 int front_len = le32_to_cpu(hdr->front_len);
2182 int middle_len = le32_to_cpu(hdr->middle_len);
2183 struct ceph_msg *msg = NULL;
2184 int ret;
2185
2186 if (con->ops->alloc_msg) {
2187 mutex_unlock(&con->mutex);
2188 msg = con->ops->alloc_msg(con, hdr, skip);
2189 mutex_lock(&con->mutex);
2190 if (!msg || *skip)
2191 return NULL;
2192 }
2193 if (!msg) {
2194 *skip = 0;
2195 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2196 if (!msg) {
2197 pr_err("unable to allocate msg type %d len %d\n",
2198 type, front_len);
2199 return NULL;
2200 }
2201 }
2202 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2203
2204 if (middle_len && !msg->middle) {
2205 ret = ceph_alloc_middle(con, msg);
2206 if (ret < 0) {
2207 ceph_msg_put(msg);
2208 return NULL;
2209 }
2210 }
2211
2212 return msg;
2213}
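
/*
 * Illustrative note (not from the original file): an alloc_msg
 * callback may return NULL and set *skip to tell the messenger to
 * read and discard the incoming message instead of delivering it;
 * get_generic_reply() in mon_client.c does exactly that for replies
 * whose tid is no longer registered.
 */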
2214
2215
2216/*
2217 * Free a generically kmalloc'd message.
2218 */
2219void ceph_msg_kfree(struct ceph_msg *m)
2220{
2221 dout("msg_kfree %p\n", m);
2222 if (m->front_is_vmalloc)
2223 vfree(m->front.iov_base);
2224 else
2225 kfree(m->front.iov_base);
2226 kfree(m);
2227}
2228
2229/*
2230 * Drop a msg ref. Destroy as needed.
2231 */
2232void ceph_msg_last_put(struct kref *kref)
2233{
2234 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2235
2236 dout("ceph_msg_put last one on %p\n", m);
2237 WARN_ON(!list_empty(&m->list_head));
2238
2239 /* drop middle, data, if any */
2240 if (m->middle) {
2241 ceph_buffer_put(m->middle);
2242 m->middle = NULL;
2243 }
2244 m->nr_pages = 0;
2245 m->pages = NULL;
2246
2247 if (m->pagelist) {
2248 ceph_pagelist_release(m->pagelist);
2249 kfree(m->pagelist);
2250 m->pagelist = NULL;
2251 }
2252
2253 if (m->pool)
2254 ceph_msgpool_put(m->pool, m);
2255 else
2256 ceph_msg_kfree(m);
2257}
2258
2259void ceph_msg_dump(struct ceph_msg *msg)
2260{
2261 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2262 msg->front_max, msg->nr_pages);
2263 print_hex_dump(KERN_DEBUG, "header: ",
2264 DUMP_PREFIX_OFFSET, 16, 1,
2265 &msg->hdr, sizeof(msg->hdr), true);
2266 print_hex_dump(KERN_DEBUG, " front: ",
2267 DUMP_PREFIX_OFFSET, 16, 1,
2268 msg->front.iov_base, msg->front.iov_len, true);
2269 if (msg->middle)
2270 print_hex_dump(KERN_DEBUG, "middle: ",
2271 DUMP_PREFIX_OFFSET, 16, 1,
2272 msg->middle->vec.iov_base,
2273 msg->middle->vec.iov_len, true);
2274 print_hex_dump(KERN_DEBUG, "footer: ",
2275 DUMP_PREFIX_OFFSET, 16, 1,
2276 &msg->footer, sizeof(msg->footer), true);
2277}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc13..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52/* use format string %s%lld */
53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
54
55struct ceph_messenger {
56 struct ceph_entity_inst inst; /* my name+address */
57 struct ceph_entity_addr my_enc_addr;
58 struct page *zero_page; /* used in certain error cases */
59
60 bool nocrc;
61
62 /*
63 * the global_seq counts connections I (attempt to) initiate
64 * in order to disambiguate certain connect race conditions.
65 */
66 u32 global_seq;
67 spinlock_t global_seq_lock;
68};
69
70/*
71 * a single message. it contains a header (src, dest, message type, etc.),
72 * footer (crc values, mainly), a "front" message body, and possibly a
73 * data payload (stored in some number of pages).
74 */
75struct ceph_msg {
76 struct ceph_msg_header hdr; /* header */
77 struct ceph_msg_footer footer; /* footer */
78 struct kvec front; /* unaligned blobs of message */
79 struct ceph_buffer *middle;
80 struct page **pages; /* data payload. NOT OWNER. */
81 unsigned nr_pages; /* size of page array */
82 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head;
84 struct kref kref;
85 bool front_is_vmalloc;
86 bool more_to_follow;
87 bool needs_out_seq;
88 int front_max;
89
90 struct ceph_msgpool *pool;
91};
92
93struct ceph_msg_pos {
94 int page, page_pos; /* which page; offset in page */
95 int data_pos; /* offset in data payload */
96 int did_page_crc; /* true if we've calculated crc for current page */
97};
98
99/* ceph connection fault delay defaults, for exponential backoff */
100#define BASE_DELAY_INTERVAL (HZ/2)
101#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
102
103/*
104 * ceph_connection state bit flags
105 *
106 * QUEUED and BUSY are used together to ensure that only a single
107 * thread is currently opening, reading or writing data to the socket.
108 */
109#define LOSSYTX 0 /* we can close channel or drop messages on errors */
110#define CONNECTING 1
111#define NEGOTIATING 2
112#define KEEPALIVE_PENDING 3
113#define WRITE_PENDING 4 /* we have data ready to send */
114#define QUEUED 5 /* there is work queued on this connection */
115#define BUSY 6 /* work is being done */
116#define STANDBY 8 /* no outgoing messages, socket closed. we keep
117 * the ceph_connection around to maintain shared
118 * state with the peer. */
119#define CLOSED 10 /* we've closed the connection */
120#define SOCK_CLOSED 11 /* socket state changed to closed */
121#define OPENING 13 /* open connection w/ (possibly new) peer */
122#define DEAD 14 /* dead, about to kfree */
123
124/*
125 * A single connection with another host.
126 *
127 * We maintain a queue of outgoing messages, and some session state to
128 * ensure that we can preserve the lossless, ordered delivery of
129 * messages in the case of a TCP disconnect.
130 */
131struct ceph_connection {
132 void *private;
133 atomic_t nref;
134
135 const struct ceph_connection_operations *ops;
136
137 struct ceph_messenger *msgr;
138 struct socket *sock;
139 unsigned long state; /* connection state (see flags above) */
140 const char *error_msg; /* error message, if any */
141
142 struct ceph_entity_addr peer_addr; /* peer address */
143 struct ceph_entity_name peer_name; /* peer name */
144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 bool out_keepalive_pending;
161
162 u64 in_seq, in_seq_acked; /* last message received, acked */
163
164 /* connection negotiation temps */
165 char in_banner[CEPH_BANNER_MAX_LEN];
166 union {
167 struct { /* outgoing connection */
168 struct ceph_msg_connect out_connect;
169 struct ceph_msg_connect_reply in_reply;
170 };
171 struct { /* incoming */
172 struct ceph_msg_connect in_connect;
173 struct ceph_msg_connect_reply out_reply;
174 };
175 };
176 struct ceph_entity_addr actual_peer_addr;
177
178 /* message out temps */
179 struct ceph_msg *out_msg; /* sending message (== tail of
180 out_sent) */
181 bool out_msg_done;
182 struct ceph_msg_pos out_msg_pos;
183
184 struct kvec out_kvec[8], /* sending header/footer data */
185 *out_kvec_cur;
186 int out_kvec_left; /* kvec's left in out_kvec */
187 int out_skip; /* skip this many bytes */
188 int out_kvec_bytes; /* total bytes left */
189 bool out_kvec_is_msg; /* kvec refers to out_msg */
190 int out_more; /* there is more data after the kvecs */
191 __le64 out_temp_ack; /* for writing an ack */
192
193 /* message in temps */
194 struct ceph_msg_header in_hdr;
195 struct ceph_msg *in_msg;
196 struct ceph_msg_pos in_msg_pos;
197 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
198
199 char in_tag; /* protocol control byte */
200 int in_base_pos; /* bytes read */
201 __le64 in_temp_ack; /* for reading an ack */
202
203 struct delayed_work work; /* send|recv work */
204 unsigned long delay; /* current delay interval */
205};
206
207
208extern const char *pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr,
211 int max_count, int *count);
212
213
214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
237extern void ceph_msg_kfree(struct ceph_msg *m);
238
239
240static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
241{
242 kref_get(&msg->kref);
243 return msg;
244}
245extern void ceph_msg_last_put(struct kref *kref);
246static inline void ceph_msg_put(struct ceph_msg *msg)
247{
248 kref_put(&msg->kref, ceph_msg_last_put);
249}
250
251extern void ceph_msg_dump(struct ceph_msg *msg);
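
/*
 * Illustrative sketch (not part of the original header): a minimal
 * ceph_connection_operations table needs only get/put for refcounting
 * plus a dispatch handler that consumes the message reference;
 * compare mon_con_ops in mon_client.c for the real in-tree instance
 * of this pattern.
 */
static inline void example_dispatch(struct ceph_connection *con,
				    struct ceph_msg *m)
{
	ceph_msg_put(m);	/* done with the message */
}

static const struct ceph_connection_operations example_con_ops = {
	.get		= ceph_con_get,
	.put		= ceph_con_put,
	.dispatch	= example_dispatch,
};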
252
253#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
108 ceph_msg_get(monc->m_auth); /* keep our ref */
109 ceph_con_send(monc->con, monc->m_auth);
110}
111
112/*
113 * Close monitor session, if any.
114 */
115static void __close_session(struct ceph_mon_client *monc)
116{
117 if (monc->con) {
118 dout("__close_session closing mon%d\n", monc->cur_mon);
119 ceph_con_revoke(monc->con, monc->m_auth);
120 ceph_con_close(monc->con);
121 monc->cur_mon = -1;
122 monc->pending_auth = 0;
123 ceph_auth_reset(monc->auth);
124 }
125}
126
127/*
128 * Open a session with a (new) monitor.
129 */
130static int __open_session(struct ceph_mon_client *monc)
131{
132 u8 r;
133 int ret;
134
135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143
144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149
150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret);
155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon);
157 }
158 return 0;
159}
160
161static bool __sub_expired(struct ceph_mon_client *monc)
162{
163 return time_after_eq(jiffies, monc->sub_renew_after);
164}
165
166/*
167 * Reschedule delayed work timer.
168 */
169static void __schedule_delayed(struct ceph_mon_client *monc)
170{
171 unsigned delay;
172
173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ;
175 else
176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay);
179}
180
181/*
182 * Send subscribe request for mdsmap and/or osdmap.
183 */
184static void __send_subscribe(struct ceph_mon_client *monc)
185{
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i;
193 void *p, *end;
194
195 p = msg->front.iov_base;
196 end = p + msg->front_max;
197
198 dout("__send_subscribe to 'mdsmap' %u+\n",
199 (unsigned)monc->have_mdsmap);
200 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1;
208 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 }
213 ceph_encode_string(&p, end, "mdsmap", 6);
214 i = p;
215 i->have = cpu_to_le64(monc->have_mdsmap);
216 i->onetime = 0;
217 p += sizeof(*i);
218 ceph_encode_string(&p, end, "monmap", 6);
219 i = p;
220 i->have = 0;
221 i->onetime = 0;
222 p += sizeof(*i);
223
224 msg->front.iov_len = p - msg->front.iov_base;
225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
228
229 monc->sub_sent = jiffies | 1; /* never 0 */
230 }
231}
232
233static void handle_subscribe_ack(struct ceph_mon_client *monc,
234 struct ceph_msg *msg)
235{
236 unsigned seconds;
237 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
238
239 if (msg->front.iov_len < sizeof(*h))
240 goto bad;
241 seconds = le32_to_cpu(h->duration);
242
243 mutex_lock(&monc->mutex);
244 if (monc->hunting) {
245 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false;
248 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds);
250 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
251 monc->sub_sent = 0;
252 mutex_unlock(&monc->mutex);
253 return;
254bad:
255 pr_err("got corrupt subscribe-ack msg\n");
256 ceph_msg_dump(msg);
257}
258
259/*
260 * Keep track of which maps we have
261 */
262int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
263{
264 mutex_lock(&monc->mutex);
265 monc->have_mdsmap = got;
266 mutex_unlock(&monc->mutex);
267 return 0;
268}
269
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{
272 mutex_lock(&monc->mutex);
273 monc->have_osdmap = got;
274 monc->want_next_osdmap = 0;
275 mutex_unlock(&monc->mutex);
276 return 0;
277}
278
279/*
280 * Register interest in the next osdmap
281 */
282void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
283{
284 dout("request_next_osdmap have %u\n", monc->have_osdmap);
285 mutex_lock(&monc->mutex);
286 if (!monc->want_next_osdmap)
287 monc->want_next_osdmap = 1;
288 if (monc->want_next_osdmap < 2)
289 __send_subscribe(monc);
290 mutex_unlock(&monc->mutex);
291}
292
293/*
294 * Open a session with a monitor, creating the connection if needed.
295 */
296int ceph_monc_open_session(struct ceph_mon_client *monc)
297{
298 if (!monc->con) {
299 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
300 if (!monc->con)
301 return -ENOMEM;
302 ceph_con_init(monc->client->msgr, monc->con);
303 monc->con->private = monc;
304 monc->con->ops = &mon_con_ops;
305 }
306
307 mutex_lock(&monc->mutex);
308 __open_session(monc);
309 __schedule_delayed(monc);
310 mutex_unlock(&monc->mutex);
311 return 0;
312}
313
314/*
315 * The monitor responds with a mount ack to indicate mount success. The
316 * included client ticket allows the client to talk to MDSs and OSDs.
317 */
318static void ceph_monc_handle_map(struct ceph_mon_client *monc,
319 struct ceph_msg *msg)
320{
321 struct ceph_client *client = monc->client;
322 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
323 void *p, *end;
324
325 mutex_lock(&monc->mutex);
326
327 dout("handle_monmap\n");
328 p = msg->front.iov_base;
329 end = p + msg->front.iov_len;
330
331 monmap = ceph_monmap_decode(p, end);
332 if (IS_ERR(monmap)) {
333 pr_err("problem decoding monmap, %d\n",
334 (int)PTR_ERR(monmap));
335 goto out;
336 }
337
338 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
339 kfree(monmap);
340 goto out;
341 }
342
343 client->monc.monmap = monmap;
344 kfree(old);
345
346out:
347 mutex_unlock(&monc->mutex);
348 wake_up_all(&client->auth_wq);
349}
350
351/*
352 * generic requests (e.g., statfs, poolop)
353 */
354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid)
356{
357 struct ceph_mon_generic_request *req;
358 struct rb_node *n = monc->generic_request_tree.rb_node;
359
360 while (n) {
361 req = rb_entry(n, struct ceph_mon_generic_request, node);
362 if (tid < req->tid)
363 n = n->rb_left;
364 else if (tid > req->tid)
365 n = n->rb_right;
366 else
367 return req;
368 }
369 return NULL;
370}
371
372static void __insert_generic_request(struct ceph_mon_client *monc,
373 struct ceph_mon_generic_request *new)
374{
375 struct rb_node **p = &monc->generic_request_tree.rb_node;
376 struct rb_node *parent = NULL;
377 struct ceph_mon_generic_request *req = NULL;
378
379 while (*p) {
380 parent = *p;
381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
382 if (new->tid < req->tid)
383 p = &(*p)->rb_left;
384 else if (new->tid > req->tid)
385 p = &(*p)->rb_right;
386 else
387 BUG();
388 }
389
390 rb_link_node(&new->node, parent, p);
391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
443}
444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
471/*
472 * statfs
473 */
474static void handle_statfs_reply(struct ceph_mon_client *monc,
475 struct ceph_msg *msg)
476{
477 struct ceph_mon_generic_request *req;
478 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
479 u64 tid = le64_to_cpu(msg->hdr.tid);
480
481 if (msg->front.iov_len != sizeof(*reply))
482 goto bad;
483 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
484
485 mutex_lock(&monc->mutex);
486 req = __lookup_generic_req(monc, tid);
487 if (req) {
488 *(struct ceph_statfs *)req->buf = reply->st;
489 req->result = 0;
490 get_generic_request(req);
491 }
492 mutex_unlock(&monc->mutex);
493 if (req) {
494 complete_all(&req->completion);
495 put_generic_request(req);
496 }
497 return;
498
499bad:
500 pr_err("corrupt generic reply, tid %llu\n", tid);
501 ceph_msg_dump(msg);
502}
503
504/*
505 * Do a synchronous statfs().
506 */
507int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
508{
509 struct ceph_mon_generic_request *req;
510 struct ceph_mon_statfs *h;
511 int err;
512
513 req = kzalloc(sizeof(*req), GFP_NOFS);
514 if (!req)
515 return -ENOMEM;
516
517 kref_init(&req->kref);
518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
520 init_completion(&req->completion);
521
522 err = -ENOMEM;
523 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
524 if (!req->request)
525 goto out;
526 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
527 if (!req->reply)
528 goto out;
529
530 /* fill out request */
531 h = req->request->front.iov_base;
532 h->monhdr.have_version = 0;
533 h->monhdr.session_mon = cpu_to_le16(-1);
534 h->monhdr.session_mon_tid = 0;
535 h->fsid = monc->monmap->fsid;
536
537 err = do_generic_request(monc, req);
538
539out:
540 kref_put(&req->kref, release_generic_request);
541 return err;
542}
543
544/*
545 * pool ops
546 */
547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
549{
550 u32 buf_len;
551
552 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL;
554
555 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len)
557 return -EINVAL;
558
559 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0;
561}
562
563static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg)
565{
566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
573
574 mutex_lock(&monc->mutex);
575 req = __lookup_generic_req(monc, tid);
576 if (req) {
577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
593
594bad:
595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
641
642out:
643 kref_put(&req->kref, release_generic_request);
644 return err;
645}
646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
658 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
663/*
664 * Resend pending generic requests.
665 */
666static void __resend_generic_request(struct ceph_mon_client *monc)
667{
668 struct ceph_mon_generic_request *req;
669 struct rb_node *p;
670
671 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
672 req = rb_entry(p, struct ceph_mon_generic_request, node);
673 ceph_con_revoke(monc->con, req->request);
674 ceph_con_send(monc->con, ceph_msg_get(req->request));
675 }
676}
677
678/*
679 * Delayed work. If we haven't mounted yet, retry. Otherwise,
680 * renew/retry subscription as needed (in case it is timing out, or we
681 * got an ENOMEM). And keep the monitor connection alive.
682 */
683static void delayed_work(struct work_struct *work)
684{
685 struct ceph_mon_client *monc =
686 container_of(work, struct ceph_mon_client, delayed_work.work);
687
688 dout("monc delayed_work\n");
689 mutex_lock(&monc->mutex);
690 if (monc->hunting) {
691 __close_session(monc);
692 __open_session(monc); /* continue hunting */
693 } else {
694 ceph_con_keepalive(monc->con);
695
696 __validate_auth(monc);
697
698 if (monc->auth->ops->is_authenticated(monc->auth))
699 __send_subscribe(monc);
700 }
701 __schedule_delayed(monc);
702 mutex_unlock(&monc->mutex);
703}
704
705/*
706 * On startup, we build a temporary monmap populated with the IPs
707 * provided by mount(2).
708 */
709static int build_initial_monmap(struct ceph_mon_client *monc)
710{
711 struct ceph_mount_args *args = monc->client->mount_args;
712 struct ceph_entity_addr *mon_addr = args->mon_addr;
713 int num_mon = args->num_mon;
714 int i;
715
716 /* build initial monmap */
717 monc->monmap = kzalloc(sizeof(*monc->monmap) +
718 num_mon*sizeof(monc->monmap->mon_inst[0]),
719 GFP_KERNEL);
720 if (!monc->monmap)
721 return -ENOMEM;
722 for (i = 0; i < num_mon; i++) {
723 monc->monmap->mon_inst[i].addr = mon_addr[i];
724 monc->monmap->mon_inst[i].addr.nonce = 0;
725 monc->monmap->mon_inst[i].name.type =
726 CEPH_ENTITY_TYPE_MON;
727 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
728 }
729 monc->monmap->num_mon = num_mon;
730 monc->have_fsid = false;
731
732 /* release addr memory */
733 kfree(args->mon_addr);
734 args->mon_addr = NULL;
735 args->num_mon = 0;
736 return 0;
737}
738
739int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
740{
741 int err = 0;
742
743 dout("init\n");
744 memset(monc, 0, sizeof(*monc));
745 monc->client = cl;
746 monc->monmap = NULL;
747 mutex_init(&monc->mutex);
748
749 err = build_initial_monmap(monc);
750 if (err)
751 goto out;
752
753 monc->con = NULL;
754
755 /* authentication */
756 monc->auth = ceph_auth_init(cl->mount_args->name,
757 cl->mount_args->secret);
758 if (IS_ERR(monc->auth)) {
759 err = PTR_ERR(monc->auth); goto out_monmap; }
760 monc->auth->want_keys =
761 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
762 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
763
764 /* msgs */
765 err = -ENOMEM;
766 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
767 sizeof(struct ceph_mon_subscribe_ack),
768 GFP_NOFS);
769 if (!monc->m_subscribe_ack)
770 goto out_monmap;
771
772 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
773 if (!monc->m_subscribe)
774 goto out_subscribe_ack;
775
776 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
777 if (!monc->m_auth_reply)
778 goto out_subscribe;
779
780 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
781 monc->pending_auth = 0;
782 if (!monc->m_auth)
783 goto out_auth_reply;
784
785 monc->cur_mon = -1;
786 monc->hunting = true;
787 monc->sub_renew_after = jiffies;
788 monc->sub_sent = 0;
789
790 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
791 monc->generic_request_tree = RB_ROOT;
792 monc->num_generic_requests = 0;
793 monc->last_tid = 0;
794
795 monc->have_mdsmap = 0;
796 monc->have_osdmap = 0;
797 monc->want_next_osdmap = 1;
798 return 0;
799
800out_auth_reply:
801 ceph_msg_put(monc->m_auth_reply);
802out_subscribe:
803 ceph_msg_put(monc->m_subscribe);
804out_subscribe_ack:
805 ceph_msg_put(monc->m_subscribe_ack);
806out_monmap:
807 kfree(monc->monmap);
808out:
809 return err;
810}
811
812void ceph_monc_stop(struct ceph_mon_client *monc)
813{
814 dout("stop\n");
815 cancel_delayed_work_sync(&monc->delayed_work);
816
817 mutex_lock(&monc->mutex);
818 __close_session(monc);
819 if (monc->con) {
820 monc->con->private = NULL;
821 monc->con->ops->put(monc->con);
822 monc->con = NULL;
823 }
824 mutex_unlock(&monc->mutex);
825
826 ceph_auth_destroy(monc->auth);
827
828 ceph_msg_put(monc->m_auth);
829 ceph_msg_put(monc->m_auth_reply);
830 ceph_msg_put(monc->m_subscribe);
831 ceph_msg_put(monc->m_subscribe_ack);
832
833 kfree(monc->monmap);
834}
835
836static void handle_auth_reply(struct ceph_mon_client *monc,
837 struct ceph_msg *msg)
838{
839 int ret;
840 int was_auth = 0;
841
842 mutex_lock(&monc->mutex);
843 if (monc->auth->ops)
844 was_auth = monc->auth->ops->is_authenticated(monc->auth);
845 monc->pending_auth = 0;
846 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
847 msg->front.iov_len,
848 monc->m_auth->front.iov_base,
849 monc->m_auth->front_max);
850 if (ret < 0) {
851 monc->client->auth_err = ret;
852 wake_up_all(&monc->client->auth_wq);
853 } else if (ret > 0) {
854 __send_prepared_auth_request(monc, ret);
855 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
856 dout("authenticated, starting session\n");
857
858 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
859 monc->client->msgr->inst.name.num =
860 cpu_to_le64(monc->auth->global_id);
861
862 __send_subscribe(monc);
863 __resend_generic_request(monc);
864 }
865 mutex_unlock(&monc->mutex);
866}
867
868static int __validate_auth(struct ceph_mon_client *monc)
869{
870 int ret;
871
872 if (monc->pending_auth)
873 return 0;
874
875 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
876 monc->m_auth->front_max);
877 if (ret <= 0)
878 return ret; /* either an error, or no need to authenticate */
879 __send_prepared_auth_request(monc, ret);
880 return 0;
881}
882
883int ceph_monc_validate_auth(struct ceph_mon_client *monc)
884{
885 int ret;
886
887 mutex_lock(&monc->mutex);
888 ret = __validate_auth(monc);
889 mutex_unlock(&monc->mutex);
890 return ret;
891}
892
893/*
894 * handle incoming message
895 */
896static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
897{
898 struct ceph_mon_client *monc = con->private;
899 int type = le16_to_cpu(msg->hdr.type);
900
901 if (!monc)
902 return;
903
904 switch (type) {
905 case CEPH_MSG_AUTH_REPLY:
906 handle_auth_reply(monc, msg);
907 break;
908
909 case CEPH_MSG_MON_SUBSCRIBE_ACK:
910 handle_subscribe_ack(monc, msg);
911 break;
912
913 case CEPH_MSG_STATFS_REPLY:
914 handle_statfs_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
921 case CEPH_MSG_MON_MAP:
922 ceph_monc_handle_map(monc, msg);
923 break;
924
925 case CEPH_MSG_MDS_MAP:
926 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
927 break;
928
929 case CEPH_MSG_OSD_MAP:
930 ceph_osdc_handle_map(&monc->client->osdc, msg);
931 break;
932
933 default:
934 pr_err("received unknown message type %d %s\n", type,
935 ceph_msg_type_name(type));
936 }
937 ceph_msg_put(msg);
938}
939
940/*
941 * Allocate memory for incoming message
942 */
943static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
944 struct ceph_msg_header *hdr,
945 int *skip)
946{
947 struct ceph_mon_client *monc = con->private;
948 int type = le16_to_cpu(hdr->type);
949 int front_len = le32_to_cpu(hdr->front_len);
950 struct ceph_msg *m = NULL;
951
952 *skip = 0;
953
954 switch (type) {
955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
956 m = ceph_msg_get(monc->m_subscribe_ack);
957 break;
958 case CEPH_MSG_POOLOP_REPLY:
959 case CEPH_MSG_STATFS_REPLY:
960 return get_generic_reply(con, hdr, skip);
961 case CEPH_MSG_AUTH_REPLY:
962 m = ceph_msg_get(monc->m_auth_reply);
963 break;
964 case CEPH_MSG_MON_MAP:
965 case CEPH_MSG_MDS_MAP:
966 case CEPH_MSG_OSD_MAP:
967 m = ceph_msg_new(type, front_len, GFP_NOFS);
968 break;
969 }
970
971 if (!m) {
972 pr_info("alloc_msg unknown type %d\n", type);
973 *skip = 1;
974 }
975 return m;
976}
977
978/*
979 * If the monitor connection resets, pick a new monitor and resubmit
980 * any pending requests.
981 */
982static void mon_fault(struct ceph_connection *con)
983{
984 struct ceph_mon_client *monc = con->private;
985
986 if (!monc)
987 return;
988
989 dout("mon_fault\n");
990 mutex_lock(&monc->mutex);
991 if (!con->private)
992 goto out;
993
994 if (monc->con && !monc->hunting)
995 pr_info("mon%d %s session lost, "
996 "hunting for new mon\n", monc->cur_mon,
997 pr_addr(&monc->con->peer_addr.in_addr));
998
999 __close_session(monc);
1000 if (!monc->hunting) {
1001 /* start hunting */
1002 monc->hunting = true;
1003 __open_session(monc);
1004 } else {
1005 /* already hunting, let's wait a bit */
1006 __schedule_delayed(monc);
1007 }
1008out:
1009 mutex_unlock(&monc->mutex);
1010}
1011
1012static const struct ceph_connection_operations mon_con_ops = {
1013 .get = ceph_con_get,
1014 .put = ceph_con_put,
1015 .dispatch = dispatch,
1016 .fault = mon_fault,
1017 .alloc_msg = mon_alloc_msg,
1018};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/rbtree.h>
7
8#include "messenger.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
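
/*
 * Illustrative note (not part of the original header): mon_inst[] is
 * a variable-length tail, so a monmap holding num_mon monitors is
 * allocated as, e.g.,
 *
 *	m = kmalloc(sizeof(*m) + num_mon * sizeof(m->mon_inst[0]),
 *		    GFP_NOFS);
 *
 * which is how ceph_monmap_decode() and build_initial_monmap() size
 * it in mon_client.c.
 */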
23
24struct ceph_mon_client;
25struct ceph_mon_generic_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * ceph_mon_generic_request is being used for the statfs and poolop requests
44 * which are being done a bit differently because we need to get data back
45 * to the caller
46 */
47struct ceph_mon_generic_request {
48 struct kref kref;
49 u64 tid;
50 struct rb_node node;
51 int result;
52 void *buf;
53 int buf_len;
54 struct completion completion;
55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
57};
58
59struct ceph_mon_client {
60 struct ceph_client *client;
61 struct ceph_monmap *monmap;
62
63 struct mutex mutex;
64 struct delayed_work delayed_work;
65
66 struct ceph_auth_client *auth;
67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
68 int pending_auth;
69
70 bool hunting;
71 int cur_mon; /* last monitor i contacted */
72 unsigned long sub_sent, sub_renew_after;
73 struct ceph_connection *con;
74 bool have_fsid;
75
76 /* pending generic requests */
77 struct rb_root generic_request_tree;
78 int num_generic_requests;
79 u64 last_tid;
80
81 /* mds/osd map */
82 int want_next_osdmap; /* 1 = want, 2 = want+asked */
83 u32 have_osdmap, have_mdsmap;
84
85#ifdef CONFIG_DEBUG_FS
86 struct dentry *debugfs_file;
87#endif
88};
89
90extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
91extern int ceph_monmap_contains(struct ceph_monmap *m,
92 struct ceph_entity_addr *addr);
93
94extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
95extern void ceph_monc_stop(struct ceph_mon_client *monc);
96
97/*
98 * The model here is to indicate that we need a new map of at least
99 * epoch @want, and also call in when we receive a map. We will
100 * periodically rerequest the map from the monitor cluster until we
101 * get what we want.
102 */
103extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
104extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
105
106extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
107
108extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
109 struct ceph_statfs *buf);
110
111extern int ceph_monc_open_session(struct ceph_mon_client *monc);
112
113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
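
/*
 * Usage sketch (illustrative wiring, not part of the original
 * header): the mount path combines these entry points roughly as
 *
 *	struct ceph_statfs st;
 *	int err;
 *
 *	err = ceph_monc_init(&client->monc, client);
 *	if (!err)
 *		err = ceph_monc_open_session(&client->monc);
 *	if (!err)
 *		err = ceph_monc_do_statfs(&client->monc, &st);
 *	...
 *	ceph_monc_stop(&client->monc);
 */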
120
121#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
21static void free_fn(void *element, void *arg)
22{
23 ceph_msg_put(element);
24}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64 mempool_free(msg, pool->pool); /* hand it back to the mempool */
65}
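
/*
 * Usage sketch (illustrative, not part of the original file): a
 * caller sizes the pool for the largest front payload it expects and
 * then cycles messages through get/put instead of allocating each
 * one.  The sizes and name below are placeholders.
 */
static int __maybe_unused example_msgpool(void)
{
	struct ceph_msgpool pool;
	struct ceph_msg *msg;
	int err;

	err = ceph_msgpool_init(&pool, 512, 4, true, "example");
	if (err)
		return err;

	msg = ceph_msgpool_get(&pool, 512);	/* mempool_alloc, GFP_NOFS */
	if (msg)
		ceph_msgpool_put(&pool, msg);	/* reset and recycle */

	ceph_msgpool_destroy(&pool);
	return 0;
}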
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include <linux/mempool.h>
5#include "messenger.h"
6
7/*
8 * we use memory pools for preallocating messages we may receive, to
9 * avoid unexpected OOM conditions.
10 */
11struct ceph_msgpool {
12 const char *name;
13 mempool_t *pool;
14 int front_len; /* preallocated payload size */
15};
16
17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
18 int front_len, int size, bool blocking,
19 const char *name);
20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
22 int front_len);
23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
24
25#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#ifndef CEPH_MSGR_H
2#define CEPH_MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version, and adjust it
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value (like memcmp).
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
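
/*
 * Worked example (illustrative, not part of the original header):
 * because the operands are reinterpreted as signed 32-bit values, a
 * sequence number that has just wrapped past zero still compares as
 * newer:
 *
 *	ceph_seq_cmp(0x00000002, 0xfffffffe) ==  4   (a is newer)
 *	ceph_seq_cmp(0xfffffffe, 0x00000002) == -4   (a is older)
 */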
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_AUTH 0x20
54
55#define CEPH_ENTITY_TYPE_ANY 0xFF
56
57extern const char *ceph_entity_type_name(int type);
58
59/*
60 * entity_addr -- network address
61 */
62struct ceph_entity_addr {
63 __le32 type;
64 __le32 nonce; /* unique id for process (e.g. pid) */
65 struct sockaddr_storage in_addr;
66} __attribute__ ((packed));
67
68struct ceph_entity_inst {
69 struct ceph_entity_name name;
70 struct ceph_entity_addr addr;
71} __attribute__ ((packed));
72
73
74/* used by message exchange protocol */
75#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
76#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
77#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
78 incoming connection */
79#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
80 with higher cseq */
81#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
82 with higher gseq */
83#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90
91
92/*
93 * connection negotiation
94 */
95struct ceph_msg_connect {
96 __le64 features; /* supported feature bits */
97 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
98 __le32 global_seq; /* count connections initiated by this host */
99 __le32 connect_seq; /* count connections initiated in this session */
100 __le32 protocol_version;
101 __le32 authorizer_protocol;
102 __le32 authorizer_len;
103 __u8 flags; /* CEPH_MSG_CONNECT_* */
104} __attribute__ ((packed));
105
106struct ceph_msg_connect_reply {
107 __u8 tag;
108 __le64 features; /* feature bits for this session */
109 __le32 global_seq;
110 __le32 connect_seq;
111 __le32 protocol_version;
112 __le32 authorizer_len;
113 __u8 flags;
114} __attribute__ ((packed));
115
116#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
117
118
119/*
120 * message header
121 */
122struct ceph_msg_header_old {
123 __le64 seq; /* message seq# for this session */
124 __le64 tid; /* transaction id */
125 __le16 type; /* message type */
126 __le16 priority; /* priority. higher value == higher priority */
127 __le16 version; /* version of message encoding */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 reserved;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
158#define CEPH_MSG_PRIO_LOW 64
159#define CEPH_MSG_PRIO_DEFAULT 127
160#define CEPH_MSG_PRIO_HIGH 196
161#define CEPH_MSG_PRIO_HIGHEST 255
162
163/*
164 * follows data payload
165 */
166struct ceph_msg_footer {
167 __le32 front_crc, middle_crc, data_crc;
168 __u8 flags;
169} __attribute__ ((packed));
170
171#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
172#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
173
174
175#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index 3b5571b8ce22..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
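
/*
 * Illustrative note (not from the original file): with the
 * "%llx.%08llx" format above, ino 0x1234 and object number 7 produce
 * the object name "1234.00000007".
 */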
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return NULL;
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (!msg) {
169 ceph_osdc_put_request(req);
170 return NULL;
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (!msg) {
183 ceph_osdc_put_request(req);
184 return NULL;
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
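
/*
 * Illustrative sketch (editor's addition): the size arithmetic used
 * above for the request front -- a head, then 1 + do_sync ops, then up
 * to 40 bytes of oid, then one u64 snap id per snap.  The struct sizes
 * here are stand-in parameters; only the layout logic mirrors the code.
 */
#include <stddef.h>
#include <stdint.h>

static size_t osd_request_front_size(size_t head_size, size_t op_size,
				     int do_sync, uint32_t num_snaps)
{
	int num_op = 1 + (do_sync ? 1 : 0);	/* opcode (+ STARTSYNC) */
	size_t size = head_size + num_op * op_size;

	size += 40;				/* space reserved for oid */
	size += sizeof(uint64_t) * num_snaps;	/* snap context ids */
	return size;				/* the front is later trimmed
						 * to the bytes actually used */
}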
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
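
/*
 * Illustrative sketch (editor's addition): __lookup_request_ge() finds
 * the request with the smallest tid >= the given tid, or NULL.
 * ceph_osdc_sync() below relies on this to walk all pending tids in
 * order, re-taking request_mutex between waits.  The same semantics
 * over a sorted array, for clarity:
 */
#include <stddef.h>
#include <stdint.h>

static const uint64_t *lookup_ge(const uint64_t *tids, int n, uint64_t tid)
{
	int i;

	for (i = 0; i < n; i++)		/* tids[] sorted ascending */
		if (tids[i] >= tid)
			return &tids[i];
	return NULL;			/* no tid >= the requested one */
}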
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
369 kfree(osd);
370 }
371}
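
/*
 * Illustrative sketch (editor's addition): why the lookup path must use
 * atomic_inc_not_zero().  Once the final put drops o_ref to zero the
 * osd is being freed, so a concurrent get must fail rather than revive
 * it.  A userspace analogue with C11 atomics:
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool try_get(atomic_int *ref)
{
	int v = atomic_load(ref);

	while (v != 0)		/* never resurrect a dying object */
		if (atomic_compare_exchange_weak(ref, &v, v + 1))
			return true;
	return false;		/* caller must redo the lookup */
}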
372
373/*
374 * remove an osd from our map
375 */
376static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
377{
378 dout("__remove_osd %p\n", osd);
379 BUG_ON(!list_empty(&osd->o_requests));
380 rb_erase(&osd->o_node, &osdc->osds);
381 list_del_init(&osd->o_osd_lru);
382 ceph_con_close(&osd->o_con);
383 put_osd(osd);
384}
385
386static void __move_osd_to_lru(struct ceph_osd_client *osdc,
387 struct ceph_osd *osd)
388{
389 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
393}
394
395static void __remove_osd_from_lru(struct ceph_osd *osd)
396{
397 dout("__remove_osd_from_lru %p\n", osd);
398 if (!list_empty(&osd->o_osd_lru))
399 list_del_init(&osd->o_osd_lru);
400}
401
402static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
403{
404 struct ceph_osd *osd, *nosd;
405
406 dout("__remove_old_osds %p\n", osdc);
407 mutex_lock(&osdc->request_mutex);
408 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
409 if (!remove_all && time_before(jiffies, osd->lru_ttl))
410 break;
411 __remove_osd(osdc, osd);
412 }
413 mutex_unlock(&osdc->request_mutex);
414}
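
/*
 * Illustrative sketch (editor's addition): the scan above can stop at
 * the first unexpired entry because osds are appended to the lru in
 * order.  time_before() boils down to a wrap-safe signed comparison of
 * jiffies values:
 */
static int before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;	/* correct across jiffies wraparound */
}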
415
416/*
417 * reset osd connect
418 */
419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
420{
421 struct ceph_osd_request *req;
422 int ret = 0;
423
424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
425 if (list_empty(&osd->o_requests)) {
426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
433	/* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
437 } else {
438 ceph_con_close(&osd->o_con);
439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
440 osd->o_incarnation++;
441 }
442 return ret;
443}
444
445static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
446{
447 struct rb_node **p = &osdc->osds.rb_node;
448 struct rb_node *parent = NULL;
449 struct ceph_osd *osd = NULL;
450
451 while (*p) {
452 parent = *p;
453 osd = rb_entry(parent, struct ceph_osd, o_node);
454 if (new->o_osd < osd->o_osd)
455 p = &(*p)->rb_left;
456 else if (new->o_osd > osd->o_osd)
457 p = &(*p)->rb_right;
458 else
459 BUG();
460 }
461
462 rb_link_node(&new->o_node, parent, p);
463 rb_insert_color(&new->o_node, &osdc->osds);
464}
465
466static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
467{
468 struct ceph_osd *osd;
469 struct rb_node *n = osdc->osds.rb_node;
470
471 while (n) {
472 osd = rb_entry(n, struct ceph_osd, o_node);
473 if (o < osd->o_osd)
474 n = n->rb_left;
475 else if (o > osd->o_osd)
476 n = n->rb_right;
477 else
478 return osd;
479 }
480 return NULL;
481}
482
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{
485 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ);
487}
488
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
490{
491 cancel_delayed_work(&osdc->timeout_work);
492}
493
494/*
495 * Register request, assign tid. If this is the first request, set up
496 * the timeout event.
497 */
498static void register_request(struct ceph_osd_client *osdc,
499 struct ceph_osd_request *req)
500{
501 mutex_lock(&osdc->request_mutex);
502 req->r_tid = ++osdc->last_tid;
503 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
504 INIT_LIST_HEAD(&req->r_req_lru_item);
505
506 dout("register_request %p tid %lld\n", req, req->r_tid);
507 __insert_request(osdc, req);
508 ceph_osdc_get_request(req);
509 osdc->num_requests++;
510
511 if (osdc->num_requests == 1) {
512 dout(" first request, scheduling timeout\n");
513 __schedule_osd_timeout(osdc);
514 }
515 mutex_unlock(&osdc->request_mutex);
516}
517
518/*
519 * called under osdc->request_mutex
520 */
521static void __unregister_request(struct ceph_osd_client *osdc,
522 struct ceph_osd_request *req)
523{
524 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
525 rb_erase(&req->r_node, &osdc->requests);
526 osdc->num_requests--;
527
528 if (req->r_osd) {
529 /* make sure the original request isn't in flight. */
530 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
531
532 list_del_init(&req->r_osd_item);
533 if (list_empty(&req->r_osd->o_requests))
534 __move_osd_to_lru(osdc, req->r_osd);
535 req->r_osd = NULL;
536 }
537
538 ceph_osdc_put_request(req);
539
540 list_del_init(&req->r_req_lru_item);
541 if (osdc->num_requests == 0) {
542 dout(" no requests, canceling timeout\n");
543 __cancel_osd_timeout(osdc);
544 }
545}
546
547/*
548 * Cancel a previously queued request message
549 */
550static void __cancel_request(struct ceph_osd_request *req)
551{
552 if (req->r_sent && req->r_osd) {
553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
554 req->r_sent = 0;
555 }
556 list_del_init(&req->r_req_lru_item);
557}
558
559/*
560 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
561 * (as needed), and set the request r_osd appropriately. If there is
562 * no up osd, set r_osd to NULL.
563 *
564 * Return 0 if unchanged, 1 if changed, or negative on error.
565 *
566 * Caller should hold map_sem for read and request_mutex.
567 */
568static int __map_osds(struct ceph_osd_client *osdc,
569 struct ceph_osd_request *req)
570{
571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
572 struct ceph_pg pgid;
573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
575 int err;
576
577 dout("map_osds %p tid %lld\n", req, req->r_tid);
578 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
579 &req->r_file_layout, osdc->osdmap);
580 if (err)
581 return err;
582 pgid = reqhead->layout.ol_pgid;
583 req->r_pgid = pgid;
584
585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
590
591 if ((req->r_osd && req->r_osd->o_osd == o &&
592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
595 (req->r_osd == NULL && o == -1))
596 return 0; /* no change */
597
598 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
600 req->r_osd ? req->r_osd->o_osd : -1);
601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
606 if (req->r_osd) {
607 __cancel_request(req);
608 list_del_init(&req->r_osd_item);
609 req->r_osd = NULL;
610 }
611
612 req->r_osd = __lookup_osd(osdc, o);
613 if (!req->r_osd && o >= 0) {
614 err = -ENOMEM;
615 req->r_osd = create_osd(osdc);
616 if (!req->r_osd)
617 goto out;
618
619 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
620 req->r_osd->o_osd = o;
621 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
622 __insert_osd(osdc, req->r_osd);
623
624 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
625 }
626
627 if (req->r_osd) {
628 __remove_osd_from_lru(req->r_osd);
629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
630 }
631 err = 1; /* osd or pg changed */
632
633out:
634 return err;
635}
636
637/*
638 * caller should hold map_sem (for read) and request_mutex
639 */
640static int __send_request(struct ceph_osd_client *osdc,
641 struct ceph_osd_request *req)
642{
643 struct ceph_osd_request_head *reqhead;
644 int err;
645
646 err = __map_osds(osdc, req);
647 if (err < 0)
648 return err;
649 if (req->r_osd == NULL) {
650 dout("send_request %p no up osds in pg\n", req);
651 ceph_monc_request_next_osdmap(&osdc->client->monc);
652 return 0;
653 }
654
655 dout("send_request %p tid %llu to osd%d flags %d\n",
656 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
657
658 reqhead = req->r_request->front.iov_base;
659 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
661 reqhead->reassert_version = req->r_reassert_version;
662
663 req->r_stamp = jiffies;
664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665
666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request);
668 req->r_sent = req->r_osd->o_incarnation;
669 return 0;
670}
671
672/*
673 * Timeout callback, called every N seconds when one or more osd
674 * requests have been active for more than N seconds. When this
675 * happens, we ping all OSDs with requests that have timed out to
676 * ensure any communications channel reset is detected. Reset the
677 * request timeouts another N seconds in the future as we go.
678 * Reschedule the timeout event another N seconds in the future (unless
679 * there are no open requests).
680 */
681static void handle_timeout(struct work_struct *work)
682{
683 struct ceph_osd_client *osdc =
684 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
688 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0;
691 struct rb_node *p;
692 struct list_head slow_osds;
693
694 dout("timeout\n");
695 down_read(&osdc->map_sem);
696
697 ceph_monc_request_next_osdmap(&osdc->client->monc);
698
699 mutex_lock(&osdc->request_mutex);
700 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
701 req = rb_entry(p, struct ceph_osd_request, r_node);
702
703 if (req->r_resend) {
704 int err;
705
706 dout("osdc resending prev failed %lld\n", req->r_tid);
707 err = __send_request(osdc, req);
708 if (err)
709 dout("osdc failed again on %lld\n", req->r_tid);
710 else
711 req->r_resend = false;
712 continue;
713 }
714 }
715
716 /*
717 * reset osds that appear to be _really_ unresponsive. this
718 * is a failsafe measure; we really shouldn't be getting to
719 * this point if the system is working properly. the monitors
720 * should mark the osd as failed and we should find out about
721 * it from an updated osd map.
722 */
723 while (timeout && !list_empty(&osdc->req_lru)) {
724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
725 r_req_lru_item);
726
727 if (time_before(jiffies, req->r_stamp + timeout))
728 break;
729
730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
731 last_req = req;
732 last_stamp = req->r_stamp;
733
734 osd = req->r_osd;
735 BUG_ON(!osd);
736 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
737 req->r_tid, osd->o_osd);
738 __kick_requests(osdc, osd);
739 }
740
741 /*
742 * ping osds that are a bit slow. this ensures that if there
743 * is a break in the TCP connection we will notice, and reopen
744 * a connection with that osd (from the fault callback).
745 */
746 INIT_LIST_HEAD(&slow_osds);
747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
748 if (time_before(jiffies, req->r_stamp + keepalive))
749 break;
750
751 osd = req->r_osd;
752 BUG_ON(!osd);
753 dout(" tid %llu is slow, will send keepalive on osd%d\n",
754 req->r_tid, osd->o_osd);
755 list_move_tail(&osd->o_keepalive_item, &slow_osds);
756 }
757 while (!list_empty(&slow_osds)) {
758 osd = list_entry(slow_osds.next, struct ceph_osd,
759 o_keepalive_item);
760 list_del_init(&osd->o_keepalive_item);
761 ceph_con_keepalive(&osd->o_con);
762 }
763
764 __schedule_osd_timeout(osdc);
765 mutex_unlock(&osdc->request_mutex);
766
767 up_read(&osdc->map_sem);
768}
769
770static void handle_osds_timeout(struct work_struct *work)
771{
772 struct ceph_osd_client *osdc =
773 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work);
775 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
777
778 dout("osds timeout\n");
779 down_read(&osdc->map_sem);
780 remove_old_osds(osdc, 0);
781 up_read(&osdc->map_sem);
782
783 schedule_delayed_work(&osdc->osds_timeout_work,
784 round_jiffies_relative(delay));
785}
786
787/*
788 * handle osd op reply. either call the callback if it is specified,
789 * or do the completion to wake up the waiting thread.
790 */
791static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
792 struct ceph_connection *con)
793{
794 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
795 struct ceph_osd_request *req;
796 u64 tid;
797 int numops, object_len, flags;
798 s32 result;
799
800 tid = le64_to_cpu(msg->hdr.tid);
801 if (msg->front.iov_len < sizeof(*rhead))
802 goto bad;
803 numops = le32_to_cpu(rhead->num_ops);
804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
807 numops * sizeof(struct ceph_osd_op))
808 goto bad;
809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
810
811 /* lookup */
812 mutex_lock(&osdc->request_mutex);
813 req = __lookup_request(osdc, tid);
814 if (req == NULL) {
815 dout("handle_reply tid %llu dne\n", tid);
816 mutex_unlock(&osdc->request_mutex);
817 return;
818 }
819 ceph_osdc_get_request(req);
820 flags = le32_to_cpu(rhead->flags);
821
822 /*
823 * if this connection filled our message, drop our reference now, to
824 * avoid a (safe but slower) revoke later.
825 */
826 if (req->r_con_filling_msg == con && req->r_reply == msg) {
827 dout(" dropping con_filling_msg ref %p\n", con);
828 req->r_con_filling_msg = NULL;
829 ceph_con_put(con);
830 }
831
832 if (!req->r_got_reply) {
833 unsigned bytes;
834
835 req->r_result = le32_to_cpu(rhead->result);
836 bytes = le32_to_cpu(msg->hdr.data_len);
837 dout("handle_reply result %d bytes %d\n", req->r_result,
838 bytes);
839 if (req->r_result == 0)
840 req->r_result = bytes;
841
842 /* in case this is a write and we need to replay, */
843 req->r_reassert_version = rhead->reassert_version;
844
845 req->r_got_reply = 1;
846 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
847 dout("handle_reply tid %llu dup ack\n", tid);
848 mutex_unlock(&osdc->request_mutex);
849 goto done;
850 }
851
852 dout("handle_reply tid %llu flags %d\n", tid, flags);
853
854 /* either this is a read, or we got the safe response */
855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
858 __unregister_request(osdc, req);
859
860 mutex_unlock(&osdc->request_mutex);
861
862 if (req->r_callback)
863 req->r_callback(req, msg);
864 else
865 complete_all(&req->r_completion);
866
867 if (flags & CEPH_OSD_FLAG_ONDISK) {
868 if (req->r_safe_callback)
869 req->r_safe_callback(req, msg);
870 complete_all(&req->r_safe_completion); /* fsync waiter */
871 }
872
873done:
874 ceph_osdc_put_request(req);
875 return;
876
877bad:
878 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
879 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
880 (int)sizeof(*rhead));
881 ceph_msg_dump(msg);
882}
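
/*
 * Illustrative sketch (editor's addition): the unregister condition
 * above, pulled out for clarity.  The flag bits are stand-ins, not the
 * real CEPH_OSD_FLAG_* values.
 */
#define SK_ONDISK 0x1			/* stand-in flag bits */
#define SK_WRITE  0x2

static int reply_completes_request(int result, int flags)
{
	return result < 0 ||		/* hard error: give up */
	       (flags & SK_ONDISK) ||	/* write committed to disk */
	       !(flags & SK_WRITE);	/* a read needs only one reply */
}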
883
884
885static int __kick_requests(struct ceph_osd_client *osdc,
886 struct ceph_osd *kickosd)
887{
888 struct ceph_osd_request *req;
889 struct rb_node *p, *n;
890 int needmap = 0;
891 int err;
892
893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
894 if (kickosd) {
895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
898 } else {
899 for (p = rb_first(&osdc->osds); p; p = n) {
900 struct ceph_osd *osd =
901 rb_entry(p, struct ceph_osd, o_node);
902
903 n = rb_next(p);
904 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
905 memcmp(&osd->o_con.peer_addr,
906 ceph_osd_addr(osdc->osdmap,
907 osd->o_osd),
908 sizeof(struct ceph_entity_addr)) != 0)
909 __reset_osd(osdc, osd);
910 }
911 }
912
913 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
914 req = rb_entry(p, struct ceph_osd_request, r_node);
915
916 if (req->r_resend) {
917 dout(" r_resend set on tid %llu\n", req->r_tid);
918 __cancel_request(req);
919 goto kick;
920 }
921 if (req->r_osd && kickosd == req->r_osd) {
922 __cancel_request(req);
923 goto kick;
924 }
925
926 err = __map_osds(osdc, req);
927 if (err == 0)
928 continue; /* no change */
929 if (err < 0) {
930 /*
931 * FIXME: really, we should set the request
932 * error and fail if this isn't a 'nofail'
933 * request, but that's a fair bit more
934 * complicated to do. So retry!
935 */
936 dout(" setting r_resend on %llu\n", req->r_tid);
937 req->r_resend = true;
938 continue;
939 }
940 if (req->r_osd == NULL) {
941 dout("tid %llu maps to no valid osd\n", req->r_tid);
942 needmap++; /* request a newer map */
943 continue;
944 }
945
946kick:
947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
948 req->r_osd ? req->r_osd->o_osd : -1);
949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
950 err = __send_request(osdc, req);
951 if (err) {
952 dout(" setting r_resend on %llu\n", req->r_tid);
953 req->r_resend = true;
954 }
955 }
956
957 return needmap;
958}
959
960/*
961 * Resubmit osd requests whose osd or osd address has changed. Request
962 * a new osd map if osds are down, or we are otherwise unable to determine
963 * how to direct a request.
964 *
965 * Close connections to down osds.
966 *
967 * If @who is specified, resubmit requests for that specific osd.
968 *
969 * Caller should hold map_sem for read and request_mutex.
970 */
971static void kick_requests(struct ceph_osd_client *osdc,
972 struct ceph_osd *kickosd)
973{
974 int needmap;
975
976 mutex_lock(&osdc->request_mutex);
977 needmap = __kick_requests(osdc, kickosd);
978 mutex_unlock(&osdc->request_mutex);
979
980 if (needmap) {
981 dout("%d requests for down osds, need new map\n", needmap);
982 ceph_monc_request_next_osdmap(&osdc->client->monc);
983 }
984
985}
986/*
987 * Process updated osd map.
988 *
989 * The message contains any number of incremental and full maps, normally
990 * indicating some sort of topology change in the cluster. Kick requests
991 * off to different OSDs as needed.
992 */
993void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
994{
995 void *p, *end, *next;
996 u32 nr_maps, maplen;
997 u32 epoch;
998 struct ceph_osdmap *newmap = NULL, *oldmap;
999 int err;
1000 struct ceph_fsid fsid;
1001
1002 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1003 p = msg->front.iov_base;
1004 end = p + msg->front.iov_len;
1005
1006 /* verify fsid */
1007 ceph_decode_need(&p, end, sizeof(fsid), bad);
1008 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1009 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1010 return;
1011
1012 down_write(&osdc->map_sem);
1013
1014 /* incremental maps */
1015 ceph_decode_32_safe(&p, end, nr_maps, bad);
1016 dout(" %d inc maps\n", nr_maps);
1017 while (nr_maps > 0) {
1018 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1019 epoch = ceph_decode_32(&p);
1020 maplen = ceph_decode_32(&p);
1021 ceph_decode_need(&p, end, maplen, bad);
1022 next = p + maplen;
1023 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1024 dout("applying incremental map %u len %d\n",
1025 epoch, maplen);
1026 newmap = osdmap_apply_incremental(&p, next,
1027 osdc->osdmap,
1028 osdc->client->msgr);
1029 if (IS_ERR(newmap)) {
1030 err = PTR_ERR(newmap);
1031 goto bad;
1032 }
1033 BUG_ON(!newmap);
1034 if (newmap != osdc->osdmap) {
1035 ceph_osdmap_destroy(osdc->osdmap);
1036 osdc->osdmap = newmap;
1037 }
1038 } else {
1039 dout("ignoring incremental map %u len %d\n",
1040 epoch, maplen);
1041 }
1042 p = next;
1043 nr_maps--;
1044 }
1045 if (newmap)
1046 goto done;
1047
1048 /* full maps */
1049 ceph_decode_32_safe(&p, end, nr_maps, bad);
1050 dout(" %d full maps\n", nr_maps);
1051 while (nr_maps) {
1052 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1053 epoch = ceph_decode_32(&p);
1054 maplen = ceph_decode_32(&p);
1055 ceph_decode_need(&p, end, maplen, bad);
1056 if (nr_maps > 1) {
1057 dout("skipping non-latest full map %u len %d\n",
1058 epoch, maplen);
1059 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1060 dout("skipping full map %u len %d, "
1061 "older than our %u\n", epoch, maplen,
1062 osdc->osdmap->epoch);
1063 } else {
1064 dout("taking full map %u len %d\n", epoch, maplen);
1065 newmap = osdmap_decode(&p, p+maplen);
1066 if (IS_ERR(newmap)) {
1067 err = PTR_ERR(newmap);
1068 goto bad;
1069 }
1070 BUG_ON(!newmap);
1071 oldmap = osdc->osdmap;
1072 osdc->osdmap = newmap;
1073 if (oldmap)
1074 ceph_osdmap_destroy(oldmap);
1075 }
1076 p += maplen;
1077 nr_maps--;
1078 }
1079
1080done:
1081 downgrade_write(&osdc->map_sem);
1082 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1083 if (newmap)
1084 kick_requests(osdc, NULL);
1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1087 return;
1088
1089bad:
1090 pr_err("osdc handle_map corrupt msg\n");
1091 ceph_msg_dump(msg);
1092 up_write(&osdc->map_sem);
1093 return;
1094}
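
/*
 * Illustrative sketch (editor's addition): the epoch checks applied in
 * the two loops above, separated from the decoding.  An incremental map
 * must advance our epoch by exactly one; a full map is taken only if it
 * is strictly newer than what we already have.
 */
#include <stdint.h>

static int incremental_usable(uint32_t have, uint32_t epoch)
{
	return have + 1 == epoch;
}

static int full_map_usable(uint32_t have, uint32_t epoch)
{
	return epoch > have;		/* the code above skips have >= epoch */
}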
1095
1096/*
1097 * Register request, send initial attempt.
1098 */
1099int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1100 struct ceph_osd_request *req,
1101 bool nofail)
1102{
1103 int rc = 0;
1104
1105 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages;
1107
1108 register_request(osdc, req);
1109
1110 down_read(&osdc->map_sem);
1111 mutex_lock(&osdc->request_mutex);
1112 /*
1113 * a racing kick_requests() may have sent the message for us
1114 * while we dropped request_mutex above, so only send now if
1115	 * the request still hasn't been touched yet.
1116 */
1117 if (req->r_sent == 0) {
1118 rc = __send_request(osdc, req);
1119 if (rc) {
1120 if (nofail) {
1121 dout("osdc_start_request failed send, "
1122 " marking %lld\n", req->r_tid);
1123 req->r_resend = true;
1124 rc = 0;
1125 } else {
1126 __unregister_request(osdc, req);
1127 }
1128 }
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131 up_read(&osdc->map_sem);
1132 return rc;
1133}
1134
1135/*
1136 * wait for a request to complete
1137 */
1138int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1139 struct ceph_osd_request *req)
1140{
1141 int rc;
1142
1143 rc = wait_for_completion_interruptible(&req->r_completion);
1144 if (rc < 0) {
1145 mutex_lock(&osdc->request_mutex);
1146 __cancel_request(req);
1147 __unregister_request(osdc, req);
1148 mutex_unlock(&osdc->request_mutex);
1149 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1150 return rc;
1151 }
1152
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result;
1155}
1156
1157/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation.
1159 */
1160void ceph_osdc_sync(struct ceph_osd_client *osdc)
1161{
1162 struct ceph_osd_request *req;
1163 u64 last_tid, next_tid = 0;
1164
1165 mutex_lock(&osdc->request_mutex);
1166 last_tid = osdc->last_tid;
1167 while (1) {
1168 req = __lookup_request_ge(osdc, next_tid);
1169 if (!req)
1170 break;
1171 if (req->r_tid > last_tid)
1172 break;
1173
1174 next_tid = req->r_tid + 1;
1175 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1176 continue;
1177
1178 ceph_osdc_get_request(req);
1179 mutex_unlock(&osdc->request_mutex);
1180 dout("sync waiting on tid %llu (last is %llu)\n",
1181 req->r_tid, last_tid);
1182 wait_for_completion(&req->r_safe_completion);
1183 mutex_lock(&osdc->request_mutex);
1184 ceph_osdc_put_request(req);
1185 }
1186 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid);
1188}
1189
1190/*
1191 * init, shutdown
1192 */
1193int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1194{
1195 int err;
1196
1197 dout("init\n");
1198 osdc->client = client;
1199 osdc->osdmap = NULL;
1200 init_rwsem(&osdc->map_sem);
1201 init_completion(&osdc->map_waiters);
1202 osdc->last_requested_map = 0;
1203 mutex_init(&osdc->request_mutex);
1204 osdc->last_tid = 0;
1205 osdc->osds = RB_ROOT;
1206 INIT_LIST_HEAD(&osdc->osd_lru);
1207 osdc->requests = RB_ROOT;
1208 INIT_LIST_HEAD(&osdc->req_lru);
1209 osdc->num_requests = 0;
1210 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212
1213 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1215
1216 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1218 sizeof(struct ceph_osd_request));
1219 if (!osdc->req_mempool)
1220 goto out;
1221
1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1224 if (err < 0)
1225 goto out_mempool;
1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1229 if (err < 0)
1230 goto out_msgpool;
1231 return 0;
1232
1233out_msgpool:
1234 ceph_msgpool_destroy(&osdc->msgpool_op);
1235out_mempool:
1236 mempool_destroy(osdc->req_mempool);
1237out:
1238 return err;
1239}
1240
1241void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{
1243 cancel_delayed_work_sync(&osdc->timeout_work);
1244 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1245 if (osdc->osdmap) {
1246 ceph_osdmap_destroy(osdc->osdmap);
1247 osdc->osdmap = NULL;
1248 }
1249 remove_old_osds(osdc, 1);
1250 mempool_destroy(osdc->req_mempool);
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253}
1254
1255/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten
1257 * *plen. Return number of bytes read, or error.
1258 */
1259int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1260 struct ceph_vino vino, struct ceph_file_layout *layout,
1261 u64 off, u64 *plen,
1262 u32 truncate_seq, u64 truncate_size,
1263 struct page **pages, int num_pages)
1264{
1265 struct ceph_osd_request *req;
1266 int rc = 0;
1267
1268 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1269 vino.snap, off, *plen);
1270 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1272 NULL, 0, truncate_seq, truncate_size, NULL,
1273 false, 1);
1274 if (!req)
1275 return -ENOMEM;
1276
1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages;
1279
1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1281 off, *plen, req->r_num_pages);
1282
1283 rc = ceph_osdc_start_request(osdc, req, false);
1284 if (!rc)
1285 rc = ceph_osdc_wait_request(osdc, req);
1286
1287 ceph_osdc_put_request(req);
1288 dout("readpages result %d\n", rc);
1289 return rc;
1290}
1291
1292/*
1293 * do a synchronous write on N pages
1294 */
1295int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1296 struct ceph_file_layout *layout,
1297 struct ceph_snap_context *snapc,
1298 u64 off, u64 len,
1299 u32 truncate_seq, u64 truncate_size,
1300 struct timespec *mtime,
1301 struct page **pages, int num_pages,
1302 int flags, int do_sync, bool nofail)
1303{
1304 struct ceph_osd_request *req;
1305 int rc = 0;
1306
1307 BUG_ON(vino.snap != CEPH_NOSNAP);
1308 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1309 CEPH_OSD_OP_WRITE,
1310 flags | CEPH_OSD_FLAG_ONDISK |
1311 CEPH_OSD_FLAG_WRITE,
1312 snapc, do_sync,
1313 truncate_seq, truncate_size, mtime,
1314 nofail, 1);
1315 if (!req)
1316 return -ENOMEM;
1317
1318 /* it may be a short write due to an object boundary */
1319 req->r_pages = pages;
1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1321 req->r_num_pages);
1322
1323 rc = ceph_osdc_start_request(osdc, req, nofail);
1324 if (!rc)
1325 rc = ceph_osdc_wait_request(osdc, req);
1326
1327 ceph_osdc_put_request(req);
1328 if (rc == 0)
1329 rc = len;
1330 dout("writepages result %d\n", rc);
1331 return rc;
1332}
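
/*
 * Illustrative usage fragment (editor's addition): a synchronous
 * one-page write through the helper above.  osdc, vino, layout, snapc,
 * mtime, off, len and page are hypothetical caller state; error
 * handling is elided.
 *
 *	rc = ceph_osdc_writepages(osdc, vino, layout, snapc,
 *				  off, len, truncate_seq, truncate_size,
 *				  &mtime, &page, 1,
 *				  0, 0, true);	// flags, do_sync, nofail
 *	if (rc < 0)
 *		...;				// rc == len on success
 */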
1333
1334/*
1335 * handle incoming message
1336 */
1337static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1338{
1339 struct ceph_osd *osd = con->private;
1340 struct ceph_osd_client *osdc;
1341 int type = le16_to_cpu(msg->hdr.type);
1342
1343 if (!osd)
1344 goto out;
1345 osdc = osd->o_osdc;
1346
1347 switch (type) {
1348 case CEPH_MSG_OSD_MAP:
1349 ceph_osdc_handle_map(osdc, msg);
1350 break;
1351 case CEPH_MSG_OSD_OPREPLY:
1352 handle_reply(osdc, msg, con);
1353 break;
1354
1355 default:
1356 pr_err("received unknown message type %d %s\n", type,
1357 ceph_msg_type_name(type));
1358 }
1359out:
1360 ceph_msg_put(msg);
1361}
1362
1363/*
1364 * lookup and return message for incoming reply. set up reply message
1365 * pages.
1366 */
1367static struct ceph_msg *get_reply(struct ceph_connection *con,
1368 struct ceph_msg_header *hdr,
1369 int *skip)
1370{
1371 struct ceph_osd *osd = con->private;
1372 struct ceph_osd_client *osdc = osd->o_osdc;
1373 struct ceph_msg *m;
1374 struct ceph_osd_request *req;
1375 int front = le32_to_cpu(hdr->front_len);
1376 int data_len = le32_to_cpu(hdr->data_len);
1377 u64 tid;
1378
1379 tid = le64_to_cpu(hdr->tid);
1380 mutex_lock(&osdc->request_mutex);
1381 req = __lookup_request(osdc, tid);
1382 if (!req) {
1383 *skip = 1;
1384 m = NULL;
1385 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1386 osd->o_osd);
1387 goto out;
1388 }
1389
1390 if (req->r_con_filling_msg) {
1391 dout("get_reply revoking msg %p from old con %p\n",
1392 req->r_reply, req->r_con_filling_msg);
1393 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1394 ceph_con_put(req->r_con_filling_msg);
1395 req->r_con_filling_msg = NULL;
1396 }
1397
1398 if (front > req->r_reply->front.iov_len) {
1399 pr_warning("get_reply front %d > preallocated %d\n",
1400 front, (int)req->r_reply->front.iov_len);
1401 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1402 if (!m)
1403 goto out;
1404 ceph_msg_put(req->r_reply);
1405 req->r_reply = m;
1406 }
1407 m = ceph_msg_get(req->r_reply);
1408
1409 if (data_len > 0) {
1410 unsigned data_off = le16_to_cpu(hdr->data_off);
1411 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1412
1413 if (unlikely(req->r_num_pages < want)) {
1414 pr_warning("tid %lld reply %d > expected %d pages\n",
1415			   tid, want, req->r_num_pages);
1416 *skip = 1;
1417 ceph_msg_put(m);
1418 m = NULL;
1419 goto out;
1420 }
1421 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages;
1423 }
1424 *skip = 0;
1425 req->r_con_filling_msg = ceph_con_get(con);
1426 dout("get_reply tid %lld %p\n", tid, m);
1427
1428out:
1429 mutex_unlock(&osdc->request_mutex);
1430 return m;
1431
1432}
1433
1434static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1435 struct ceph_msg_header *hdr,
1436 int *skip)
1437{
1438 struct ceph_osd *osd = con->private;
1439 int type = le16_to_cpu(hdr->type);
1440 int front = le32_to_cpu(hdr->front_len);
1441
1442 switch (type) {
1443 case CEPH_MSG_OSD_MAP:
1444 return ceph_msg_new(type, front, GFP_NOFS);
1445 case CEPH_MSG_OSD_OPREPLY:
1446 return get_reply(con, hdr, skip);
1447 default:
1448 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1449 osd->o_osd);
1450 *skip = 1;
1451 return NULL;
1452 }
1453}
1454
1455/*
1456 * Wrappers to refcount containing ceph_osd struct
1457 */
1458static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1459{
1460 struct ceph_osd *osd = con->private;
1461 if (get_osd(osd))
1462 return con;
1463 return NULL;
1464}
1465
1466static void put_osd_con(struct ceph_connection *con)
1467{
1468 struct ceph_osd *osd = con->private;
1469 put_osd(osd);
1470}
1471
1472/*
1473 * authentication
1474 */
1475static int get_authorizer(struct ceph_connection *con,
1476 void **buf, int *len, int *proto,
1477 void **reply_buf, int *reply_len, int force_new)
1478{
1479 struct ceph_osd *o = con->private;
1480 struct ceph_osd_client *osdc = o->o_osdc;
1481 struct ceph_auth_client *ac = osdc->client->monc.auth;
1482 int ret = 0;
1483
1484 if (force_new && o->o_authorizer) {
1485 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1486 o->o_authorizer = NULL;
1487 }
1488 if (o->o_authorizer == NULL) {
1489 ret = ac->ops->create_authorizer(
1490 ac, CEPH_ENTITY_TYPE_OSD,
1491 &o->o_authorizer,
1492 &o->o_authorizer_buf,
1493 &o->o_authorizer_buf_len,
1494 &o->o_authorizer_reply_buf,
1495 &o->o_authorizer_reply_buf_len);
1496 if (ret)
1497 return ret;
1498 }
1499
1500 *proto = ac->protocol;
1501 *buf = o->o_authorizer_buf;
1502 *len = o->o_authorizer_buf_len;
1503 *reply_buf = o->o_authorizer_reply_buf;
1504 *reply_len = o->o_authorizer_reply_buf_len;
1505 return 0;
1506}
1507
1508
1509static int verify_authorizer_reply(struct ceph_connection *con, int len)
1510{
1511 struct ceph_osd *o = con->private;
1512 struct ceph_osd_client *osdc = o->o_osdc;
1513 struct ceph_auth_client *ac = osdc->client->monc.auth;
1514
1515 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1516}
1517
1518static int invalidate_authorizer(struct ceph_connection *con)
1519{
1520 struct ceph_osd *o = con->private;
1521 struct ceph_osd_client *osdc = o->o_osdc;
1522 struct ceph_auth_client *ac = osdc->client->monc.auth;
1523
1524 if (ac->ops->invalidate_authorizer)
1525 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1526
1527 return ceph_monc_validate_auth(&osdc->client->monc);
1528}
1529
1530static const struct ceph_connection_operations osd_con_ops = {
1531 .get = get_osd_con,
1532 .put = put_osd_con,
1533 .dispatch = dispatch,
1534 .get_authorizer = get_authorizer,
1535 .verify_authorizer_reply = verify_authorizer_reply,
1536 .invalidate_authorizer = invalidate_authorizer,
1537 .alloc_msg = alloc_msg,
1538 .fault = osd_reset,
1539};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6a..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index e31f118f1392..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
28 "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
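
/*
 * Worked example (editor's addition): masks produced above for a few
 * pg_num values.  calc_bits_of(n - 1) bits are kept, so the mask spans
 * the smallest power-of-two range that can index n pgs:
 *
 *	pg_num	pg_num-1	bits	mask
 *	   8	 7 (0b0111)	 3	 0x7
 *	  12	11 (0b1011)	 4	 0xf
 *	  16	15 (0b1111)	 4	 0xf
 */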
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
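
/*
 * Illustrative sketch (editor's addition): pgid_cmp() orders pgids by
 * their raw 64-bit encoding; any strict total order works as an rbtree
 * key.  The same comparison without the pointer cast (which relies on
 * struct ceph_pg being exactly 64 bits):
 */
#include <stdint.h>
#include <string.h>

static int u64_key_cmp(const void *l, const void *r)
{
	uint64_t a, b;

	memcpy(&a, l, sizeof(a));	/* avoids strict-aliasing issues */
	memcpy(&b, r, sizeof(b));
	return (a < b) ? -1 : (a > b) ? 1 : 0;
}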
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{
429 unsigned n, m;
430
431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
432 calc_pg_masks(pi);
433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
451}
452
453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
454{
455 struct ceph_pg_pool_info *pi;
456 u32 num, len, pool;
457
458 ceph_decode_32_safe(p, end, num, bad);
459 dout(" %d pool names\n", num);
460 while (num--) {
461 ceph_decode_32_safe(p, end, pool, bad);
462 ceph_decode_32_safe(p, end, len, bad);
463 dout(" pool %d len %d\n", pool, len);
464 pi = __lookup_pg_pool(&map->pg_pools, pool);
465 if (pi) {
466 kfree(pi->name);
467 pi->name = kmalloc(len + 1, GFP_NOFS);
468 if (pi->name) {
469 memcpy(pi->name, *p, len);
470 pi->name[len] = '\0';
471 dout(" name is %s\n", pi->name);
472 }
473 }
474 *p += len;
475 }
476 return 0;
477
478bad:
479 return -EINVAL;
480}
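
All of the *_safe decode helpers used above share one pattern: verify that the buffer has room before advancing *p, and jump to the caller-supplied bad label on truncated input. Roughly, as a paraphrase of the helpers in decode.h (not the verbatim definitions):

#define ceph_decode_need(p, end, n, bad)			\
	do {							\
		if (unlikely((size_t)((end) - *(p)) < (n)))	\
			goto bad;				\
	} while (0)

static inline u32 ceph_decode_32(void **p)
{
	u32 v = get_unaligned_le32(*p);	/* wire format is little-endian */

	*p += sizeof(u32);
	return v;
}

#define ceph_decode_32_safe(p, end, v, bad)			\
	do {							\
		ceph_decode_need(p, end, sizeof(u32), bad);	\
		v = ceph_decode_32(p);				\
	} while (0)
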
481
482/*
483 * osd map
484 */
485void ceph_osdmap_destroy(struct ceph_osdmap *map)
486{
487 dout("osdmap_destroy %p\n", map);
488 if (map->crush)
489 crush_destroy(map->crush);
490 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
491 struct ceph_pg_mapping *pg =
492 rb_entry(rb_first(&map->pg_temp),
493 struct ceph_pg_mapping, node);
494 rb_erase(&pg->node, &map->pg_temp);
495 kfree(pg);
496 }
497 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
498 struct ceph_pg_pool_info *pi =
499 rb_entry(rb_first(&map->pg_pools),
500 struct ceph_pg_pool_info, node);
501 __remove_pg_pool(&map->pg_pools, pi);
502 }
503 kfree(map->osd_state);
504 kfree(map->osd_weight);
505 kfree(map->osd_addr);
506 kfree(map);
507}
508
509/*
510 * adjust max osd value. reallocate arrays.
511 */
512static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
513{
514 u8 *state;
515 struct ceph_entity_addr *addr;
516 u32 *weight;
517
518 state = kcalloc(max, sizeof(*state), GFP_NOFS);
519 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
520 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
521 if (state == NULL || addr == NULL || weight == NULL) {
522 kfree(state);
523 kfree(addr);
524 kfree(weight);
525 return -ENOMEM;
526 }
527
528 /* copy old? */
529 if (map->osd_state) {
530 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
531 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
532 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
533 kfree(map->osd_state);
534 kfree(map->osd_addr);
535 kfree(map->osd_weight);
536 }
537
538 map->osd_state = state;
539 map->osd_weight = weight;
540 map->osd_addr = addr;
541 map->max_osd = max;
542 return 0;
543}
544
545/*
546 * decode a full map.
547 */
548struct ceph_osdmap *osdmap_decode(void **p, void *end)
549{
550 struct ceph_osdmap *map;
551 u16 version;
552 u32 len, max, i;
553 u8 ev;
554 int err = -EINVAL;
555 void *start = *p;
556 struct ceph_pg_pool_info *pi;
557
558 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
559
560 map = kzalloc(sizeof(*map), GFP_NOFS);
561 if (map == NULL)
562 return ERR_PTR(-ENOMEM);
563 map->pg_temp = RB_ROOT;
564
565 ceph_decode_16_safe(p, end, version, bad);
566 if (version > CEPH_OSDMAP_VERSION) {
567 pr_warning("got unknown v %d > %d of osdmap\n", version,
568 CEPH_OSDMAP_VERSION);
569 goto bad;
570 }
571
572 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
573 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
574 map->epoch = ceph_decode_32(p);
575 ceph_decode_copy(p, &map->created, sizeof(map->created));
576 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
577
578 ceph_decode_32_safe(p, end, max, bad);
579 while (max--) {
580 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
581 pi = kzalloc(sizeof(*pi), GFP_NOFS);
582 if (!pi)
583 goto bad;
584 pi->id = ceph_decode_32(p);
585 ev = ceph_decode_8(p); /* encoding version */
586 if (ev > CEPH_PG_POOL_VERSION) {
587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
588 ev, CEPH_PG_POOL_VERSION);
589 kfree(pi);
590 goto bad;
591 }
592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
595 __insert_pg_pool(&map->pg_pools, pi);
596 }
597
598 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
599 goto bad;
600
601 ceph_decode_32_safe(p, end, map->pool_max, bad);
602
603 ceph_decode_32_safe(p, end, map->flags, bad);
604
605 max = ceph_decode_32(p);
606
607 /* (re)alloc osd arrays */
608 err = osdmap_set_max_osd(map, max);
609 if (err < 0)
610 goto bad;
611 dout("osdmap_decode max_osd = %d\n", map->max_osd);
612
613 /* osds */
614 err = -EINVAL;
615 ceph_decode_need(p, end, 3*sizeof(u32) +
616 map->max_osd*(1 + sizeof(*map->osd_weight) +
617 sizeof(*map->osd_addr)), bad);
618 *p += 4; /* skip length field (should match max) */
619 ceph_decode_copy(p, map->osd_state, map->max_osd);
620
621 *p += 4; /* skip length field (should match max) */
622 for (i = 0; i < map->max_osd; i++)
623 map->osd_weight[i] = ceph_decode_32(p);
624
625 *p += 4; /* skip length field (should match max) */
626 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
627 for (i = 0; i < map->max_osd; i++)
628 ceph_decode_addr(&map->osd_addr[i]);
629
630 /* pg_temp */
631 ceph_decode_32_safe(p, end, len, bad);
632 for (i = 0; i < len; i++) {
633 int n, j;
634 struct ceph_pg pgid;
635 struct ceph_pg_mapping *pg;
636
637 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
638 ceph_decode_copy(p, &pgid, sizeof(pgid));
639 n = ceph_decode_32(p);
640 ceph_decode_need(p, end, n * sizeof(u32), bad);
641 err = -ENOMEM;
642 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
643 if (!pg)
644 goto bad;
645 pg->pgid = pgid;
646 pg->len = n;
647 for (j = 0; j < n; j++)
648 pg->osds[j] = ceph_decode_32(p);
649
650 err = __insert_pg_mapping(pg, &map->pg_temp);
651 if (err)
652 goto bad;
653 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
654 }
655
656 /* crush */
657 ceph_decode_32_safe(p, end, len, bad);
658 dout("osdmap_decode crush len %d from off 0x%x\n", len,
659 (int)(*p - start));
660 ceph_decode_need(p, end, len, bad);
661 map->crush = crush_decode(*p, end);
662 *p += len;
663 if (IS_ERR(map->crush)) {
664 err = PTR_ERR(map->crush);
665 map->crush = NULL;
666 goto bad;
667 }
668
669 /* ignore the rest of the map */
670 *p = end;
671
672 dout("osdmap_decode done %p %p\n", *p, end);
673 return map;
674
675bad:
676 dout("osdmap_decode fail\n");
677 ceph_osdmap_destroy(map);
678 return ERR_PTR(err);
679}
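
For context, a sketch of how a caller might feed a CEPH_MSG_OSD_MAP payload into osdmap_decode(); the real caller lives in the osd_client map handling, and the msg/oldmap locals here are illustrative assumptions:

	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;
	struct ceph_osdmap *newmap;

	newmap = osdmap_decode(&p, end);
	if (IS_ERR(newmap))
		return PTR_ERR(newmap);
	/* on success, releasing any previous map is the caller's job */
	if (oldmap)
		ceph_osdmap_destroy(oldmap);
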
680
681/*
682 * decode and apply an incremental map update.
683 */
684struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
685 struct ceph_osdmap *map,
686 struct ceph_messenger *msgr)
687{
688 struct crush_map *newcrush = NULL;
689 struct ceph_fsid fsid;
690 u32 epoch = 0;
691 struct ceph_timespec modified;
692 u32 len, pool;
693 __s32 new_pool_max, new_flags, max;
694 void *start = *p;
695 int err = -EINVAL;
696 u16 version;
697 struct rb_node *rbp;
698
699 ceph_decode_16_safe(p, end, version, bad);
700 if (version > CEPH_OSDMAP_INC_VERSION) {
701 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
702 CEPH_OSDMAP_INC_VERSION);
703 goto bad;
704 }
705
706 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
707 bad);
708 ceph_decode_copy(p, &fsid, sizeof(fsid));
709 epoch = ceph_decode_32(p);
710 BUG_ON(epoch != map->epoch+1);
711 ceph_decode_copy(p, &modified, sizeof(modified));
712 new_pool_max = ceph_decode_32(p);
713 new_flags = ceph_decode_32(p);
714
715 /* full map? */
716 ceph_decode_32_safe(p, end, len, bad);
717 if (len > 0) {
718 dout("apply_incremental full map len %d, %p to %p\n",
719 len, *p, end);
720 return osdmap_decode(p, min(*p+len, end));
721 }
722
723 /* new crush? */
724 ceph_decode_32_safe(p, end, len, bad);
725 if (len > 0) {
726 dout("apply_incremental new crush map len %d, %p to %p\n",
727 len, *p, end);
728 newcrush = crush_decode(*p, min(*p+len, end));
729 if (IS_ERR(newcrush))
730 return ERR_CAST(newcrush);
731 *p += len;
732 }
733
734 /* new flags? */
735 if (new_flags >= 0)
736 map->flags = new_flags;
737 if (new_pool_max >= 0)
738 map->pool_max = new_pool_max;
739
740 ceph_decode_need(p, end, 5*sizeof(u32), bad);
741
742 /* new max? */
743 max = ceph_decode_32(p);
744 if (max >= 0) {
745 err = osdmap_set_max_osd(map, max);
746 if (err < 0)
747 goto bad;
748 }
749
750 map->epoch++;
751 map->modified = modified;
752 if (newcrush) {
753 if (map->crush)
754 crush_destroy(map->crush);
755 map->crush = newcrush;
756 newcrush = NULL;
757 }
758
759 /* new_pool */
760 ceph_decode_32_safe(p, end, len, bad);
761 while (len--) {
762 __u8 ev;
763 struct ceph_pg_pool_info *pi;
764
765 ceph_decode_32_safe(p, end, pool, bad);
766 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
767 ev = ceph_decode_8(p); /* encoding version */
768 if (ev > CEPH_PG_POOL_VERSION) {
769 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
770 ev, CEPH_PG_POOL_VERSION);
771 goto bad;
772 }
773 pi = __lookup_pg_pool(&map->pg_pools, pool);
774 if (!pi) {
775 pi = kzalloc(sizeof(*pi), GFP_NOFS);
776 if (!pi) {
777 err = -ENOMEM;
778 goto bad;
779 }
780 pi->id = pool;
781 __insert_pg_pool(&map->pg_pools, pi);
782 }
783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
786 }
787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
788 goto bad;
789
790 /* old_pool */
791 ceph_decode_32_safe(p, end, len, bad);
792 while (len--) {
793 struct ceph_pg_pool_info *pi;
794
795 ceph_decode_32_safe(p, end, pool, bad);
796 pi = __lookup_pg_pool(&map->pg_pools, pool);
797 if (pi)
798 __remove_pg_pool(&map->pg_pools, pi);
799 }
800
801 /* new_up */
802 err = -EINVAL;
803 ceph_decode_32_safe(p, end, len, bad);
804 while (len--) {
805 u32 osd;
806 struct ceph_entity_addr addr;
807 ceph_decode_32_safe(p, end, osd, bad);
808 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
809 ceph_decode_addr(&addr);
810 pr_info("osd%d up\n", osd);
811 BUG_ON(osd >= map->max_osd);
812 map->osd_state[osd] |= CEPH_OSD_UP;
813 map->osd_addr[osd] = addr;
814 }
815
816 /* new_down */
817 ceph_decode_32_safe(p, end, len, bad);
818 while (len--) {
819 u32 osd;
820 ceph_decode_32_safe(p, end, osd, bad);
821 (*p)++; /* clean flag */
822 pr_info("osd%d down\n", osd);
823 if (osd < map->max_osd)
824 map->osd_state[osd] &= ~CEPH_OSD_UP;
825 }
826
827 /* new_weight */
828 ceph_decode_32_safe(p, end, len, bad);
829 while (len--) {
830 u32 osd, off;
831 ceph_decode_need(p, end, sizeof(u32)*2, bad);
832 osd = ceph_decode_32(p);
833 off = ceph_decode_32(p);
834 pr_info("osd%d weight 0x%x %s\n", osd, off,
835 off == CEPH_OSD_IN ? "(in)" :
836 (off == CEPH_OSD_OUT ? "(out)" : ""));
837 if (osd < map->max_osd)
838 map->osd_weight[osd] = off;
839 }
840
841 /* new_pg_temp */
842 rbp = rb_first(&map->pg_temp);
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 struct ceph_pg_mapping *pg;
846 int j;
847 struct ceph_pg pgid;
848 u32 pglen;
849 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
850 ceph_decode_copy(p, &pgid, sizeof(pgid));
851 pglen = ceph_decode_32(p);
852
853 /* remove any? */
854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
855 node)->pgid, pgid) <= 0) {
856 struct ceph_pg_mapping *cur =
857 rb_entry(rbp, struct ceph_pg_mapping, node);
858
859 rbp = rb_next(rbp);
860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
861 rb_erase(&cur->node, &map->pg_temp);
862 kfree(cur);
863 }
864
865 if (pglen) {
866 /* insert */
867 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
868 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
869 if (!pg) {
870 err = -ENOMEM;
871 goto bad;
872 }
873 pg->pgid = pgid;
874 pg->len = pglen;
875 for (j = 0; j < pglen; j++)
876 pg->osds[j] = ceph_decode_32(p);
877 err = __insert_pg_mapping(pg, &map->pg_temp);
878 if (err) {
879 kfree(pg);
880 goto bad;
881 }
882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
883 pglen);
884 }
885 }
886 while (rbp) {
887 struct ceph_pg_mapping *cur =
888 rb_entry(rbp, struct ceph_pg_mapping, node);
889
890 rbp = rb_next(rbp);
891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
892 rb_erase(&cur->node, &map->pg_temp);
893 kfree(cur);
894 }
895
896 /* ignore the rest */
897 *p = end;
898 return map;
899
900bad:
901 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
902 epoch, (int)(*p - start), *p, start, end);
903 print_hex_dump(KERN_DEBUG, "osdmap: ",
904 DUMP_PREFIX_OFFSET, 16, 1,
905 start, end - start, true);
906 if (newcrush)
907 crush_destroy(newcrush);
908 return ERR_PTR(err);
909}
910
911
912
913
914/*
915 * calculate file layout from given offset, length.
916 * fill in correct oid, logical length, and object extent
917 * offset, length.
918 *
919 * for now, we write only a single su, until we can
920 * pass a stride back to the caller.
921 */
922void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
923 u64 off, u64 *plen,
924 u64 *ono,
925 u64 *oxoff, u64 *oxlen)
926{
927 u32 osize = le32_to_cpu(layout->fl_object_size);
928 u32 su = le32_to_cpu(layout->fl_stripe_unit);
929 u32 sc = le32_to_cpu(layout->fl_stripe_count);
930 u32 bl, stripeno, stripepos, objsetno;
931 u32 su_per_object;
932 u64 t, su_offset;
933
934 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
935 osize, su);
936 su_per_object = osize / su;
937 dout("osize %u / su %u = su_per_object %u\n", osize, su,
938 su_per_object);
939
940 BUG_ON((su & ~PAGE_MASK) != 0);
941 /* bl = *off / su; */
942 t = off;
943 do_div(t, su);
944 bl = t;
945 dout("off %llu / su %u = bl %u\n", off, su, bl);
946
947 stripeno = bl / sc;
948 stripepos = bl % sc;
949 objsetno = stripeno / su_per_object;
950
951 *ono = objsetno * sc + stripepos;
952 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
953
954 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
955 t = off;
956 su_offset = do_div(t, su);
957 *oxoff = su_offset + (stripeno % su_per_object) * su;
958
959 /*
960 * Calculate the length of the extent being written to the selected
961 * object. This is the minimum of the full length requested (plen) or
962 * the remainder of the current stripe being written to.
963 */
964 *oxlen = min_t(u64, *plen, su - su_offset);
965 *plen = *oxlen;
966
967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
968}
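
A worked example may make the striping arithmetic concrete; the numbers below are illustrative, not from the source:

/*
 * Example layout: su = 65536 (64 KB), sc = 3, object_size = 262144 (256 KB)
 *   => su_per_object = 262144 / 65536 = 4
 *
 * For off = 300000, *plen = 100000:
 *   bl        = 300000 / 65536            = 4      (stripe block)
 *   stripeno  = 4 / 3                     = 1
 *   stripepos = 4 % 3                     = 1
 *   objsetno  = 1 / 4                     = 0
 *   *ono      = 0 * 3 + 1                 = 1      (second object)
 *   su_offset = 300000 % 65536            = 37856
 *   *oxoff    = 37856 + (1 % 4) * 65536   = 103392
 *   *oxlen    = min(100000, 65536 - 37856) = 27680
 *
 * Only the remainder of the current stripe unit is mapped; the caller
 * must loop to cover the rest of the requested 100000 bytes.
 */
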
969
970/*
971 * calculate an object layout (i.e. pgid) from an oid,
972 * file_layout, and osdmap
973 */
974int ceph_calc_object_layout(struct ceph_object_layout *ol,
975 const char *oid,
976 struct ceph_file_layout *fl,
977 struct ceph_osdmap *osdmap)
978{
979 unsigned num, num_mask;
980 struct ceph_pg pgid;
981 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
982 int poolid = le32_to_cpu(fl->fl_pg_pool);
983 struct ceph_pg_pool_info *pool;
984 unsigned ps;
985
986 BUG_ON(!osdmap);
987
988 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
989 if (!pool)
990 return -EIO;
991 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
992 if (preferred >= 0) {
993 ps += preferred;
994 num = le32_to_cpu(pool->v.lpg_num);
995 num_mask = pool->lpg_num_mask;
996 } else {
997 num = le32_to_cpu(pool->v.pg_num);
998 num_mask = pool->pg_num_mask;
999 }
1000
1001 pgid.ps = cpu_to_le16(ps);
1002 pgid.preferred = cpu_to_le16(preferred);
1003 pgid.pool = fl->fl_pg_pool;
1004 if (preferred >= 0)
1005 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1006 (int)preferred);
1007 else
1008 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1009
1010 ol->ol_pgid = pgid;
1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1012 return 0;
1013}
1014
1015/*
1016 * Calculate raw osd vector for the given pgid. Return pointer to osd
1017 * array, or NULL on failure.
1018 */
1019static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1020 int *osds, int *num)
1021{
1022 struct ceph_pg_mapping *pg;
1023 struct ceph_pg_pool_info *pool;
1024 int ruleno;
1025 unsigned poolid, ps, pps;
1026 int preferred;
1027
1028 /* pg_temp? */
1029 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1030 if (pg) {
1031 *num = pg->len;
1032 return pg->osds;
1033 }
1034
1035 /* crush */
1036 poolid = le32_to_cpu(pgid.pool);
1037 ps = le16_to_cpu(pgid.ps);
1038 preferred = (s16)le16_to_cpu(pgid.preferred);
1039
1040 /* don't forcefeed bad device ids to crush */
1041 if (preferred >= osdmap->max_osd ||
1042 preferred >= osdmap->crush->max_devices)
1043 preferred = -1;
1044
1045 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1046 if (!pool)
1047 return NULL;
1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1049 pool->v.type, pool->v.size);
1050 if (ruleno < 0) {
1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1054 return NULL;
1055 }
1056
1057 if (preferred >= 0)
1058 pps = ceph_stable_mod(ps,
1059 le32_to_cpu(pool->v.lpgp_num),
1060 pool->lpgp_num_mask);
1061 else
1062 pps = ceph_stable_mod(ps,
1063 le32_to_cpu(pool->v.pgp_num),
1064 pool->pgp_num_mask);
1065 pps += poolid;
1066 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1067 min_t(int, pool->v.size, *num),
1068 preferred, osdmap->osd_weight);
1069 return osds;
1070}
1071
1072/*
1073 * Return acting set for given pgid.
1074 */
1075int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1076 int *acting)
1077{
1078 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1079 int i, o, num = CEPH_PG_MAX_SIZE;
1080
1081 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1082 if (!osds)
1083 return -1;
1084
1085 /* primary is first up osd */
1086 o = 0;
1087 for (i = 0; i < num; i++)
1088 if (ceph_osd_is_up(osdmap, osds[i]))
1089 acting[o++] = osds[i];
1090 return o;
1091}
1092
1093/*
1094 * Return primary osd for given pgid, or -1 if none.
1095 */
1096int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1097{
1098 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1099 int i, num = CEPH_PG_MAX_SIZE;
1100
1101 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1102 if (!osds)
1103 return -1;
1104
1105 /* primary is first up osd */
1106 for (i = 0; i < num; i++)
1107 if (ceph_osd_is_up(osdmap, osds[i]))
1108 return osds[i];
1109 return -1;
1110}
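
Taken together with ceph_calc_file_object_mapping() above, these helpers form the full file -> object -> PG -> OSD pipeline. A hypothetical caller sketch; the locals and send_request_to() are illustrative, not the actual client code:

	u64 ono, oxoff, oxlen, len = 4096;	/* bytes the caller wants */
	char oid[40];
	struct ceph_object_layout ol;
	int osds[CEPH_PG_MAX_SIZE], num;

	ceph_calc_file_object_mapping(layout, off, &len, &ono, &oxoff, &oxlen);
	snprintf(oid, sizeof(oid), "%llx.%08llx", ino, ono);
	if (ceph_calc_object_layout(&ol, oid, layout, osdmap) < 0)
		return;
	num = ceph_calc_pg_acting(osdmap, ol.ol_pgid, osds);
	if (num > 0)
		send_request_to(osds[0]);	/* osds[0] is the primary */
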
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510d..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
15int ceph_pagelist_release(struct ceph_pagelist *pl)
16{
17 if (pl->mapped_tail)
18 ceph_pagelist_unmap_tail(pl);
19
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 return 0;
27}
28
29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
30{
31 struct page *page = __page_cache_alloc(GFP_NOFS);
32 if (!page)
33 return -ENOMEM;
34 pl->room += PAGE_SIZE;
35 list_add_tail(&page->lru, &pl->head);
36 if (pl->mapped_tail)
37 ceph_pagelist_unmap_tail(pl);
38 pl->mapped_tail = kmap(page);
39 return 0;
40}
41
42int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
43{
44 while (pl->room < len) {
45 size_t bit = pl->room;
46 int ret;
47
48 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
49 buf, bit);
50 pl->length += bit;
51 pl->room -= bit;
52 buf += bit;
53 len -= bit;
54 ret = ceph_pagelist_addpage(pl);
55 if (ret)
56 return ret;
57 }
58
59 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
60 pl->length += len;
61 pl->room -= len;
62 return 0;
63}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e1087..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
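
The API is append-only: encode helpers grow the list a page at a time and only the tail page stays kmapped. A minimal usage sketch, hypothetical and with the payload values invented:

static int pagelist_example(void)
{
	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 1);		/* e.g. a version */
	if (!err)
		err = ceph_pagelist_encode_string(&pl, "foo", 3);
	/* on success, pl.head/pl.length would be handed to the messenger */
	ceph_pagelist_release(&pl);			/* frees all pages */
	return err;
}
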
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
1#ifndef CEPH_RADOS_H
2#define CEPH_RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
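
A worked example (values illustrative): b = 12, bmask = 15.

/*
 *   x = 5:   5 & 15 = 5   < 12  -> 5
 *   x = 13: 13 & 15 = 13 >= 12  -> 13 & 7 = 5
 *
 * As b grows from 12 toward 16 (same bmask), only inputs whose low bits
 * fall in 12..15 move to a new bin; every other input keeps its mapping,
 * which is what makes the mod "stable" as pg counts increase.
 */
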
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
207
208 /** attrs **/
209 /* read */
210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
213
214 /* write */
215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
216 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
217 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
218 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
219
220 /** subop **/
221 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
222 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
223 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
224 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
225 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
226
227 /** lock **/
228 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
229 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
230 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
231 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
232 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
233 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
234
235 /** exec **/
236 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
237
238 /** pg **/
239 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
240};
241
242static inline int ceph_osd_op_type_lock(int op)
243{
244 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
245}
246static inline int ceph_osd_op_type_data(int op)
247{
248 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
249}
250static inline int ceph_osd_op_type_attr(int op)
251{
252 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
253}
254static inline int ceph_osd_op_type_exec(int op)
255{
256 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
257}
258static inline int ceph_osd_op_type_pg(int op)
259{
260 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
261}
262
263static inline int ceph_osd_op_mode_subop(int op)
264{
265 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
266}
267static inline int ceph_osd_op_mode_read(int op)
268{
269 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
270}
271static inline int ceph_osd_op_mode_modify(int op)
272{
273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
274}
275
276/*
277 * note that the following tmap stuff is also defined in the ceph librados.h
278 * any modification here needs to be updated there
279 */
280#define CEPH_OSD_TMAP_HDR 'h'
281#define CEPH_OSD_TMAP_SET 's'
282#define CEPH_OSD_TMAP_RM 'r'
283
284extern const char *ceph_osd_op_name(int op);
285
286
287/*
288 * osd op flags
289 *
290 * An op may be READ, WRITE, or READ|WRITE.
291 */
292enum {
293 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
294 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
295 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
296 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
297 CEPH_OSD_FLAG_READ = 16, /* op may read */
298 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
299 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
300 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
301 CEPH_OSD_FLAG_BALANCE_READS = 256,
302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
306};
307
308enum {
309 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
310};
311
312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
330
331/*
332 * an individual object operation. each may be accompanied by some data
333 * payload
334 */
335struct ceph_osd_op {
336 __le16 op; /* CEPH_OSD_OP_* */
337 __le32 flags; /* CEPH_OSD_FLAG_* */
338 union {
339 struct {
340 __le64 offset, length;
341 __le64 truncate_size;
342 __le32 truncate_seq;
343 } __attribute__ ((packed)) extent;
344 struct {
345 __le32 name_len;
346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
349 } __attribute__ ((packed)) xattr;
350 struct {
351 __u8 class_len;
352 __u8 method_len;
353 __u8 argc;
354 __le32 indata_len;
355 } __attribute__ ((packed)) cls;
356 struct {
357 __le64 cookie, count;
358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
362 };
363 __le32 payload_len;
364} __attribute__ ((packed));
365
366/*
367 * osd request message header. each request may include multiple
368 * ceph_osd_op object operations.
369 */
370struct ceph_osd_request_head {
371 __le32 client_inc; /* client incarnation */
372 struct ceph_object_layout layout; /* pgid */
373 __le32 osdmap_epoch; /* client's osdmap epoch */
374
375 __le32 flags;
376
377 struct ceph_timespec mtime; /* for mutations only */
378 struct ceph_eversion reassert_version; /* if we are replaying op */
379
380 __le32 object_len; /* length of object name */
381
382 __le64 snapid; /* snapid to read */
383 __le64 snap_seq; /* writer's snap context */
384 __le32 num_snaps;
385
386 __le16 num_ops;
387 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
388} __attribute__ ((packed));
389
390struct ceph_osd_reply_head {
391 __le32 client_inc; /* client incarnation */
392 __le32 flags;
393 struct ceph_object_layout layout;
394 __le32 osdmap_epoch;
395 struct ceph_eversion reassert_version; /* for replaying uncommitted */
396
397 __le32 result; /* result code */
398
399 __le32 object_len; /* length of object name */
400 __le32 num_ops;
401 struct ceph_osd_op ops[0]; /* ops[], object */
402} __attribute__ ((packed));
403
404
405#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
526 struct ceph_cap_snap *capsnap) 528 struct ceph_cap_snap *capsnap)
527{ 529{
528 struct inode *inode = &ci->vfs_inode; 530 struct inode *inode = &ci->vfs_inode;
529 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 531 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
530 532
531 BUG_ON(capsnap->writing); 533 BUG_ON(capsnap->writing);
532 capsnap->size = inode->i_size; 534 capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
747 struct ceph_mds_session *session, 749 struct ceph_mds_session *session,
748 struct ceph_msg *msg) 750 struct ceph_msg *msg)
749{ 751{
750 struct super_block *sb = mdsc->client->sb; 752 struct super_block *sb = mdsc->fsc->sb;
751 int mds = session->s_mds; 753 int mds = session->s_mds;
752 u64 split; 754 u64 split;
753 int op; 755 int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
 /*
- * Ceph string constants
+ * Ceph fs string constants
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
-const char *ceph_entity_type_name(int type)
-{
-	switch (type) {
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_AUTH: return "auth";
-	default: return "unknown";
-	}
-}
-
-const char *ceph_osd_op_name(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_READ: return "read";
-	case CEPH_OSD_OP_STAT: return "stat";
-
-	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-	case CEPH_OSD_OP_WRITE: return "write";
-	case CEPH_OSD_OP_DELETE: return "delete";
-	case CEPH_OSD_OP_TRUNCATE: return "truncate";
-	case CEPH_OSD_OP_ZERO: return "zero";
-	case CEPH_OSD_OP_WRITEFULL: return "writefull";
-	case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-	case CEPH_OSD_OP_APPEND: return "append";
-	case CEPH_OSD_OP_STARTSYNC: return "startsync";
-	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-	case CEPH_OSD_OP_TMAPUP: return "tmapup";
-	case CEPH_OSD_OP_TMAPGET: return "tmapget";
-	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-	case CEPH_OSD_OP_GETXATTR: return "getxattr";
-	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-	case CEPH_OSD_OP_SETXATTR: return "setxattr";
-	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-	case CEPH_OSD_OP_PULL: return "pull";
-	case CEPH_OSD_OP_PUSH: return "push";
-	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-	case CEPH_OSD_OP_SCRUB: return "scrub";
-
-	case CEPH_OSD_OP_WRLOCK: return "wrlock";
-	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-	case CEPH_OSD_OP_RDLOCK: return "rdlock";
-	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-	case CEPH_OSD_OP_UPLOCK: return "uplock";
-	case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-	case CEPH_OSD_OP_CALL: return "call";
-
-	case CEPH_OSD_OP_PGLS: return "pgls";
-	}
-	return "???";
-}
 
 const char *ceph_mds_state_name(int s)
 {
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
-
-const char *ceph_pool_op_name(int op)
-{
-	switch (op) {
-	case POOL_OP_CREATE: return "create";
-	case POOL_OP_DELETE: return "delete";
-	case POOL_OP_AUID_CHANGE: return "auid change";
-	case POOL_OP_CREATE_SNAP: return "create snap";
-	case POOL_OP_DELETE_SNAP: return "delete snap";
-	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-	}
-	return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..9c5085465a63 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/ctype.h>
@@ -15,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * Ceph superblock operations
@@ -26,36 +29,22 @@
  * Handle the basics of mounting, unmounting.
  */
 
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	const char *e = s + len;
-
-	while (e != s && *(e-1) != '/')
-		e--;
-	return e;
-}
-
-
 /*
  * super ops
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 
 	/*
 	 * ensure we release the bdi before put_anon_super releases
 	 * the device name.
 	 */
-	if (s->s_bdi == &client->backing_dev_info) {
-		bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &fsc->backing_dev_info) {
+		bdi_unregister(&fsc->backing_dev_info);
 		s->s_bdi = NULL;
 	}
 
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
 
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = fsc->client->monc.monmap;
 	struct ceph_statfs st;
 	u64 fsid;
 	int err;
 
 	dout("statfs\n");
-	err = ceph_monc_do_statfs(&client->monc, &st);
+	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
 	if (err < 0)
 		return err;
 
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 
 	if (!wait) {
 		dout("sync_fs (non-blocking)\n");
-		ceph_flush_dirty_caps(&client->mdsc);
+		ceph_flush_dirty_caps(fsc->mdsc);
 		dout("sync_fs (non-blocking) done\n");
 		return 0;
 	}
 
 	dout("sync_fs (blocking)\n");
-	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+	ceph_osdc_sync(&fsc->client->osdc);
+	ceph_mdsc_sync(fsc->mdsc);
 	dout("sync_fs (blocking) done\n");
 	return 0;
 }
 
-static int default_congestion_kb(void)
-{
-	int congestion_kb;
-
-	/*
-	 * Copied from NFS
-	 *
-	 * congestion size, scale with available memory.
-	 *
-	 *  64MB:    8192k
-	 * 128MB:   11585k
-	 * 256MB:   16384k
-	 * 512MB:   23170k
-	 *   1GB:   32768k
-	 *   2GB:   46340k
-	 *   4GB:   65536k
-	 *   8GB:   92681k
-	 *  16GB:  131072k
-	 *
-	 * This allows larger machines to have larger/more transfers.
-	 * Limit the default to 256M
-	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-	if (congestion_kb > 256*1024)
-		congestion_kb = 256*1024;
-
-	return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsid=%pU", &args->fsid);
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
-
-	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-		seq_printf(m, ",osdkeepalivetimeout=%d",
-			   args->osd_keepalive_timeout);
-	if (args->wsize)
-		seq_printf(m, ",wsize=%d", args->wsize);
-	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-		seq_printf(m, ",rsize=%d", args->rsize);
-	if (args->congestion_kb != default_congestion_kb())
-		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_min=%d",
-			   args->caps_wanted_delay_min);
-	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_max=%d",
-			   args->caps_wanted_delay_max);
-	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-		seq_printf(m, ",cap_release_safety=%d",
-			   args->cap_release_safety);
-	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->name)
-		seq_printf(m, ",name=%s", args->name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-	struct ceph_inode_info *ci = foo;
-	inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-				      sizeof(struct ceph_inode_info),
-				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
-	if (ceph_inode_cachep == NULL)
-		return -ENOMEM;
-
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_cap_cachep == NULL)
-		goto bad_cap;
-
-	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_dentry_cachep == NULL)
-		goto bad_dentry;
-
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_file_cachep == NULL)
-		goto bad_file;
-
-	return 0;
-
-bad_file:
-	kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-	kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-	kmem_cache_destroy(ceph_inode_cachep);
-	return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-	kmem_cache_destroy(ceph_inode_cachep);
-	kmem_cache_destroy(ceph_cap_cachep);
-	kmem_cache_destroy(ceph_dentry_cachep);
-	kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount. Tear down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-	struct ceph_client *client = ceph_sb_to_client(sb);
-
-	dout("ceph_umount_begin - starting forced umount\n");
-	if (!client)
-		return;
-	client->mount_state = CEPH_MOUNT_SHUTDOWN;
-	return;
-}
-
-static const struct super_operations ceph_super_ops = {
-	.alloc_inode	= ceph_alloc_inode,
-	.destroy_inode	= ceph_destroy_inode,
-	.write_inode	= ceph_write_inode,
-	.sync_fs	= ceph_sync_fs,
-	.put_super	= ceph_put_super,
-	.show_options	= ceph_show_options,
-	.statfs		= ceph_statfs,
-	.umount_begin	= ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-	switch (type) {
-	case CEPH_MSG_SHUTDOWN: return "shutdown";
-	case CEPH_MSG_PING: return "ping";
-	case CEPH_MSG_AUTH: return "auth";
-	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-	case CEPH_MSG_MON_MAP: return "mon_map";
-	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-	case CEPH_MSG_STATFS: return "statfs";
-	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-	case CEPH_MSG_MDS_MAP: return "mds_map";
-	case CEPH_MSG_CLIENT_SESSION: return "client_session";
-	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-	case CEPH_MSG_OSD_MAP: return "osd_map";
-	case CEPH_MSG_OSD_OP: return "osd_op";
-	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-	default: return "unknown";
-	}
-}
-
-
 /*
  * mount options
  */
 enum {
 	Opt_wsize,
 	Opt_rsize,
-	Opt_osdtimeout,
-	Opt_osdkeepalivetimeout,
-	Opt_mount_timeout,
-	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
 	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
-	Opt_fsid,
 	Opt_snapdirname,
-	Opt_name,
-	Opt_secret,
 	Opt_last_string,
 	/* string args above */
-	Opt_ip,
-	Opt_noshare,
 	Opt_dirstat,
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
-	Opt_nocrc,
 	Opt_noasyncreaddir,
 };
 
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
-	{Opt_osdtimeout, "osdtimeout=%d"},
-	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-	{Opt_mount_timeout, "mount_timeout=%d"},
-	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,466 @@ static match_table_t arg_tokens = {
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 143 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
375 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
376 /* int args above */ 145 /* int args above */
377 {Opt_fsid, "fsid=%s"},
378 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
379 {Opt_name, "name=%s"},
380 {Opt_secret, "secret=%s"},
381 /* string args above */ 147 /* string args above */
382 {Opt_ip, "ip=%s"},
383 {Opt_noshare, "noshare"},
384 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
385 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
386 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
387 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
388 {Opt_nocrc, "nocrc"},
389 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
390 {-1, NULL} 153 {-1, NULL}
391}; 154};
392 155
393static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156static int parse_fsopt_token(char *c, void *private)
394{ 157{
395 int i = 0; 158 struct ceph_mount_options *fsopt = private;
396 char tmp[3]; 159 substring_t argstr[MAX_OPT_ARGS];
397 int err = -EINVAL; 160 int token, intval, ret;
398 int d; 161
399 162 token = match_token((char *)c, fsopt_tokens, argstr);
400 dout("parse_fsid '%s'\n", str); 163 if (token < 0)
401 tmp[2] = 0; 164 return -EINVAL;
402 while (*str && i < 16) { 165
403 if (ispunct(*str)) { 166 if (token < Opt_last_int) {
404 str++; 167 ret = match_int(&argstr[0], &intval);
405 continue; 168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
406 } 172 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1])) 173 dout("got int token %d val %d\n", token, intval);
408 break; 174 } else if (token > Opt_last_int && token < Opt_last_string) {
409 tmp[0] = str[0]; 175 dout("got string token %d val %s\n", token,
410 tmp[1] = str[1]; 176 argstr[0].from);
411 if (sscanf(tmp, "%x", &d) < 1) 177 } else {
412 break; 178 dout("got token %d\n", token);
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 } 179 }
417 180
418 if (i == 16) 181 switch (token) {
419 err = 0; 182 case Opt_snapdirname:
420 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 kfree(fsopt->snapdir_name);
421 return err; 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
185 argstr[0].to-argstr[0].from,
186 GFP_KERNEL);
187 if (!fsopt->snapdir_name)
188 return -ENOMEM;
189 break;
190
191 /* misc */
192 case Opt_wsize:
193 fsopt->wsize = intval;
194 break;
195 case Opt_rsize:
196 fsopt->rsize = intval;
197 break;
198 case Opt_caps_wanted_delay_min:
199 fsopt->caps_wanted_delay_min = intval;
200 break;
201 case Opt_caps_wanted_delay_max:
202 fsopt->caps_wanted_delay_max = intval;
203 break;
204 case Opt_readdir_max_entries:
205 fsopt->max_readdir = intval;
206 break;
207 case Opt_readdir_max_bytes:
208 fsopt->max_readdir_bytes = intval;
209 break;
210 case Opt_congestion_kb:
211 fsopt->congestion_kb = intval;
212 break;
213 case Opt_dirstat:
214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
230 }
231 return 0;
422} 232}
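To make the token classes concrete, consider a hypothetical invocation (device string and mountpoint are illustrative; option names come from fsopt_tokens above):

    mount -t ceph 192.168.0.1:6789:/ /mnt/ceph \
        -o rsize=524288,snapdirname=.snapshots,noasyncreaddir

Here rsize=524288 sorts below Opt_last_int and is parsed with match_int(); snapdirname=.snapshots falls between Opt_last_int and Opt_last_string and is kstrndup()'d; noasyncreaddir is a bare flag that just sets CEPH_MOUNT_OPT_NOASYNCREADDIR. The cluster-level options dropped from this table (fsid=, name=, secret=, ip=, osdtimeout=, ...) are now consumed by ceph_parse_options() in libceph, which passes only the tokens it does not recognize through to parse_fsopt_token().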
423 233
424static struct ceph_mount_args *parse_mount_args(int flags, char *options, 234static void destroy_mount_options(struct ceph_mount_options *args)
425 const char *dev_name,
426 const char **path)
427{ 235{
428 struct ceph_mount_args *args; 236 dout("destroy_mount_options %p\n", args);
429 const char *c; 237 kfree(args->snapdir_name);
430 int err = -ENOMEM; 238 kfree(args);
431 substring_t argstr[MAX_OPT_ARGS]; 239}
432 240
433 args = kzalloc(sizeof(*args), GFP_KERNEL); 241static int strcmp_null(const char *s1, const char *s2)
434 if (!args) 242{
435 return ERR_PTR(-ENOMEM); 243 if (!s1 && !s2)
436 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 244 return 0;
437 GFP_KERNEL); 245 if (s1 && !s2)
438 if (!args->mon_addr) 246 return -1;
439 goto out; 247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
440 251
441 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
442 253 struct ceph_options *new_opt,
443 /* start with defaults */ 254 struct ceph_fs_client *fsc)
444 args->sb_flags = flags; 255{
445 args->flags = CEPH_OPT_DEFAULT; 256 struct ceph_mount_options *fsopt1 = new_fsopt;
446 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
447 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
448 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 259 int ret;
449 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
450 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
457 args->congestion_kb = default_congestion_kb();
458
459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
460 err = -EINVAL;
461 if (!dev_name)
462 goto out;
463 *path = strstr(dev_name, ":/");
464 if (*path == NULL) {
465 pr_err("device name is missing path (no :/ in %s)\n",
466 dev_name);
467 goto out;
468 }
469 260
470 /* get mon ip(s) */ 261 ret = memcmp(fsopt1, fsopt2, ofs);
471 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 262 if (ret)
472 CEPH_MAX_MON, &args->num_mon); 263 return ret;
473 if (err < 0) 264
474 goto out; 265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
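compare_mount_options() leans on the field order of struct ceph_mount_options: everything up to snapdir_name is flat integer data that can be memcmp()'d in one shot (safe here because the struct comes from kzalloc(), so padding bytes are zeroed), while the pointer members after it need a NULL-safe strcmp. A minimal sketch of the same idiom, using a made-up struct:

    struct opts {
            int a, b, c;            /* flat values: memcmp-able prefix */
            char *label;            /* pointer: needs strcmp_null() */
    };

    static bool opts_equal(const struct opts *x, const struct opts *y)
    {
            /* compare the flat prefix in one shot, then each pointer */
            return memcmp(x, y, offsetof(struct opts, label)) == 0 &&
                   strcmp_null(x->label, y->label) == 0;
    }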
271
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
295 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
296 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
297 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
298 fsopt->congestion_kb = default_congestion_kb();
299
300 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
301 err = -EINVAL;
302 if (!dev_name)
303 goto out;
304 *path = strstr(dev_name, ":/");
305 if (*path == NULL) {
306 pr_err("device name is missing path (no :/ in %s)\n",
307 dev_name);
308 goto out;
309 }
310 dev_name_end = *path;
311 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
475 312
476 /* path on server */ 313 /* path on server */
477 *path += 2; 314 *path += 2;
478 dout("server path '%s'\n", *path); 315 dout("server path '%s'\n", *path);
479 316
480 /* parse mount options */ 317 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
481 while ((c = strsep(&options, ",")) != NULL) { 318 parse_fsopt_token, (void *)fsopt);
482 int token, intval, ret; 319 if (err)
483 if (!*c) 320 goto out;
484 continue; 321
485 err = -EINVAL; 322 /* success */
486 token = match_token((char *)c, arg_tokens, argstr); 323 *pfsopt = fsopt;
487 if (token < 0) { 324 return 0;
488 pr_err("bad mount option at '%s'\n", c);
489 goto out;
490 }
491 if (token < Opt_last_int) {
492 ret = match_int(&argstr[0], &intval);
493 if (ret < 0) {
494 pr_err("bad mount option arg (not int) "
495 "at '%s'\n", c);
496 continue;
497 }
498 dout("got int token %d val %d\n", token, intval);
499 } else if (token > Opt_last_int && token < Opt_last_string) {
500 dout("got string token %d val %s\n", token,
501 argstr[0].from);
502 } else {
503 dout("got token %d\n", token);
504 }
505 switch (token) {
506 case Opt_ip:
507 err = ceph_parse_ips(argstr[0].from,
508 argstr[0].to,
509 &args->my_addr,
510 1, NULL);
511 if (err < 0)
512 goto out;
513 args->flags |= CEPH_OPT_MYIP;
514 break;
515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
521 case Opt_snapdirname:
522 kfree(args->snapdir_name);
523 args->snapdir_name = kstrndup(argstr[0].from,
524 argstr[0].to-argstr[0].from,
525 GFP_KERNEL);
526 break;
527 case Opt_name:
528 args->name = kstrndup(argstr[0].from,
529 argstr[0].to-argstr[0].from,
530 GFP_KERNEL);
531 break;
532 case Opt_secret:
533 args->secret = kstrndup(argstr[0].from,
534 argstr[0].to-argstr[0].from,
535 GFP_KERNEL);
536 break;
537
538 /* misc */
539 case Opt_wsize:
540 args->wsize = intval;
541 break;
542 case Opt_rsize:
543 args->rsize = intval;
544 break;
545 case Opt_osdtimeout:
546 args->osd_timeout = intval;
547 break;
548 case Opt_osdkeepalivetimeout:
549 args->osd_keepalive_timeout = intval;
550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
554 case Opt_mount_timeout:
555 args->mount_timeout = intval;
556 break;
557 case Opt_caps_wanted_delay_min:
558 args->caps_wanted_delay_min = intval;
559 break;
560 case Opt_caps_wanted_delay_max:
561 args->caps_wanted_delay_max = intval;
562 break;
563 case Opt_readdir_max_entries:
564 args->max_readdir = intval;
565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
569 case Opt_congestion_kb:
570 args->congestion_kb = intval;
571 break;
572
573 case Opt_noshare:
574 args->flags |= CEPH_OPT_NOSHARE;
575 break;
576
577 case Opt_dirstat:
578 args->flags |= CEPH_OPT_DIRSTAT;
579 break;
580 case Opt_nodirstat:
581 args->flags &= ~CEPH_OPT_DIRSTAT;
582 break;
583 case Opt_rbytes:
584 args->flags |= CEPH_OPT_RBYTES;
585 break;
586 case Opt_norbytes:
587 args->flags &= ~CEPH_OPT_RBYTES;
588 break;
589 case Opt_nocrc:
590 args->flags |= CEPH_OPT_NOCRC;
591 break;
592 case Opt_noasyncreaddir:
593 args->flags |= CEPH_OPT_NOASYNCREADDIR;
594 break;
595
596 default:
597 BUG_ON(token);
598 }
599 }
600 return args;
601 325
602out: 326out:
603 kfree(args->mon_addr); 327 destroy_mount_options(fsopt);
604 kfree(args); 328 return err;
605 return ERR_PTR(err);
606} 329}
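The device-string split is easiest to see on an example (addresses made up):

    dev_name:  1.2.3.4:6789,1.2.3.5:6789:/export/dir
                                        ^-- *path = strstr(dev_name, ":/")
               \---- monitor list ----/    *path += 2  ->  "export/dir"

The monitor list up to dev_name_end is handed to ceph_parse_options() along with the remaining -o options; the fs layer keeps only the server-side path.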
607 330
608static void destroy_mount_args(struct ceph_mount_args *args) 331/**
332 * ceph_show_options - Show mount options in /proc/mounts
333 * @m: seq_file to write to
334 * @mnt: mount descriptor
335 */
336static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
609{ 337{
610 dout("destroy_mount_args %p\n", args); 338 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
611 kfree(args->snapdir_name); 339 struct ceph_mount_options *fsopt = fsc->mount_options;
612 args->snapdir_name = NULL; 340 struct ceph_options *opt = fsc->client->options;
613 kfree(args->name); 341
614 args->name = NULL; 342 if (opt->flags & CEPH_OPT_FSID)
615 kfree(args->secret); 343 seq_printf(m, ",fsid=%pU", &opt->fsid);
616 args->secret = NULL; 344 if (opt->flags & CEPH_OPT_NOSHARE)
617 kfree(args); 345 seq_puts(m, ",noshare");
346 if (opt->flags & CEPH_OPT_NOCRC)
347 seq_puts(m, ",nocrc");
348
349 if (opt->name)
350 seq_printf(m, ",name=%s", opt->name);
351 if (opt->secret)
352 seq_puts(m, ",secret=<hidden>");
353
354 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
355 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
356 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
357 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
358 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
359 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
360 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
361 seq_printf(m, ",osdkeepalivetimeout=%d",
362 opt->osd_keepalive_timeout);
363
364 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
365 seq_puts(m, ",dirstat");
366 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
367 seq_puts(m, ",norbytes");
368 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
369 seq_puts(m, ",noasyncreaddir");
370
371 if (fsopt->wsize)
372 seq_printf(m, ",wsize=%d", fsopt->wsize);
373 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
374 seq_printf(m, ",rsize=%d", fsopt->rsize);
375 if (fsopt->congestion_kb != default_congestion_kb())
376 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
377 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
378 seq_printf(m, ",caps_wanted_delay_min=%d",
379 fsopt->caps_wanted_delay_min);
380 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
381 seq_printf(m, ",caps_wanted_delay_max=%d",
382 fsopt->caps_wanted_delay_max);
383 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
384 seq_printf(m, ",cap_release_safety=%d",
385 fsopt->cap_release_safety);
386 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
387 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
388 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
389 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
390 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
391 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
392 return 0;
618} 393}
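Since only non-default values are emitted, a mount with, say, name=admin and rsize=1048576 would show up in /proc/mounts along these lines (illustrative):

    192.168.0.1:6789:/ /mnt/ceph ceph rw,name=admin,rsize=1048576 0 0

Note that secret= is deliberately masked as <hidden> so credentials never leak through /proc.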
619 394
620/* 395/*
621 * create a fresh client instance 396 * handle any mon messages the standard library doesn't understand.
397 * return an error if we don't understand it either.
622 */ 398 */
623static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 399static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
624{ 400{
625 struct ceph_client *client; 401 struct ceph_fs_client *fsc = client->private;
402 int type = le16_to_cpu(msg->hdr.type);
403
404 switch (type) {
405 case CEPH_MSG_MDS_MAP:
406 ceph_mdsc_handle_map(fsc->mdsc, msg);
407 return 0;
408
409 default:
410 return -1;
411 }
412}
413
414/*
415 * create a new fs client
416 */
417struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
418 struct ceph_options *opt)
419{
420 struct ceph_fs_client *fsc;
626 int err = -ENOMEM; 421 int err = -ENOMEM;
627 422
628 client = kzalloc(sizeof(*client), GFP_KERNEL); 423 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
629 if (client == NULL) 424 if (!fsc)
630 return ERR_PTR(-ENOMEM); 425 return ERR_PTR(-ENOMEM);
631 426
632 mutex_init(&client->mount_mutex); 427 fsc->client = ceph_create_client(opt, fsc);
633 428 if (IS_ERR(fsc->client)) {
634 init_waitqueue_head(&client->auth_wq); 429 err = PTR_ERR(fsc->client);
430 goto fail;
431 }
432 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
433 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
434 CEPH_FEATURE_DIRLAYOUTHASH;
435 fsc->client->monc.want_mdsmap = 1;
635 436
636 client->sb = NULL; 437 fsc->mount_options = fsopt;
637 client->mount_state = CEPH_MOUNT_MOUNTING;
638 client->mount_args = args;
639 438
640 client->msgr = NULL; 439 fsc->sb = NULL;
440 fsc->mount_state = CEPH_MOUNT_MOUNTING;
641 441
642 client->auth_err = 0; 442 atomic_long_set(&fsc->writeback_count, 0);
643 atomic_long_set(&client->writeback_count, 0);
644 443
645 err = bdi_init(&client->backing_dev_info); 444 err = bdi_init(&fsc->backing_dev_info);
646 if (err < 0) 445 if (err < 0)
647 goto fail; 446 goto fail_client;
648 447
649 err = -ENOMEM; 448 err = -ENOMEM;
650 client->wb_wq = create_workqueue("ceph-writeback"); 449 /*
651 if (client->wb_wq == NULL) 450 * The number of concurrent work items can be high but they don't need
 451 * to be processed in parallel; limit concurrency.
452 */
453 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
454 if (fsc->wb_wq == NULL)
652 goto fail_bdi; 455 goto fail_bdi;
653 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 456 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
654 if (client->pg_inv_wq == NULL) 457 if (fsc->pg_inv_wq == NULL)
655 goto fail_wb_wq; 458 goto fail_wb_wq;
656 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 459 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
657 if (client->trunc_wq == NULL) 460 if (fsc->trunc_wq == NULL)
658 goto fail_pg_inv_wq; 461 goto fail_pg_inv_wq;
659 462
660 /* set up mempools */ 463 /* set up mempools */
661 err = -ENOMEM; 464 err = -ENOMEM;
662 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 465 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
663 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 466 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
664 if (!client->wb_pagevec_pool) 467 if (!fsc->wb_pagevec_pool)
665 goto fail_trunc_wq; 468 goto fail_trunc_wq;
666 469
667 /* caps */ 470 /* caps */
668 client->min_caps = args->max_readdir; 471 fsc->min_caps = fsopt->max_readdir;
472
473 return fsc;
669 474
670 /* subsystems */
671 err = ceph_monc_init(&client->monc, client);
672 if (err < 0)
673 goto fail_mempool;
674 err = ceph_osdc_init(&client->osdc, client);
675 if (err < 0)
676 goto fail_monc;
677 err = ceph_mdsc_init(&client->mdsc, client);
678 if (err < 0)
679 goto fail_osdc;
680 return client;
681
682fail_osdc:
683 ceph_osdc_stop(&client->osdc);
684fail_monc:
685 ceph_monc_stop(&client->monc);
686fail_mempool:
687 mempool_destroy(client->wb_pagevec_pool);
688fail_trunc_wq: 475fail_trunc_wq:
689 destroy_workqueue(client->trunc_wq); 476 destroy_workqueue(fsc->trunc_wq);
690fail_pg_inv_wq: 477fail_pg_inv_wq:
691 destroy_workqueue(client->pg_inv_wq); 478 destroy_workqueue(fsc->pg_inv_wq);
692fail_wb_wq: 479fail_wb_wq:
693 destroy_workqueue(client->wb_wq); 480 destroy_workqueue(fsc->wb_wq);
694fail_bdi: 481fail_bdi:
695 bdi_destroy(&client->backing_dev_info); 482 bdi_destroy(&fsc->backing_dev_info);
483fail_client:
484 ceph_destroy_client(fsc->client);
696fail: 485fail:
697 kfree(client); 486 kfree(fsc);
698 return ERR_PTR(err); 487 return ERR_PTR(err);
699} 488}
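create_fs_client() uses the standard kernel unwind ladder: each fail_* label releases exactly what was set up before the failing step, in reverse order, so every exit path frees each resource precisely once. The shape of the pattern, reduced to two hypothetical resources:

    a = alloc_a();
    if (!a)
            goto fail;
    b = alloc_b();
    if (!b)
            goto fail_a;
    return 0;               /* success: caller now owns a and b */

    fail_a:
            free_a(a);      /* undo step 1 only */
    fail:
            return -ENOMEM;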
700 489
701static void ceph_destroy_client(struct ceph_client *client) 490void destroy_fs_client(struct ceph_fs_client *fsc)
702{ 491{
703 dout("destroy_client %p\n", client); 492 dout("destroy_fs_client %p\n", fsc);
704 493
705 /* unmount */ 494 destroy_workqueue(fsc->wb_wq);
706 ceph_mdsc_stop(&client->mdsc); 495 destroy_workqueue(fsc->pg_inv_wq);
707 ceph_osdc_stop(&client->osdc); 496 destroy_workqueue(fsc->trunc_wq);
708 497
709 /* 498 bdi_destroy(&fsc->backing_dev_info);
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
717 499
718 ceph_debugfs_client_cleanup(client); 500 mempool_destroy(fsc->wb_pagevec_pool);
719 destroy_workqueue(client->wb_wq);
720 destroy_workqueue(client->pg_inv_wq);
721 destroy_workqueue(client->trunc_wq);
722 501
723 bdi_destroy(&client->backing_dev_info); 502 destroy_mount_options(fsc->mount_options);
724 503
725 if (client->msgr) 504 ceph_fs_debugfs_cleanup(fsc);
726 ceph_messenger_destroy(client->msgr);
727 mempool_destroy(client->wb_pagevec_pool);
728 505
729 destroy_mount_args(client->mount_args); 506 ceph_destroy_client(fsc->client);
730 507
731 kfree(client); 508 kfree(fsc);
732 dout("destroy_client %p done\n", client); 509 dout("destroy_fs_client %p done\n", fsc);
733} 510}
734 511
735/* 512/*
736 * Initially learn our fsid, or verify an fsid matches. 513 * caches
737 */ 514 */
738int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 515struct kmem_cache *ceph_inode_cachep;
516struct kmem_cache *ceph_cap_cachep;
517struct kmem_cache *ceph_dentry_cachep;
518struct kmem_cache *ceph_file_cachep;
519
520static void ceph_inode_init_once(void *foo)
739{ 521{
740 if (client->have_fsid) { 522 struct ceph_inode_info *ci = foo;
741 if (ceph_fsid_compare(&client->fsid, fsid)) { 523 inode_init_once(&ci->vfs_inode);
742 pr_err("bad fsid, had %pU got %pU", 524}
743 &client->fsid, fsid); 525
744 return -1; 526static int __init init_caches(void)
745 } 527{
746 } else { 528 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, 529 sizeof(struct ceph_inode_info),
748 fsid); 530 __alignof__(struct ceph_inode_info),
749 memcpy(&client->fsid, fsid, sizeof(*fsid)); 531 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
750 ceph_debugfs_client_init(client); 532 ceph_inode_init_once);
751 client->have_fsid = true; 533 if (ceph_inode_cachep == NULL)
752 } 534 return -ENOMEM;
535
536 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
537 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
538 if (ceph_cap_cachep == NULL)
539 goto bad_cap;
540
541 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
542 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
543 if (ceph_dentry_cachep == NULL)
544 goto bad_dentry;
545
546 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
547 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
548 if (ceph_file_cachep == NULL)
549 goto bad_file;
550
753 return 0; 551 return 0;
552
553bad_file:
554 kmem_cache_destroy(ceph_dentry_cachep);
555bad_dentry:
556 kmem_cache_destroy(ceph_cap_cachep);
557bad_cap:
558 kmem_cache_destroy(ceph_inode_cachep);
559 return -ENOMEM;
754} 560}
755 561
562static void destroy_caches(void)
563{
564 kmem_cache_destroy(ceph_inode_cachep);
565 kmem_cache_destroy(ceph_cap_cachep);
566 kmem_cache_destroy(ceph_dentry_cachep);
567 kmem_cache_destroy(ceph_file_cachep);
568}
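The KMEM_CACHE() helper used in init_caches() is just shorthand: KMEM_CACHE(ceph_cap, flags) expands to roughly

    kmem_cache_create("ceph_cap", sizeof(struct ceph_cap),
                      __alignof__(struct ceph_cap), flags, NULL);

which is why only ceph_inode_cachep, with its inode_init_once() constructor, needs the long-hand kmem_cache_create() call.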
569
570
756/* 571/*
757 * true if we have the mon map (and have thus joined the cluster) 572 * ceph_umount_begin - initiate forced umount. Tear down the
573 * mount, skipping steps that may hang while waiting for server(s).
758 */ 574 */
759static int have_mon_and_osd_map(struct ceph_client *client) 575static void ceph_umount_begin(struct super_block *sb)
760{ 576{
761 return client->monc.monmap && client->monc.monmap->epoch && 577 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
762 client->osdc.osdmap && client->osdc.osdmap->epoch; 578
579 dout("ceph_umount_begin - starting forced umount\n");
580 if (!fsc)
581 return;
582 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
583 return;
763} 584}
764 585
586static const struct super_operations ceph_super_ops = {
587 .alloc_inode = ceph_alloc_inode,
588 .destroy_inode = ceph_destroy_inode,
589 .write_inode = ceph_write_inode,
590 .sync_fs = ceph_sync_fs,
591 .put_super = ceph_put_super,
592 .show_options = ceph_show_options,
593 .statfs = ceph_statfs,
594 .umount_begin = ceph_umount_begin,
595};
596
765/* 597/*
766 * Bootstrap mount by opening the root directory. Note the mount 598 * Bootstrap mount by opening the root directory. Note the mount
767 * @started time from caller, and time out if this takes too long. 599 * @started time from caller, and time out if this takes too long.
768 */ 600 */
769static struct dentry *open_root_dentry(struct ceph_client *client, 601static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
770 const char *path, 602 const char *path,
771 unsigned long started) 603 unsigned long started)
772{ 604{
773 struct ceph_mds_client *mdsc = &client->mdsc; 605 struct ceph_mds_client *mdsc = fsc->mdsc;
774 struct ceph_mds_request *req = NULL; 606 struct ceph_mds_request *req = NULL;
775 int err; 607 int err;
776 struct dentry *root; 608 struct dentry *root;
@@ -784,14 +616,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
784 req->r_ino1.ino = CEPH_INO_ROOT; 616 req->r_ino1.ino = CEPH_INO_ROOT;
785 req->r_ino1.snap = CEPH_NOSNAP; 617 req->r_ino1.snap = CEPH_NOSNAP;
786 req->r_started = started; 618 req->r_started = started;
787 req->r_timeout = client->mount_args->mount_timeout * HZ; 619 req->r_timeout = fsc->client->options->mount_timeout * HZ;
788 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 620 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
789 req->r_num_caps = 2; 621 req->r_num_caps = 2;
790 err = ceph_mdsc_do_request(mdsc, NULL, req); 622 err = ceph_mdsc_do_request(mdsc, NULL, req);
791 if (err == 0) { 623 if (err == 0) {
792 dout("open_root_inode success\n"); 624 dout("open_root_inode success\n");
793 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 625 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
794 client->sb->s_root == NULL) 626 fsc->sb->s_root == NULL)
795 root = d_alloc_root(req->r_target_inode); 627 root = d_alloc_root(req->r_target_inode);
796 else 628 else
797 root = d_obtain_alias(req->r_target_inode); 629 root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +636,84 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
804 return root; 636 return root;
805} 637}
806 638
639
640
641
807/* 642/*
808 * mount: join the ceph cluster, and open root directory. 643 * mount: join the ceph cluster, and open root directory.
809 */ 644 */
810static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 645static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
811 const char *path) 646 const char *path)
812{ 647{
813 struct ceph_entity_addr *myaddr = NULL;
814 int err; 648 int err;
815 unsigned long timeout = client->mount_args->mount_timeout * HZ;
816 unsigned long started = jiffies; /* note the start time */ 649 unsigned long started = jiffies; /* note the start time */
817 struct dentry *root; 650 struct dentry *root;
651 int first = 0; /* first vfsmount for this super_block */
818 652
819 dout("mount start\n"); 653 dout("mount start\n");
820 mutex_lock(&client->mount_mutex); 654 mutex_lock(&fsc->client->mount_mutex);
821
822 /* initialize the messenger */
823 if (client->msgr == NULL) {
824 if (ceph_test_opt(client, MYIP))
825 myaddr = &client->mount_args->my_addr;
826 client->msgr = ceph_messenger_create(myaddr);
827 if (IS_ERR(client->msgr)) {
828 err = PTR_ERR(client->msgr);
829 client->msgr = NULL;
830 goto out;
831 }
832 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
833 }
834 655
835 /* open session, and wait for mon, mds, and osd maps */ 656 err = __ceph_open_session(fsc->client, started);
836 err = ceph_monc_open_session(&client->monc);
837 if (err < 0) 657 if (err < 0)
838 goto out; 658 goto out;
839 659
840 while (!have_mon_and_osd_map(client)) {
841 err = -EIO;
842 if (timeout && time_after_eq(jiffies, started + timeout))
843 goto out;
844
845 /* wait */
846 dout("mount waiting for mon_map\n");
847 err = wait_event_interruptible_timeout(client->auth_wq,
848 have_mon_and_osd_map(client) || (client->auth_err < 0),
849 timeout);
850 if (err == -EINTR || err == -ERESTARTSYS)
851 goto out;
852 if (client->auth_err < 0) {
853 err = client->auth_err;
854 goto out;
855 }
856 }
857
858 dout("mount opening root\n"); 660 dout("mount opening root\n");
859 root = open_root_dentry(client, "", started); 661 root = open_root_dentry(fsc, "", started);
860 if (IS_ERR(root)) { 662 if (IS_ERR(root)) {
861 err = PTR_ERR(root); 663 err = PTR_ERR(root);
862 goto out; 664 goto out;
863 } 665 }
864 if (client->sb->s_root) 666 if (fsc->sb->s_root) {
865 dput(root); 667 dput(root);
866 else 668 } else {
867 client->sb->s_root = root; 669 fsc->sb->s_root = root;
670 first = 1;
671
672 err = ceph_fs_debugfs_init(fsc);
673 if (err < 0)
674 goto fail;
675 }
868 676
869 if (path[0] == 0) { 677 if (path[0] == 0) {
870 dget(root); 678 dget(root);
871 } else { 679 } else {
872 dout("mount opening base mountpoint\n"); 680 dout("mount opening base mountpoint\n");
873 root = open_root_dentry(client, path, started); 681 root = open_root_dentry(fsc, path, started);
874 if (IS_ERR(root)) { 682 if (IS_ERR(root)) {
875 err = PTR_ERR(root); 683 err = PTR_ERR(root);
876 dput(client->sb->s_root); 684 goto fail;
877 client->sb->s_root = NULL;
878 goto out;
879 } 685 }
880 } 686 }
881 687
882 mnt->mnt_root = root; 688 fsc->mount_state = CEPH_MOUNT_MOUNTED;
883 mnt->mnt_sb = client->sb;
884
885 client->mount_state = CEPH_MOUNT_MOUNTED;
886 dout("mount success\n"); 689 dout("mount success\n");
887 err = 0; 690 mutex_unlock(&fsc->client->mount_mutex);
691 return root;
888 692
889out: 693out:
890 mutex_unlock(&client->mount_mutex); 694 mutex_unlock(&fsc->client->mount_mutex);
891 return err; 695 return ERR_PTR(err);
696
697fail:
698 if (first) {
699 dput(fsc->sb->s_root);
700 fsc->sb->s_root = NULL;
701 }
702 goto out;
892} 703}
893 704
894static int ceph_set_super(struct super_block *s, void *data) 705static int ceph_set_super(struct super_block *s, void *data)
895{ 706{
896 struct ceph_client *client = data; 707 struct ceph_fs_client *fsc = data;
897 int ret; 708 int ret;
898 709
899 dout("set_super %p data %p\n", s, data); 710 dout("set_super %p data %p\n", s, data);
900 711
901 s->s_flags = client->mount_args->sb_flags; 712 s->s_flags = fsc->mount_options->sb_flags;
902 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 713 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
903 714
904 s->s_fs_info = client; 715 s->s_fs_info = fsc;
905 client->sb = s; 716 fsc->sb = s;
906 717
907 s->s_op = &ceph_super_ops; 718 s->s_op = &ceph_super_ops;
908 s->s_export_op = &ceph_export_ops; 719 s->s_export_op = &ceph_export_ops;
@@ -917,7 +728,7 @@ static int ceph_set_super(struct super_block *s, void *data)
917 728
918fail: 729fail:
919 s->s_fs_info = NULL; 730 s->s_fs_info = NULL;
920 client->sb = NULL; 731 fsc->sb = NULL;
921 return ret; 732 return ret;
922} 733}
923 734
@@ -926,30 +737,23 @@ fail:
926 */ 737 */
927static int ceph_compare_super(struct super_block *sb, void *data) 738static int ceph_compare_super(struct super_block *sb, void *data)
928{ 739{
929 struct ceph_client *new = data; 740 struct ceph_fs_client *new = data;
930 struct ceph_mount_args *args = new->mount_args; 741 struct ceph_mount_options *fsopt = new->mount_options;
931 struct ceph_client *other = ceph_sb_to_client(sb); 742 struct ceph_options *opt = new->client->options;
932 int i; 743 struct ceph_fs_client *other = ceph_sb_to_client(sb);
933 744
934 dout("ceph_compare_super %p\n", sb); 745 dout("ceph_compare_super %p\n", sb);
935 if (args->flags & CEPH_OPT_FSID) { 746
936 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 747 if (compare_mount_options(fsopt, opt, other)) {
937 dout("fsid doesn't match\n"); 748 dout("monitor(s)/mount options don't match\n");
938 return 0; 749 return 0;
939 } 750 }
940 } else { 751 if ((opt->flags & CEPH_OPT_FSID) &&
941 /* do we share (a) monitor? */ 752 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
942 for (i = 0; i < new->monc.monmap->num_mon; i++) 753 dout("fsid doesn't match\n");
943 if (ceph_monmap_contains(other->monc.monmap, 754 return 0;
944 &new->monc.monmap->mon_inst[i].addr))
945 break;
946 if (i == new->monc.monmap->num_mon) {
947 dout("mon ip not part of monmap\n");
948 return 0;
949 }
950 dout("mon ip matches existing sb %p\n", sb);
951 } 755 }
952 if (args->sb_flags != other->mount_args->sb_flags) { 756 if (fsopt->sb_flags != other->mount_options->sb_flags) {
953 dout("flags differ\n"); 757 dout("flags differ\n");
954 return 0; 758 return 0;
955 } 759 }
@@ -961,98 +765,113 @@ static int ceph_compare_super(struct super_block *sb, void *data)
961 */ 765 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 766static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963 767
964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 768static int ceph_register_bdi(struct super_block *sb,
769 struct ceph_fs_client *fsc)
965{ 770{
966 int err; 771 int err;
967 772
968 /* set ra_pages based on rsize mount option? */ 773 /* set ra_pages based on rsize mount option? */
969 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 774 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
970 client->backing_dev_info.ra_pages = 775 fsc->backing_dev_info.ra_pages =
971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 776 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
972 >> PAGE_SHIFT; 777 >> PAGE_SHIFT;
973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 778 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq)); 779 atomic_long_inc_return(&bdi_seq));
975 if (!err) 780 if (!err)
976 sb->s_bdi = &client->backing_dev_info; 781 sb->s_bdi = &fsc->backing_dev_info;
977 return err; 782 return err;
978} 783}
979 784
980static int ceph_get_sb(struct file_system_type *fs_type, 785static struct dentry *ceph_mount(struct file_system_type *fs_type,
981 int flags, const char *dev_name, void *data, 786 int flags, const char *dev_name, void *data)
982 struct vfsmount *mnt)
983{ 787{
984 struct super_block *sb; 788 struct super_block *sb;
985 struct ceph_client *client; 789 struct ceph_fs_client *fsc;
790 struct dentry *res;
986 int err; 791 int err;
987 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 792 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
988 const char *path = NULL; 793 const char *path = NULL;
989 struct ceph_mount_args *args; 794 struct ceph_mount_options *fsopt = NULL;
795 struct ceph_options *opt = NULL;
990 796
991 dout("ceph_get_sb\n"); 797 dout("ceph_mount\n");
992 args = parse_mount_args(flags, data, dev_name, &path); 798 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
993 if (IS_ERR(args)) { 799 if (err < 0) {
994 err = PTR_ERR(args); 800 res = ERR_PTR(err);
995 goto out_final; 801 goto out_final;
996 } 802 }
997 803
998 /* create client (which we may/may not use) */ 804 /* create client (which we may/may not use) */
999 client = ceph_create_client(args); 805 fsc = create_fs_client(fsopt, opt);
1000 if (IS_ERR(client)) { 806 if (IS_ERR(fsc)) {
1001 err = PTR_ERR(client); 807 res = ERR_CAST(fsc);
808 kfree(fsopt);
809 kfree(opt);
1002 goto out_final; 810 goto out_final;
1003 } 811 }
1004 812
1005 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 813 err = ceph_mdsc_init(fsc);
814 if (err < 0) {
815 res = ERR_PTR(err);
816 goto out;
817 }
818
819 if (ceph_test_opt(fsc->client, NOSHARE))
1006 compare_super = NULL; 820 compare_super = NULL;
1007 sb = sget(fs_type, compare_super, ceph_set_super, client); 821 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
1008 if (IS_ERR(sb)) { 822 if (IS_ERR(sb)) {
1009 err = PTR_ERR(sb); 823 res = ERR_CAST(sb);
1010 goto out; 824 goto out;
1011 } 825 }
1012 826
1013 if (ceph_sb_to_client(sb) != client) { 827 if (ceph_sb_to_client(sb) != fsc) {
1014 ceph_destroy_client(client); 828 ceph_mdsc_destroy(fsc);
1015 client = ceph_sb_to_client(sb); 829 destroy_fs_client(fsc);
1016 dout("get_sb got existing client %p\n", client); 830 fsc = ceph_sb_to_client(sb);
831 dout("get_sb got existing client %p\n", fsc);
1017 } else { 832 } else {
1018 dout("get_sb using new client %p\n", client); 833 dout("get_sb using new client %p\n", fsc);
1019 err = ceph_register_bdi(sb, client); 834 err = ceph_register_bdi(sb, fsc);
1020 if (err < 0) 835 if (err < 0) {
836 res = ERR_PTR(err);
1021 goto out_splat; 837 goto out_splat;
838 }
1022 } 839 }
1023 840
1024 err = ceph_mount(client, mnt, path); 841 res = ceph_real_mount(fsc, path);
1025 if (err < 0) 842 if (IS_ERR(res))
1026 goto out_splat; 843 goto out_splat;
1027 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 844 dout("root %p inode %p ino %llx.%llx\n", res,
1028 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); 845 res->d_inode, ceph_vinop(res->d_inode));
1029 return 0; 846 return res;
1030 847
1031out_splat: 848out_splat:
1032 ceph_mdsc_close_sessions(&client->mdsc); 849 ceph_mdsc_close_sessions(fsc->mdsc);
1033 deactivate_locked_super(sb); 850 deactivate_locked_super(sb);
1034 goto out_final; 851 goto out_final;
1035 852
1036out: 853out:
1037 ceph_destroy_client(client); 854 ceph_mdsc_destroy(fsc);
855 destroy_fs_client(fsc);
1038out_final: 856out_final:
1039 dout("ceph_get_sb fail %d\n", err); 857 dout("ceph_mount fail %ld\n", PTR_ERR(res));
1040 return err; 858 return res;
1041} 859}
1042 860
1043static void ceph_kill_sb(struct super_block *s) 861static void ceph_kill_sb(struct super_block *s)
1044{ 862{
1045 struct ceph_client *client = ceph_sb_to_client(s); 863 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1046 dout("kill_sb %p\n", s); 864 dout("kill_sb %p\n", s);
1047 ceph_mdsc_pre_umount(&client->mdsc); 865 ceph_mdsc_pre_umount(fsc->mdsc);
1048 kill_anon_super(s); /* will call put_super after sb is r/o */ 866 kill_anon_super(s); /* will call put_super after sb is r/o */
1049 ceph_destroy_client(client); 867 ceph_mdsc_destroy(fsc);
868 destroy_fs_client(fsc);
1050} 869}
1051 870
1052static struct file_system_type ceph_fs_type = { 871static struct file_system_type ceph_fs_type = {
1053 .owner = THIS_MODULE, 872 .owner = THIS_MODULE,
1054 .name = "ceph", 873 .name = "ceph",
1055 .get_sb = ceph_get_sb, 874 .mount = ceph_mount,
1056 .kill_sb = ceph_kill_sb, 875 .kill_sb = ceph_kill_sb,
1057 .fs_flags = FS_RENAME_DOES_D_MOVE, 876 .fs_flags = FS_RENAME_DOES_D_MOVE,
1058}; 877};
@@ -1062,36 +881,20 @@ static struct file_system_type ceph_fs_type = {
1062 881
1063static int __init init_ceph(void) 882static int __init init_ceph(void)
1064{ 883{
1065 int ret = 0; 884 int ret = init_caches();
1066
1067 ret = ceph_debugfs_init();
1068 if (ret < 0)
1069 goto out;
1070
1071 ret = ceph_msgr_init();
1072 if (ret < 0)
1073 goto out_debugfs;
1074
1075 ret = init_caches();
1076 if (ret) 885 if (ret)
1077 goto out_msgr; 886 goto out;
1078 887
1079 ret = register_filesystem(&ceph_fs_type); 888 ret = register_filesystem(&ceph_fs_type);
1080 if (ret) 889 if (ret)
1081 goto out_icache; 890 goto out_icache;
1082 891
1083 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 892 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1084 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 893
1085 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1086 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1087 return 0; 894 return 0;
1088 895
1089out_icache: 896out_icache:
1090 destroy_caches(); 897 destroy_caches();
1091out_msgr:
1092 ceph_msgr_exit();
1093out_debugfs:
1094 ceph_debugfs_cleanup();
1095out: 898out:
1096 return ret; 899 return ret;
1097} 900}
@@ -1101,8 +904,6 @@ static void __exit exit_ceph(void)
1101 dout("exit_ceph\n"); 904 dout("exit_ceph\n");
1102 unregister_filesystem(&ceph_fs_type); 905 unregister_filesystem(&ceph_fs_type);
1103 destroy_caches(); 906 destroy_caches();
1104 ceph_msgr_exit();
1105 ceph_debugfs_cleanup();
1106} 907}
1107 908
1108module_init(init_ceph); 909module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * Supported features 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38 30
39/* 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
40 * mount options
41 */
42#define CEPH_OPT_FSID (1<<0)
43#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
44#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
45#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
46#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
47#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
48#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
49 32
50#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 33#define ceph_set_mount_opt(fsc, opt) \
34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
35#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
51 37
52#define ceph_set_opt(client, opt) \ 38#define CEPH_MAX_READDIR_DEFAULT 1024
53 (client)->mount_args->flags |= CEPH_OPT_##opt; 39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
54#define ceph_test_opt(client, opt) \ 40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
55 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
56 41
57 42struct ceph_mount_options {
58struct ceph_mount_args {
59 int sb_flags;
60 int flags; 43 int flags;
61 struct ceph_fsid fsid; 44 int sb_flags;
62 struct ceph_entity_addr my_addr; 45
63 int num_mon;
64 struct ceph_entity_addr *mon_addr;
65 int mount_timeout;
66 int osd_idle_ttl;
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 int wsize; 46 int wsize;
70 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
73 int cap_release_safety; 50 int cap_release_safety;
74 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
75 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
76 char *snapdir_name; /* default ".snap" */
77 char *name;
78 char *secret;
79};
80 53
81/* 54 /*
82 * defaults 55 * everything above this point can be memcmp'd; everything below
83 */ 56 * is handled in compare_mount_options()
84#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 57 */
85#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
91
92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
94
95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
96#define CEPH_AUTH_NAME_DEFAULT "guest"
97/*
98 * Delay telling the MDS we no longer want caps, in case we reopen
99 * the file. Delay a minimum amount of time, even if we send a cap
100 * message for some other reason. Otherwise, take the opportunity to
101 * update the mds to avoid sending another message later.
102 */
103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
107
108/* mount state */
109enum {
110 CEPH_MOUNT_MOUNTING,
111 CEPH_MOUNT_MOUNTED,
112 CEPH_MOUNT_UNMOUNTING,
113 CEPH_MOUNT_UNMOUNTED,
114 CEPH_MOUNT_SHUTDOWN,
115};
116
117/*
118 * subtract jiffies
119 */
120static inline unsigned long time_sub(unsigned long a, unsigned long b)
121{
122 BUG_ON(time_after(b, a));
123 return (long)a - (long)b;
124}
125
126/*
127 * per-filesystem client state
128 *
129 * possibly shared by multiple mount points, if they are
130 * mounting the same ceph filesystem/cluster.
131 */
132struct ceph_client {
133 struct ceph_fsid fsid;
134 bool have_fsid;
135 58
136 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
137 struct ceph_mount_args *mount_args; 60};
138 61
62struct ceph_fs_client {
139 struct super_block *sb; 63 struct super_block *sb;
140 64
141 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
142 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
143
144 int auth_err;
145 67
68 unsigned long mount_state;
146 int min_caps; /* min caps i added */ 69 int min_caps; /* min caps i added */
147 70
148 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
149 struct ceph_mon_client monc;
150 struct ceph_mds_client mdsc;
151 struct ceph_osd_client osdc;
152 72
153 /* writeback */ 73 /* writeback */
154 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
160 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
161 81
162#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
163 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
164 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
165 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
166 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
167 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
168#endif 87#endif
169}; 88};
170 89
90
171/* 91/*
172 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
173 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
275 int should_free_val; 195 int should_free_val;
276}; 196};
277 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
278struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
279 /* 213 /*
280 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
296/* 230/*
297 * Ceph inode. 231 * Ceph inode.
298 */ 232 */
299#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
300#define CEPH_I_NODELAY 4 /* do not delay cap release */
301#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
302#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
303
304struct ceph_inode_info { 233struct ceph_inode_info {
305 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
306 235
@@ -310,6 +239,7 @@ struct ceph_inode_info {
310 unsigned i_ceph_flags; 239 unsigned i_ceph_flags;
311 unsigned long i_release_count; 240 unsigned long i_release_count;
312 241
242 struct ceph_dir_layout i_dir_layout;
313 struct ceph_file_layout i_layout; 243 struct ceph_file_layout i_layout;
314 char *i_symlink; 244 char *i_symlink;
315 245
@@ -364,9 +294,7 @@ struct ceph_inode_info {
364 int i_rd_ref, i_rdcache_ref, i_wr_ref; 294 int i_rd_ref, i_rdcache_ref, i_wr_ref;
365 int i_wrbuffer_ref, i_wrbuffer_ref_head; 295 int i_wrbuffer_ref, i_wrbuffer_ref_head;
366 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 296 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
367 u32 i_rdcache_gen; /* we increment this each time we get 297 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
368 FILE_CACHE. If it's non-zero, we
369 _may_ have cached pages. */
370 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ 298 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
371 299
372 struct list_head i_unsafe_writes; /* uncommitted sync writes */ 300 struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -391,6 +319,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
391 return container_of(inode, struct ceph_inode_info, vfs_inode); 319 return container_of(inode, struct ceph_inode_info, vfs_inode);
392} 320}
393 321
322static inline struct ceph_vino ceph_vino(struct inode *inode)
323{
324 return ceph_inode(inode)->i_vino;
325}
326
327/*
328 * ino_t is <64 bits on many architectures, blech.
329 *
330 * don't include snap in ino hash, at least for now.
331 */
332static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333{
334 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335#if BITS_PER_LONG == 32
336 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
337 if (!ino)
338 ino = 1;
339#endif
340 return ino;
341}
342
343/* for printf-style formatting */
344#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345
346static inline u64 ceph_ino(struct inode *inode)
347{
348 return ceph_inode(inode)->i_vino.ino;
349}
350static inline u64 ceph_snap(struct inode *inode)
351{
352 return ceph_inode(inode)->i_vino.snap;
353}
354
355static inline int ceph_ino_compare(struct inode *inode, void *data)
356{
357 struct ceph_vino *pvino = (struct ceph_vino *)data;
358 struct ceph_inode_info *ci = ceph_inode(inode);
359 return ci->i_vino.ino == pvino->ino &&
360 ci->i_vino.snap == pvino->snap;
361}
362
363static inline struct inode *ceph_find_inode(struct super_block *sb,
364 struct ceph_vino vino)
365{
366 ino_t t = ceph_vino_to_ino(vino);
367 return ilookup5(sb, t, ceph_ino_compare, &vino);
368}
369
370
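On 32-bit builds ceph_vino_to_ino() folds the top half of the 64-bit ceph ino into the 32-bit ino_t. With a made-up inode number:

    vino.ino = 0x0000001234567890
    ino      = 0x34567890 ^ 0x00000012 = 0x34567882

and a result of 0 is bumped to 1, since a zero ino would be ambiguous in the icache lookup done by ceph_find_inode().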
371/*
372 * Ceph inode.
373 */
374#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
375#define CEPH_I_NODELAY 4 /* do not delay cap release */
376#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
377#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
378
394static inline void ceph_i_clear(struct inode *inode, unsigned mask) 379static inline void ceph_i_clear(struct inode *inode, unsigned mask)
395{ 380{
396 struct ceph_inode_info *ci = ceph_inode(inode); 381 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +399,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
414 struct ceph_inode_info *ci = ceph_inode(inode); 399 struct ceph_inode_info *ci = ceph_inode(inode);
415 bool r; 400 bool r;
416 401
417 smp_mb(); 402 spin_lock(&inode->i_lock);
418 r = (ci->i_ceph_flags & mask) == mask; 403 r = (ci->i_ceph_flags & mask) == mask;
404 spin_unlock(&inode->i_lock);
419 return r; 405 return r;
420} 406}
421 407
@@ -432,20 +418,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
432 struct ceph_inode_frag *pfrag, 418 struct ceph_inode_frag *pfrag,
433 int *found); 419 int *found);
434 420
435/*
436 * Ceph dentry state
437 */
438struct ceph_dentry_info {
439 struct ceph_mds_session *lease_session;
440 u32 lease_gen, lease_shared_gen;
441 u32 lease_seq;
442 unsigned long lease_renew_after, lease_renew_from;
443 struct list_head lru;
444 struct dentry *dentry;
445 u64 time;
446 u64 offset;
447};
448
449static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 421static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
450{ 422{
451 return (struct ceph_dentry_info *)dentry->d_fsdata; 423 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +428,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
456 return ((loff_t)frag << 32) | (loff_t)off; 428 return ((loff_t)frag << 32) | (loff_t)off;
457} 429}
458 430
459/*
460 * ino_t is <64 bits on many architectures, blech.
461 *
462 * don't include snap in ino hash, at least for now.
463 */
464static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
465{
466 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
467#if BITS_PER_LONG == 32
468 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
469 if (!ino)
470 ino = 1;
471#endif
472 return ino;
473}
474
475static inline int ceph_set_ino_cb(struct inode *inode, void *data) 431static inline int ceph_set_ino_cb(struct inode *inode, void *data)
476{ 432{
477 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 433 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +435,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
479 return 0; 435 return 0;
480} 436}
481 437
482static inline struct ceph_vino ceph_vino(struct inode *inode)
483{
484 return ceph_inode(inode)->i_vino;
485}
486
487/* for printf-style formatting */
488#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
489
490static inline u64 ceph_ino(struct inode *inode)
491{
492 return ceph_inode(inode)->i_vino.ino;
493}
494static inline u64 ceph_snap(struct inode *inode)
495{
496 return ceph_inode(inode)->i_vino.snap;
497}
498
499static inline int ceph_ino_compare(struct inode *inode, void *data)
500{
501 struct ceph_vino *pvino = (struct ceph_vino *)data;
502 struct ceph_inode_info *ci = ceph_inode(inode);
503 return ci->i_vino.ino == pvino->ino &&
504 ci->i_vino.snap == pvino->snap;
505}
506
507static inline struct inode *ceph_find_inode(struct super_block *sb,
508 struct ceph_vino vino)
509{
510 ino_t t = ceph_vino_to_ino(vino);
511 return ilookup5(sb, t, ceph_ino_compare, &vino);
512}
513
514
515/* 438/*
516 * caps helpers 439 * caps helpers
517 */ 440 */
@@ -576,18 +499,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
576 struct ceph_cap_reservation *ctx, int need); 499 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 500extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx); 501 struct ceph_cap_reservation *ctx);
579extern void ceph_reservation_status(struct ceph_client *client, 502extern void ceph_reservation_status(struct ceph_fs_client *client,
580 int *total, int *avail, int *used, 503 int *total, int *avail, int *used,
581 int *reserved, int *min); 504 int *reserved, int *min);
582 505
583static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 506static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
584{ 507{
585 return (struct ceph_client *)inode->i_sb->s_fs_info; 508 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
586} 509}
587 510
588static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 511static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
589{ 512{
590 return (struct ceph_client *)sb->s_fs_info; 513 return (struct ceph_fs_client *)sb->s_fs_info;
591} 514}
592 515
593 516
@@ -617,51 +540,6 @@ struct ceph_file_info {
617 540
618 541
619/* 542/*
620 * snapshots
621 */
622
623/*
624 * A "snap context" is the set of existing snapshots when we
625 * write data. It is used by the OSD to guide its COW behavior.
626 *
627 * The ceph_snap_context is refcounted, and attached to each dirty
628 * page, indicating which context the dirty data belonged when it was
629 * dirtied.
630 */
631struct ceph_snap_context {
632 atomic_t nref;
633 u64 seq;
634 int num_snaps;
635 u64 snaps[];
636};
637
638static inline struct ceph_snap_context *
639ceph_get_snap_context(struct ceph_snap_context *sc)
640{
641 /*
642 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
643 atomic_read(&sc->nref)+1);
644 */
645 if (sc)
646 atomic_inc(&sc->nref);
647 return sc;
648}
649
650static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
651{
652 if (!sc)
653 return;
654 /*
655 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
656 atomic_read(&sc->nref)-1);
657 */
658 if (atomic_dec_and_test(&sc->nref)) {
659 /*printk(" deleting snap_context %p\n", sc);*/
660 kfree(sc);
661 }
662}
663
664/*
665 * A "snap realm" describes a subset of the file hierarchy sharing 543 * A "snap realm" describes a subset of the file hierarchy sharing
666 * the same set of snapshots that apply to it. The realms themselves 544 * the same set of snapshots that apply to it. The realms themselves
667 * are organized into a hierarchy, such that children inherit (some of) 545 * are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +577,33 @@ struct ceph_snap_realm {
699 spinlock_t inodes_with_caps_lock; 577 spinlock_t inodes_with_caps_lock;
700}; 578};
701 579
702 580static inline int default_congestion_kb(void)
703
704/*
705 * calculate the number of pages a given length and offset map onto,
706 * if we align the data.
707 */
708static inline int calc_pages_for(u64 off, u64 len)
709{ 581{
710 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 582 int congestion_kb;
711 (off >> PAGE_CACHE_SHIFT); 583
584 /*
585 * Copied from NFS
586 *
587 * congestion size, scale with available memory.
588 *
589 * 64MB: 8192k
590 * 128MB: 11585k
591 * 256MB: 16384k
592 * 512MB: 23170k
593 * 1GB: 32768k
594 * 2GB: 46340k
595 * 4GB: 65536k
596 * 8GB: 92681k
597 * 16GB: 131072k
598 *
599 * This allows larger machines to have larger/more transfers.
600 * Limit the default to 256M
601 */
602 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
603 if (congestion_kb > 256*1024)
604 congestion_kb = 256*1024;
605
606 return congestion_kb;
712} 607}
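The table checks out against the formula, assuming 4 KB pages: a 1 GB machine has totalram_pages = 262144, int_sqrt(262144) = 512, and (16 * 512) << (PAGE_SHIFT - 10) = 8192 << 2 = 32768k, matching the 1GB row. The 256M ceiling only engages above roughly 64 GB of RAM, where 64 * int_sqrt(totalram_pages) would exceed 262144k.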
713 608
714 609
@@ -741,16 +636,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
741 ci_item)->writing; 636 ci_item)->writing;
742} 637}
743 638
744
745/* super.c */
746extern struct kmem_cache *ceph_inode_cachep;
747extern struct kmem_cache *ceph_cap_cachep;
748extern struct kmem_cache *ceph_dentry_cachep;
749extern struct kmem_cache *ceph_file_cachep;
750
751extern const char *ceph_msg_type_name(int type);
752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
753
754/* inode.c */ 639/* inode.c */
755extern const struct inode_operations ceph_file_iops; 640extern const struct inode_operations ceph_file_iops;
756 641
@@ -781,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
781extern void ceph_queue_writeback(struct inode *inode); 666extern void ceph_queue_writeback(struct inode *inode);
782 667
783extern int ceph_do_getattr(struct inode *inode, int mask); 668extern int ceph_do_getattr(struct inode *inode, int mask);
784extern int ceph_permission(struct inode *inode, int mask); 669extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
785extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 670extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
786extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, 671extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
787 struct kstat *stat); 672 struct kstat *stat);
@@ -857,12 +742,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
857/* file.c */ 742/* file.c */
858extern const struct file_operations ceph_file_fops; 743extern const struct file_operations ceph_file_fops;
859extern const struct address_space_operations ceph_aops; 744extern const struct address_space_operations ceph_aops;
745extern int ceph_copy_to_page_vector(struct page **pages,
746 const char *data,
747 loff_t off, size_t len);
748extern int ceph_copy_from_page_vector(struct page **pages,
749 char *data,
750 loff_t off, size_t len);
751extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
860extern int ceph_open(struct inode *inode, struct file *file); 752extern int ceph_open(struct inode *inode, struct file *file);
861extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 753extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
862 struct nameidata *nd, int mode, 754 struct nameidata *nd, int mode,
863 int locked_dir); 755 int locked_dir);
864extern int ceph_release(struct inode *inode, struct file *filp); 756extern int ceph_release(struct inode *inode, struct file *filp);
865extern void ceph_release_page_vector(struct page **pages, int num_pages);
866 757
867/* dir.c */ 758/* dir.c */
868extern const struct file_operations ceph_dir_fops; 759extern const struct file_operations ceph_dir_fops;
@@ -878,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
878extern void ceph_dentry_lru_touch(struct dentry *dn); 769extern void ceph_dentry_lru_touch(struct dentry *dn);
879extern void ceph_dentry_lru_del(struct dentry *dn); 770extern void ceph_dentry_lru_del(struct dentry *dn);
880extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
772extern unsigned ceph_dentry_hash(struct dentry *dn);
881 773
882/* 774/*
883 * our d_ops vary depending on whether the inode is live, 775 * our d_ops vary depending on whether the inode is live,
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
892/* export.c */ 784/* export.c */
893extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
894 786
895/* debugfs.c */
896extern int ceph_debugfs_init(void);
897extern void ceph_debugfs_cleanup(void);
898extern int ceph_debugfs_client_init(struct ceph_client *client);
899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
900
901/* locks.c */ 787/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
914 return NULL; 800 return NULL;
915} 801}
916 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
917#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -216,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
216 struct rb_node **p; 219 struct rb_node **p;
217 struct rb_node *parent = NULL; 220 struct rb_node *parent = NULL;
218 struct ceph_inode_xattr *xattr = NULL; 221 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name);
219 int c; 223 int c;
220 224
221 p = &ci->i_xattrs.index.rb_node; 225 p = &ci->i_xattrs.index.rb_node;
@@ -223,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
223 parent = *p; 227 parent = *p;
224 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
225 c = strncmp(name, xattr->name, xattr->name_len); 229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len)
231 c = 1;
226 if (c < 0) 232 if (c < 0)
227 p = &(*p)->rb_left; 233 p = &(*p)->rb_left;
228 else if (c > 0) 234 else if (c > 0)
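
The two added lines fix a prefix-collision bug in this red-black-tree lookup: strncmp() bounded by the stored entry's name_len reports a match whenever the stored name is a prefix of the name being looked up. A worked example with hypothetical names:

    /* stored entry "user.foo" has name_len == 8 */
    c = strncmp("user.foobar", "user.foo", 8);  /* == 0, a false match */
    /* the added check sees name_len 11 > 8 and forces c = 1, so the
       lookup keeps descending right instead of returning the wrong entry */
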
@@ -620,12 +626,12 @@ out:
620static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 626static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
621 const char *value, size_t size, int flags) 627 const char *value, size_t size, int flags)
622{ 628{
623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
624 struct inode *inode = dentry->d_inode; 630 struct inode *inode = dentry->d_inode;
625 struct ceph_inode_info *ci = ceph_inode(inode); 631 struct ceph_inode_info *ci = ceph_inode(inode);
626 struct inode *parent_inode = dentry->d_parent->d_inode; 632 struct inode *parent_inode = dentry->d_parent->d_inode;
627 struct ceph_mds_request *req; 633 struct ceph_mds_request *req;
628 struct ceph_mds_client *mdsc = &client->mdsc; 634 struct ceph_mds_client *mdsc = fsc->mdsc;
629 int err; 635 int err;
630 int i, nr_pages; 636 int i, nr_pages;
631 struct page **pages = NULL; 637 struct page **pages = NULL;
@@ -713,10 +719,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
713 719
714 /* preallocate memory for xattr name, value, index node */ 720 /* preallocate memory for xattr name, value, index node */
715 err = -ENOMEM; 721 err = -ENOMEM;
716 newname = kmalloc(name_len + 1, GFP_NOFS); 722 newname = kmemdup(name, name_len + 1, GFP_NOFS);
717 if (!newname) 723 if (!newname)
718 goto out; 724 goto out;
719 memcpy(newname, name, name_len + 1);
720 725
721 if (val_len) { 726 if (val_len) {
722 newval = kmalloc(val_len + 1, GFP_NOFS); 727 newval = kmalloc(val_len + 1, GFP_NOFS);
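
kmemdup() is exactly the kmalloc()+memcpy() pair it replaces here, folded into one call; semantically it behaves like this sketch:

    static void *kmemdup_equiv(const void *src, size_t len, gfp_t gfp)
    {
            void *p = kmalloc(len, gfp);    /* same flags as the open-coded version */
            if (p)
                    memcpy(p, src, len);
            return p;
    }
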
@@ -777,8 +782,8 @@ out:
777 782
778static int ceph_send_removexattr(struct dentry *dentry, const char *name) 783static int ceph_send_removexattr(struct dentry *dentry, const char *name)
779{ 784{
780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 785 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
781 struct ceph_mds_client *mdsc = &client->mdsc; 786 struct ceph_mds_client *mdsc = fsc->mdsc;
782 struct inode *inode = dentry->d_inode; 787 struct inode *inode = dentry->d_inode;
783 struct inode *parent_inode = dentry->d_parent->d_inode; 788 struct inode *parent_inode = dentry->d_parent->d_inode;
784 struct ceph_mds_request *req; 789 struct ceph_mds_request *req;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 143d393881cb..dca9e5e0f73b 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; 59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
60 60
61/* index in the above */ 61/* index in the above */
62static inline int major_to_index(int major) 62static inline int major_to_index(unsigned major)
63{ 63{
64 return major % CHRDEV_MAJOR_HASH_SIZE; 64 return major % CHRDEV_MAJOR_HASH_SIZE;
65} 65}
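
Widening major to unsigned matters because C's % operator keeps the sign of a negative dividend, so a negative major passed through the old signed prototype could have produced a negative index into chrdevs[]. For example:

    int bad = -7 % 255;               /* -7: would index before chrdevs[0] */
    unsigned ok = (unsigned)-7 % 255; /* 4294967289 % 255 == 249, a valid slot */
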
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
417 return ret; 417 return ret;
418} 418}
419 419
420int cdev_index(struct inode *inode)
421{
422 int idx;
423 struct kobject *kobj;
424
425 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
426 if (!kobj)
427 return -1;
428 kobject_put(kobj);
429 return idx;
430}
431
432void cd_forget(struct inode *inode) 420void cd_forget(struct inode *inode)
433{ 421{
434 spin_lock(&cdev_lock); 422 spin_lock(&cdev_lock);
@@ -456,6 +444,7 @@ static void cdev_purge(struct cdev *cdev)
456 */ 444 */
457const struct file_operations def_chr_fops = { 445const struct file_operations def_chr_fops = {
458 .open = chrdev_open, 446 .open = chrdev_open,
447 .llseek = noop_llseek,
459}; 448};
460 449
461static struct kobject *exact_match(dev_t dev, int *part, void *data) 450static struct kobject *exact_match(dev_t dev, int *part, void *data)
@@ -581,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
581EXPORT_SYMBOL(cdev_alloc); 570EXPORT_SYMBOL(cdev_alloc);
582EXPORT_SYMBOL(cdev_del); 571EXPORT_SYMBOL(cdev_del);
583EXPORT_SYMBOL(cdev_add); 572EXPORT_SYMBOL(cdev_add);
584EXPORT_SYMBOL(cdev_index);
585EXPORT_SYMBOL(__register_chrdev); 573EXPORT_SYMBOL(__register_chrdev);
586EXPORT_SYMBOL(__unregister_chrdev); 574EXPORT_SYMBOL(__unregister_chrdev);
587EXPORT_SYMBOL(directly_mappable_cdev_bdi); 575EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,11 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO
6 select CRYPTO_MD4
7 select CRYPTO_MD5
8 select CRYPTO_HMAC
9 select CRYPTO_ARC4
5 help 10 help
6 This is the client VFS module for the Common Internet File System 11 This is the client VFS module for the Common Internet File System
7 (CIFS) protocol which is the successor to the Server Message Block 12 (CIFS) protocol which is the successor to the Server Message Block
@@ -140,6 +145,13 @@ config CIFS_FSCACHE
140 to be cached locally on disk through the general filesystem cache 145 to be cached locally on disk through the general filesystem cache
141 manager. If unsure, say N. 146 manager. If unsure, say N.
142 147
148config CIFS_ACL
149 bool "Provide CIFS ACL support (EXPERIMENTAL)"
150 depends on EXPERIMENTAL && CIFS_XATTR
151 help
 152	  Allows fetching the CIFS/NTFS ACL from the server. The DACL blob
 153	  is handed over to the application/caller.
154
143config CIFS_EXPERIMENTAL 155config CIFS_EXPERIMENTAL
144 bool "CIFS Experimental Features (EXPERIMENTAL)" 156 bool "CIFS Experimental Features (EXPERIMENTAL)"
145 depends on CIFS && EXPERIMENTAL 157 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bdc..d87558448e3d 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,8 +5,10 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o
10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
10 12
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 13cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 14
diff --git a/fs/cifs/README b/fs/cifs/README
index 7099a526f775..fe1683590828 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
337 wsize default write size (default 57344) 337 wsize default write size (default 57344)
338 maximum wsize currently allowed by CIFS is 57344 (fourteen 338 maximum wsize currently allowed by CIFS is 57344 (fourteen
339 4096 byte pages) 339 4096 byte pages)
340 actimeo=n attribute cache timeout in seconds (default 1 second).
341 After this timeout, the cifs client requests fresh attribute
 342	 information from the server. This option allows tuning the
 343	 attribute cache timeout to suit workload needs. Shorter
 344	 timeouts mean better cache coherency, but an increased number
 345	 of calls to the server. Longer timeouts mean a reduced number
 346	 of calls to the server at the expense of less strict cache
 347	 coherency checks (i.e. the attribute cache may be incorrect for
 348	 a short period of time).
340 rw mount the network share read-write (note that the 349 rw mount the network share read-write (note that the
341 server may still consider the share read-only) 350 server may still consider the share read-only)
342 ro mount network share read-only 351 ro mount network share read-only
@@ -443,6 +452,11 @@ A partial list of the supported mount options follows:
443 if oplock (caching token) is granted and held. Note that 452 if oplock (caching token) is granted and held. Note that
444 direct allows write operations larger than page size 453 direct allows write operations larger than page size
445 to be sent to the server. 454 to be sent to the server.
 455 strictcache	Switches on strict cache mode. In this mode the
 456		client reads from the cache as long as it holds Oplock Level II;
 457		otherwise it reads from the server. All written data are stored
 458		in the cache, but if the client doesn't hold an Exclusive Oplock,
 459		it also writes the data to the server.
446 acl Allow setfacl and getfacl to manage posix ACLs if server 460 acl Allow setfacl and getfacl to manage posix ACLs if server
447 supports them. (default) 461 supports them. (default)
448 noacl Do not allow setfacl and getfacl calls on this mount 462 noacl Do not allow setfacl and getfacl calls on this mount
@@ -527,6 +541,11 @@ A partial list of the supported mount options follows:
527 SFU does). In the future the bottom 9 bits of the 541 SFU does). In the future the bottom 9 bits of the
528 mode also will be emulated using queries of the security 542 mode also will be emulated using queries of the security
529 descriptor (ACL). 543 descriptor (ACL).
544 mfsymlinks Enable support for Minshall+French symlinks
545 (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
546 This option is ignored when specified together with the
547 'sfu' option. Minshall+French symlinks are used even if
548 the server supports the CIFS Unix Extensions.
530 sign Must use packet signing (helps avoid unwanted data modification 549 sign Must use packet signing (helps avoid unwanted data modification
531 by intermediate systems in the route). Note that signing 550 by intermediate systems in the route). Note that signing
532 does not work with lanman or plaintext authentication. 551 does not work with lanman or plaintext authentication.
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
81 81
82v) mount check for unmatched uids 82v) mount check for unmatched uids
83 83
84w) Add support for new vfs entry points for setlease and fallocate 84w) Add support for new vfs entry point for fallocate
85 85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server) 87processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fcc..e654dfd092c3 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf) 64 void *buffer, uint16_t maxbuf)
65{ 65{
66 const struct TCP_Server_Info *server = cookie_netfs_data; 66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; 67 const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
68 const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
69 const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
68 struct cifs_server_key *key = buffer; 70 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key); 71 uint16_t key_len = sizeof(struct cifs_server_key);
70 72
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
76 */ 78 */
77 switch (sa->sa_family) { 79 switch (sa->sa_family) {
78 case AF_INET: 80 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family; 81 key->family = sa->sa_family;
80 key->port = server->addr.sockAddr.sin_port; 82 key->port = addr->sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; 83 key->addr[0].ipv4_addr = addr->sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr); 84 key_len += sizeof(key->addr[0].ipv4_addr);
83 break; 85 break;
84 86
85 case AF_INET6: 87 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family; 88 key->family = sa->sa_family;
87 key->port = server->addr.sockAddr6.sin6_port; 89 key->port = addr6->sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; 90 key->addr[0].ipv6_addr = addr6->sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr); 91 key_len += sizeof(key->addr[0].ipv6_addr);
90 break; 92 break;
91 93
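
The rewritten key builder reads the peer address from a generic struct sockaddr_storage, so the per-family casts above are the standard idiom: inspect ss_family first, then reinterpret the storage as sockaddr_in or sockaddr_in6. A self-contained sketch of the same pattern (hypothetical helper, not in this patch):

    static __be16 peer_port(const struct sockaddr_storage *ss)
    {
            if (ss->ss_family == AF_INET)
                    return ((const struct sockaddr_in *)ss)->sin_port;
            if (ss->ss_family == AF_INET6)
                    return ((const struct sockaddr_in6 *)ss)->sin6_port;
            return 0;
    }
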
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index eb1ba493489f..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->callback_data,
87 mid_entry->mid); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -119,36 +119,34 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: "); 122 seq_printf(m, "Features:");
123#ifdef CONFIG_CIFS_DFS_UPCALL 123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs"); 124 seq_printf(m, " dfs");
125 seq_putc(m, ' ');
126#endif 125#endif
127#ifdef CONFIG_CIFS_FSCACHE 126#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache"); 127 seq_printf(m, " fscache");
129 seq_putc(m, ' ');
130#endif 128#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH 129#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman"); 130 seq_printf(m, " lanman");
133 seq_putc(m, ' ');
134#endif 131#endif
135#ifdef CONFIG_CIFS_POSIX 132#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix"); 133 seq_printf(m, " posix");
137 seq_putc(m, ' ');
138#endif 134#endif
139#ifdef CONFIG_CIFS_UPCALL 135#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego"); 136 seq_printf(m, " spnego");
141 seq_putc(m, ' ');
142#endif 137#endif
143#ifdef CONFIG_CIFS_XATTR 138#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr"); 139 seq_printf(m, " xattr");
140#endif
141#ifdef CONFIG_CIFS_ACL
142 seq_printf(m, " acl");
145#endif 143#endif
146 seq_putc(m, '\n'); 144 seq_putc(m, '\n');
147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 145 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
148 seq_printf(m, "Servers:"); 146 seq_printf(m, "Servers:");
149 147
150 i = 0; 148 i = 0;
151 read_lock(&cifs_tcp_ses_lock); 149 spin_lock(&cifs_tcp_ses_lock);
152 list_for_each(tmp1, &cifs_tcp_ses_list) { 150 list_for_each(tmp1, &cifs_tcp_ses_list) {
153 server = list_entry(tmp1, struct TCP_Server_Info, 151 server = list_entry(tmp1, struct TCP_Server_Info,
154 tcp_ses_list); 152 tcp_ses_list);
@@ -220,17 +218,17 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
220 mid_entry = list_entry(tmp3, struct mid_q_entry, 218 mid_entry = list_entry(tmp3, struct mid_q_entry,
221 qhead); 219 qhead);
222 seq_printf(m, "\tState: %d com: %d pid:" 220 seq_printf(m, "\tState: %d com: %d pid:"
223 " %d tsk: %p mid %d\n", 221 " %d cbdata: %p mid %d\n",
224 mid_entry->midState, 222 mid_entry->midState,
225 (int)mid_entry->command, 223 (int)mid_entry->command,
226 mid_entry->pid, 224 mid_entry->pid,
227 mid_entry->tsk, 225 mid_entry->callback_data,
228 mid_entry->mid); 226 mid_entry->mid);
229 } 227 }
230 spin_unlock(&GlobalMid_Lock); 228 spin_unlock(&GlobalMid_Lock);
231 } 229 }
232 } 230 }
233 read_unlock(&cifs_tcp_ses_lock); 231 spin_unlock(&cifs_tcp_ses_lock);
234 seq_putc(m, '\n'); 232 seq_putc(m, '\n');
235 233
236 /* BB add code to dump additional info such as TCP session info now */ 234 /* BB add code to dump additional info such as TCP session info now */
@@ -270,7 +268,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
270 atomic_set(&totBufAllocCount, 0); 268 atomic_set(&totBufAllocCount, 0);
271 atomic_set(&totSmBufAllocCount, 0); 269 atomic_set(&totSmBufAllocCount, 0);
272#endif /* CONFIG_CIFS_STATS2 */ 270#endif /* CONFIG_CIFS_STATS2 */
273 read_lock(&cifs_tcp_ses_lock); 271 spin_lock(&cifs_tcp_ses_lock);
274 list_for_each(tmp1, &cifs_tcp_ses_list) { 272 list_for_each(tmp1, &cifs_tcp_ses_list) {
275 server = list_entry(tmp1, struct TCP_Server_Info, 273 server = list_entry(tmp1, struct TCP_Server_Info,
276 tcp_ses_list); 274 tcp_ses_list);
@@ -303,7 +301,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
303 } 301 }
304 } 302 }
305 } 303 }
306 read_unlock(&cifs_tcp_ses_lock); 304 spin_unlock(&cifs_tcp_ses_lock);
307 } 305 }
308 306
309 return count; 307 return count;
@@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
333 atomic_read(&totSmBufAllocCount)); 331 atomic_read(&totSmBufAllocCount));
334#endif /* CONFIG_CIFS_STATS2 */ 332#endif /* CONFIG_CIFS_STATS2 */
335 333
336 seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); 334 seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
337 seq_printf(m, 335 seq_printf(m,
338 "\n%d session %d share reconnects\n", 336 "\n%d session %d share reconnects\n",
339 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); 337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
@@ -343,7 +341,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
343 GlobalCurrentXid, GlobalMaxActiveXid); 341 GlobalCurrentXid, GlobalMaxActiveXid);
344 342
345 i = 0; 343 i = 0;
346 read_lock(&cifs_tcp_ses_lock); 344 spin_lock(&cifs_tcp_ses_lock);
347 list_for_each(tmp1, &cifs_tcp_ses_list) { 345 list_for_each(tmp1, &cifs_tcp_ses_list) {
348 server = list_entry(tmp1, struct TCP_Server_Info, 346 server = list_entry(tmp1, struct TCP_Server_Info,
349 tcp_ses_list); 347 tcp_ses_list);
@@ -397,7 +395,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
397 } 395 }
398 } 396 }
399 } 397 }
400 read_unlock(&cifs_tcp_ses_lock); 398 spin_unlock(&cifs_tcp_ses_lock);
401 399
402 seq_putc(m, '\n'); 400 seq_putc(m, '\n');
403 return 0; 401 return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0c..8942b28cf807 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
34extern int traceSMB; /* flag which enables the function below */ 34extern int traceSMB; /* flag which enables the function below */
35void dump_smb(struct smb_hdr *, int); 35void dump_smb(struct smb_hdr *, int);
36#define CIFS_INFO 0x01 36#define CIFS_INFO 0x01
37#define CIFS_RC 0x02 37#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 38#define CIFS_TIMER 0x04
39 39
40/* 40/*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d6ced7aa23cf..0a265ad9e426 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
44void cifs_dfs_release_automount_timer(void) 44void cifs_dfs_release_automount_timer(void)
45{ 45{
46 BUG_ON(!list_empty(&cifs_dfs_automount_list)); 46 BUG_ON(!list_empty(&cifs_dfs_automount_list));
47 cancel_delayed_work(&cifs_dfs_automount_task); 47 cancel_delayed_work_sync(&cifs_dfs_automount_task);
48 flush_scheduled_work();
49} 48}
50 49
51/** 50/**
@@ -256,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
256 255
257} 256}
258 257
259static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
260 struct list_head *mntlist)
261{
262 /* stolen from afs code */
263 int err;
264
265 mntget(newmnt);
266 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
267 switch (err) {
268 case 0:
269 path_put(&nd->path);
270 nd->path.mnt = newmnt;
271 nd->path.dentry = dget(newmnt->mnt_root);
272 schedule_delayed_work(&cifs_dfs_automount_task,
273 cifs_dfs_mountpoint_expiry_timeout);
274 break;
275 case -EBUSY:
276 /* someone else made a mount here whilst we were busy */
277 while (d_mountpoint(nd->path.dentry) &&
278 follow_down(&nd->path))
279 ;
280 err = 0;
281 default:
282 mntput(newmnt);
283 break;
284 }
285 return err;
286}
287
288static void dump_referral(const struct dfs_info3_param *ref) 258static void dump_referral(const struct dfs_info3_param *ref)
289{ 259{
290 cFYI(1, "DFS: ref path: %s", ref->path_name); 260 cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -294,34 +264,23 @@ static void dump_referral(const struct dfs_info3_param *ref)
294 ref->path_consumed); 264 ref->path_consumed);
295} 265}
296 266
297 267/*
298static void* 268 * Create a vfsmount that we can automount
299cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 269 */
270static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
300{ 271{
301 struct dfs_info3_param *referrals = NULL; 272 struct dfs_info3_param *referrals = NULL;
302 unsigned int num_referrals = 0; 273 unsigned int num_referrals = 0;
303 struct cifs_sb_info *cifs_sb; 274 struct cifs_sb_info *cifs_sb;
304 struct cifsSesInfo *ses; 275 struct cifsSesInfo *ses;
305 char *full_path = NULL; 276 char *full_path;
306 int xid, i; 277 int xid, i;
307 int rc = 0; 278 int rc;
308 struct vfsmount *mnt = ERR_PTR(-ENOENT); 279 struct vfsmount *mnt;
280 struct tcon_link *tlink;
309 281
310 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 283 BUG_ON(IS_ROOT(mntpt));
312
313 xid = GetXid();
314
315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry);
317
318 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
319 ses = cifs_sb->tcon->ses;
320
321 if (!ses) {
322 rc = -EINVAL;
323 goto out_err;
324 }
325 284
326 /* 285 /*
327 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
@@ -329,56 +288,83 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
329 * the double backslashes usually used in the UNC. This function 288 * the double backslashes usually used in the UNC. This function
330 * gives us the latter, so we must adjust the result. 289 * gives us the latter, so we must adjust the result.
331 */ 290 */
332 full_path = build_path_from_dentry(dentry); 291 mnt = ERR_PTR(-ENOMEM);
333 if (full_path == NULL) { 292 full_path = build_path_from_dentry(mntpt);
334 rc = -ENOMEM; 293 if (full_path == NULL)
335 goto out_err; 294 goto cdda_exit;
295
296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
297 tlink = cifs_sb_tlink(cifs_sb);
298 if (IS_ERR(tlink)) {
299 mnt = ERR_CAST(tlink);
300 goto free_full_path;
336 } 301 }
302 ses = tlink_tcon(tlink)->ses;
337 303
338 rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls, 304 xid = GetXid();
305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
339 &num_referrals, &referrals, 306 &num_referrals, &referrals,
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
309
310 cifs_put_tlink(tlink);
341 311
312 mnt = ERR_PTR(-ENOENT);
342 for (i = 0; i < num_referrals; i++) { 313 for (i = 0; i < num_referrals; i++) {
343 int len; 314 int len;
344 dump_referral(referrals+i); 315 dump_referral(referrals + i);
345 /* connect to a node */ 316 /* connect to a node */
346 len = strlen(referrals[i].node_name); 317 len = strlen(referrals[i].node_name);
347 if (len < 2) { 318 if (len < 2) {
348 cERROR(1, "%s: Net Address path too short: %s", 319 cERROR(1, "%s: Net Address path too short: %s",
349 __func__, referrals[i].node_name); 320 __func__, referrals[i].node_name);
350 rc = -EINVAL; 321 mnt = ERR_PTR(-EINVAL);
351 goto out_err; 322 break;
352 } 323 }
353 mnt = cifs_dfs_do_refmount(cifs_sb, 324 mnt = cifs_dfs_do_refmount(cifs_sb,
354 full_path, referrals + i); 325 full_path, referrals + i);
355 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 326 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
356 referrals[i].node_name, mnt); 327 referrals[i].node_name, mnt);
357
 358 /* complete mount procedure if we acquired submount */
359 if (!IS_ERR(mnt)) 328 if (!IS_ERR(mnt))
360 break; 329 goto success;
361 } 330 }
362 331
363 /* we need it cause for() above could exit without valid submount */ 332 /* no valid submounts were found; return error from get_dfs_path() by
364 rc = PTR_ERR(mnt); 333 * preference */
365 if (IS_ERR(mnt)) 334 if (rc != 0)
366 goto out_err; 335 mnt = ERR_PTR(rc);
367 336
368 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); 337success:
369
370out:
371 FreeXid(xid);
372 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
339free_full_path:
373 kfree(full_path); 340 kfree(full_path);
341cdda_exit:
374 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
375 return ERR_PTR(rc); 343 return mnt;
376out_err: 344}
377 path_put(&nd->path); 345
378 goto out; 346/*
347 * Attempt to automount the referral
348 */
349struct vfsmount *cifs_dfs_d_automount(struct path *path)
350{
351 struct vfsmount *newmnt;
352
353 cFYI(1, "in %s", __func__);
354
355 newmnt = cifs_dfs_do_automount(path->dentry);
356 if (IS_ERR(newmnt)) {
357 cFYI(1, "leaving %s [automount failed]" , __func__);
358 return newmnt;
359 }
360
361 mntget(newmnt); /* prevent immediate expiration */
362 mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
363 schedule_delayed_work(&cifs_dfs_automount_task,
364 cifs_dfs_mountpoint_expiry_timeout);
365 cFYI(1, "leaving %s [ok]" , __func__);
366 return newmnt;
379} 367}
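
cifs_dfs_d_automount() targets the then-new VFS automount interface: rather than a follow_link hack that rewrote the nameidata (the code deleted above), the filesystem hands back a vfsmount and the VFS splices it into the namespace; the extra mntget() plus mnt_set_expiry() keeps the new mount alive until the expiry worker prunes it. Presumably (the wiring is in another hunk of this series, not shown here) the function is published via the dentry operations, roughly:

    /* sketch; the actual d_op hookup is elsewhere in the series */
    const struct dentry_operations cifs_dfs_dentry_ops = {
            .d_automount = cifs_dfs_d_automount, /* invoked when a walk hits the referral */
    };
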
380 368
381const struct inode_operations cifs_dfs_referral_inode_operations = { 369const struct inode_operations cifs_dfs_referral_inode_operations = {
382 .follow_link = cifs_dfs_follow_mountpoint,
383}; 370};
384
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9e771450c3b8..ac51cd2d33ae 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/rbtree.h>
19
18#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
20 22
@@ -36,23 +38,30 @@
36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 38#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ 39#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
38#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
39 44
40struct cifs_sb_info { 45struct cifs_sb_info {
41 struct cifsTconInfo *tcon; /* primary mount */ 46 struct rb_root tlink_tree;
42 struct list_head nested_tcon_q; 47 spinlock_t tlink_tree_lock;
48 struct tcon_link *master_tlink;
43 struct nls_table *local_nls; 49 struct nls_table *local_nls;
44 unsigned int rsize; 50 unsigned int rsize;
45 unsigned int wsize; 51 unsigned int wsize;
52 unsigned long actimeo; /* attribute cache timeout (jiffies) */
53 atomic_t active;
46 uid_t mnt_uid; 54 uid_t mnt_uid;
47 gid_t mnt_gid; 55 gid_t mnt_gid;
48 mode_t mnt_file_mode; 56 mode_t mnt_file_mode;
49 mode_t mnt_dir_mode; 57 mode_t mnt_dir_mode;
50 int mnt_cifs_flags; 58 unsigned int mnt_cifs_flags;
51 int prepathlen; 59 int prepathlen;
52 char *prepath; /* relative path under the share to mount to */ 60 char *prepath; /* relative path under the share to mount to */
53#ifdef CONFIG_CIFS_DFS_UPCALL 61#ifdef CONFIG_CIFS_DFS_UPCALL
54 char *mountdata; /* mount options received at mount time */ 62 char *mountdata; /* mount options received at mount time */
55#endif 63#endif
56 struct backing_dev_info bdi; 64 struct backing_dev_info bdi;
65 struct delayed_work prune_tlinks;
57}; 66};
58#endif /* _CIFS_FS_SB_H */ 67#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1f..4dfba8283165 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
98cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 98cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
99{ 99{
100 struct TCP_Server_Info *server = sesInfo->server; 100 struct TCP_Server_Info *server = sesInfo->server;
101 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
102 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
101 char *description, *dp; 103 char *description, *dp;
102 size_t desc_len; 104 size_t desc_len;
103 struct key *spnego_key; 105 struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
127 dp = description + strlen(description); 129 dp = description + strlen(description);
128 130
129 /* add the server address */ 131 /* add the server address */
130 if (server->addr.sockAddr.sin_family == AF_INET) 132 if (server->dstaddr.ss_family == AF_INET)
131 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 133 sprintf(dp, "ip4=%pI4", &sa->sin_addr);
132 else if (server->addr.sockAddr.sin_family == AF_INET6) 134 else if (server->dstaddr.ss_family == AF_INET6)
133 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); 135 sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
134 else 136 else
135 goto out; 137 goto out;
136 138
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a1720..fc0fd4fde306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int charlen, outlen = 0; 44 int charlen, outlen = 0;
45 int maxwords = maxbytes / 2; 45 int maxwords = maxbytes / 2;
46 char tmp[NLS_MAX_CHARSET_SIZE]; 46 char tmp[NLS_MAX_CHARSET_SIZE];
47 __u16 ftmp;
47 48
48 for (i = 0; i < maxwords && from[i]; i++) { 49 for (i = 0; i < maxwords; i++) {
49 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 50 ftmp = get_unaligned_le16(&from[i]);
50 NLS_MAX_CHARSET_SIZE); 51 if (ftmp == 0)
52 break;
53
54 charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
51 if (charlen > 0) 55 if (charlen > 0)
52 outlen += charlen; 56 outlen += charlen;
53 else 57 else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
58} 62}
59 63
60/* 64/*
61 * cifs_mapchar - convert a little-endian char to proper char in codepage 65 * cifs_mapchar - convert a host-endian char to proper char in codepage
62 * @target - where converted character should be copied 66 * @target - where converted character should be copied
63 * @src_char - 2 byte little-endian source character 67 * @src_char - 2 byte host-endian source character
64 * @cp - codepage to which character should be converted 68 * @cp - codepage to which character should be converted
65 * @mapchar - should character be mapped according to mapchars mount option? 69 * @mapchar - should character be mapped according to mapchars mount option?
66 * 70 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
69 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
70 */ 74 */
71static int 75static int
72cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, 76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
73 bool mapchar) 77 bool mapchar)
74{ 78{
75 int len = 1; 79 int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
82 * build_path_from_dentry are modified, as they use slash as 86 * build_path_from_dentry are modified, as they use slash as
83 * separator. 87 * separator.
84 */ 88 */
85 switch (le16_to_cpu(src_char)) { 89 switch (src_char) {
86 case UNI_COLON: 90 case UNI_COLON:
87 *target = ':'; 91 *target = ':';
88 break; 92 break;
@@ -109,8 +113,7 @@ out:
109 return len; 113 return len;
110 114
111cp_convert: 115cp_convert:
112 len = cp->uni2char(le16_to_cpu(src_char), target, 116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
113 NLS_MAX_CHARSET_SIZE);
114 if (len <= 0) { 117 if (len <= 0) {
115 *target = '?'; 118 *target = '?';
116 len = 1; 119 len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
149 int nullsize = nls_nullsize(codepage); 152 int nullsize = nls_nullsize(codepage);
150 int fromwords = fromlen / 2; 153 int fromwords = fromlen / 2;
151 char tmp[NLS_MAX_CHARSET_SIZE]; 154 char tmp[NLS_MAX_CHARSET_SIZE];
155 __u16 ftmp;
152 156
153 /* 157 /*
154 * because the chars can be of varying widths, we need to take care 158 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
158 */ 162 */
159 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 163 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
160 164
161 for (i = 0; i < fromwords && from[i]; i++) { 165 for (i = 0; i < fromwords; i++) {
166 ftmp = get_unaligned_le16(&from[i]);
167 if (ftmp == 0)
168 break;
169
162 /* 170 /*
163 * check to see if converting this character might make the 171 * check to see if converting this character might make the
164 * conversion bleed into the null terminator 172 * conversion bleed into the null terminator
165 */ 173 */
166 if (outlen >= safelen) { 174 if (outlen >= safelen) {
167 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); 175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
168 if ((outlen + charlen) > (tolen - nullsize)) 176 if ((outlen + charlen) > (tolen - nullsize))
169 break; 177 break;
170 } 178 }
171 179
172 /* put converted char into 'to' buffer */ 180 /* put converted char into 'to' buffer */
173 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); 181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174 outlen += charlen; 182 outlen += charlen;
175 } 183 }
176 184
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
193{ 201{
194 int charlen; 202 int charlen;
195 int i; 203 int i;
196 wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
197 205
198 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
199 207 charlen = codepage->char2uni(from, len, &wchar_to);
200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 208 if (charlen < 1) {
203 cERROR(1, "strtoUCS: char2uni of %d returned %d", 209 cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
204 (int)*from, charlen); 210 *from, charlen);
205 /* A question mark */ 211 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 212 wchar_to = 0x003f;
207 charlen = 1; 213 charlen = 1;
208 } else 214 }
209 to[i] = cpu_to_le16(wchar_to[i]); 215 put_unaligned_le16(wchar_to, &to[i]);
210
211 } 216 }
212 217
213 to[i] = 0; 218 put_unaligned_le16(0, &to[i]);
214 return i; 219 return i;
215} 220}
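
Both conversion loops now go through get_unaligned_le16()/put_unaligned_le16() instead of dereferencing the __le16 buffer directly. That matters twice over: SMB wire buffers are not guaranteed to be 2-byte aligned, and the old wchar_t* cast stored host-endian values, which is wrong on big-endian machines. The essence of the change:

    __u16 ftmp = get_unaligned_le16(&from[i]);  /* alignment-safe load, LE to host */
    put_unaligned_le16(wchar_to, &to[i]);       /* host to LE, alignment-safe store */
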
216 221
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
252 return dst; 257 return dst;
253} 258}
254 259
260/*
261 * Convert 16 bit Unicode pathname to wire format from string in current code
262 * page. Conversion may involve remapping the six characters that are
263 * only legal in POSIX-like OSes (if they are present in the string). Path
264 * names are little-endian 16-bit Unicode on the wire.
265 */
266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
268 const struct nls_table *cp, int mapChars)
269{
270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char;
273 __u16 temp;
274
275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277
278 for (i = 0, j = 0; i < maxlen; j++) {
279 src_char = source[i];
280 switch (src_char) {
281 case 0:
282 put_unaligned_le16(0, &target[j]);
283 goto ctoUCS_out;
284 case ':':
285 temp = UNI_COLON;
286 break;
287 case '*':
288 temp = UNI_ASTERIK;
289 break;
290 case '?':
291 temp = UNI_QUESTION;
292 break;
293 case '<':
294 temp = UNI_LESSTHAN;
295 break;
296 case '>':
297 temp = UNI_GRTRTHAN;
298 break;
299 case '|':
300 temp = UNI_PIPE;
301 break;
302 /*
303 * FIXME: We cannot handle remapping backslash (UNI_SLASH)
304 * until all the calls to build_path_from_dentry are modified,
305 * as they use backslash as separator.
306 */
307 default:
308 charlen = cp->char2uni(source+i, len_remaining,
309 &temp);
310 /*
311 * if no match, use question mark, which at least in
312 * some cases serves as wild card
313 */
314 if (charlen < 1) {
315 temp = 0x003f;
316 charlen = 1;
317 }
318 len_remaining -= charlen;
319 /*
320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the
322 * target string
323 */
324 i += charlen;
325 continue;
326 }
327 put_unaligned_le16(temp, &target[j]);
328 i++; /* move to next char in source string */
329 len_remaining--;
330 }
331
332ctoUCS_out:
333 return i;
334}
335
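
The UNI_* remap targets are private-use code points in the U+F0xx range (UNI_COLON is 0xF022 if memory serves; the authoritative values live in cifs_unicode.h), following the SFU/Services-for-Mac convention. A hypothetical call, with cp standing in for some struct nls_table:

    /* "a:b" should come out as 0x0061, 0xF022, 0x0062, 0x0000 in LE UTF-16 */
    __le16 wire[4];
    cifsConvertToUCS(wire, "a:b", 4, cp, 1);
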
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32 32
33#ifdef CONFIG_CIFS_EXPERIMENTAL
34
35static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 33static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
36 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 34 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
37 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 35 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -43,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
43; 41;
44 42
45 43
46/* security id for everyone */ 44/* security id for everyone/world system group */
47static const struct cifs_sid sid_everyone = { 45static const struct cifs_sid sid_everyone = {
48 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 46 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
49/* group users */ 50/* group users */
50static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
51 52
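
For readers not fluent in raw SID initializers: the fields encode {revision, num_subauthorities, 48-bit authority, subauthorities[]}, so sid_everyone above is the well-known SID S-1-1-0 (World) and the newly added sid_authusers is S-1-5-11 (Authenticated Users), which is why it gets the same mode-bit treatment as Everyone in parse_dacl() below.
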
@@ -367,10 +368,14 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
367 if (num_aces > 0) { 368 if (num_aces > 0) {
368 umode_t user_mask = S_IRWXU; 369 umode_t user_mask = S_IRWXU;
369 umode_t group_mask = S_IRWXG; 370 umode_t group_mask = S_IRWXG;
370 umode_t other_mask = S_IRWXO; 371 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
371 372
372 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
373 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
374 379
375 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
376 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
@@ -392,6 +397,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
392 ppace[i]->type, 397 ppace[i]->type,
393 &fattr->cf_mode, 398 &fattr->cf_mode,
394 &other_mask); 399 &other_mask);
400 if (compare_sids(&(ppace[i]->sid), &sid_authusers))
401 access_flags_to_mode(ppace[i]->access_req,
402 ppace[i]->type,
403 &fattr->cf_mode,
404 &other_mask);
405
395 406
396/* memcpy((void *)(&(cifscred->aces[i])), 407/* memcpy((void *)(&(cifscred->aces[i])),
397 (void *)ppace[i], 408 (void *)ppace[i],
@@ -557,13 +568,20 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
557{ 568{
558 struct cifs_ntsd *pntsd = NULL; 569 struct cifs_ntsd *pntsd = NULL;
559 int xid, rc; 570 int xid, rc;
571 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
572
573 if (IS_ERR(tlink))
574 return ERR_CAST(tlink);
560 575
561 xid = GetXid(); 576 xid = GetXid();
562 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 577 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
563 FreeXid(xid); 578 FreeXid(xid);
564 579
580 cifs_put_tlink(tlink);
565 581
566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 582 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
583 if (rc)
584 return ERR_PTR(rc);
567 return pntsd; 585 return pntsd;
568} 586}
569 587
@@ -574,28 +592,34 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
574 int oplock = 0; 592 int oplock = 0;
575 int xid, rc; 593 int xid, rc;
576 __u16 fid; 594 __u16 fid;
595 struct cifsTconInfo *tcon;
596 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
577 597
598 if (IS_ERR(tlink))
599 return ERR_CAST(tlink);
600
601 tcon = tlink_tcon(tlink);
578 xid = GetXid(); 602 xid = GetXid();
579 603
580 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, 604 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 &fid, &oplock, NULL, cifs_sb->local_nls, 605 &fid, &oplock, NULL, cifs_sb->local_nls,
582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 606 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 if (rc) { 607 if (!rc) {
584 cERROR(1, "Unable to open file to get ACL"); 608 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
585 goto out; 609 CIFSSMBClose(xid, tcon, fid);
586 } 610 }
587 611
588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 612 cifs_put_tlink(tlink);
589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
590
591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
592 out:
593 FreeXid(xid); 613 FreeXid(xid);
614
615 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
616 if (rc)
617 return ERR_PTR(rc);
594 return pntsd; 618 return pntsd;
595} 619}
596 620
597/* Retrieve an ACL from the server */ 621/* Retrieve an ACL from the server */
598static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, 622struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
599 struct inode *inode, const char *path, 623 struct inode *inode, const char *path,
600 u32 *pacllen) 624 u32 *pacllen)
601{ 625{
@@ -603,7 +627,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
603 struct cifsFileInfo *open_file = NULL; 627 struct cifsFileInfo *open_file = NULL;
604 628
605 if (inode) 629 if (inode)
606 open_file = find_readable_file(CIFS_I(inode)); 630 open_file = find_readable_file(CIFS_I(inode), true);
607 if (!open_file) 631 if (!open_file)
608 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 632 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
609 633
@@ -616,10 +640,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
616 struct cifs_ntsd *pnntsd, u32 acllen) 640 struct cifs_ntsd *pnntsd, u32 acllen)
617{ 641{
618 int xid, rc; 642 int xid, rc;
643 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
644
645 if (IS_ERR(tlink))
646 return PTR_ERR(tlink);
619 647
620 xid = GetXid(); 648 xid = GetXid();
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 649 rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
622 FreeXid(xid); 650 FreeXid(xid);
651 cifs_put_tlink(tlink);
623 652
624 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 653 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
625 return rc; 654 return rc;
@@ -631,10 +660,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
631 int oplock = 0; 660 int oplock = 0;
632 int xid, rc; 661 int xid, rc;
633 __u16 fid; 662 __u16 fid;
663 struct cifsTconInfo *tcon;
664 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
665
666 if (IS_ERR(tlink))
667 return PTR_ERR(tlink);
634 668
669 tcon = tlink_tcon(tlink);
635 xid = GetXid(); 670 xid = GetXid();
636 671
637 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, 672 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
638 &fid, &oplock, NULL, cifs_sb->local_nls, 673 &fid, &oplock, NULL, cifs_sb->local_nls,
639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 674 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) { 675 if (rc) {
@@ -642,12 +677,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
642 goto out; 677 goto out;
643 } 678 }
644 679
645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 680 rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
646 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 681 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
647 682
648 CIFSSMBClose(xid, cifs_sb->tcon, fid); 683 CIFSSMBClose(xid, tcon, fid);
649 out: 684out:
650 FreeXid(xid); 685 FreeXid(xid);
686 cifs_put_tlink(tlink);
651 return rc; 687 return rc;
652} 688}
653 689
@@ -661,7 +697,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
661 697
662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); 698 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
663 699
664 open_file = find_readable_file(CIFS_I(inode)); 700 open_file = find_readable_file(CIFS_I(inode), true);
665 if (!open_file) 701 if (!open_file)
666 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 702 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
667 703
@@ -671,7 +707,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
671} 707}
672 708
673/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 709int
674void 710int
675cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 711cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
676 struct inode *inode, const char *path, const __u16 *pfid) 712 struct inode *inode, const char *path, const __u16 *pfid)
677{ 713{
@@ -687,17 +723,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
687 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); 723 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
688 724
689 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 725 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
690 if (pntsd) 726 if (IS_ERR(pntsd)) {
727 rc = PTR_ERR(pntsd);
728 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
729 } else {
691 rc = parse_sec_desc(pntsd, acllen, fattr); 730 rc = parse_sec_desc(pntsd, acllen, fattr);
692 if (rc) 731 kfree(pntsd);
693 cFYI(1, "parse sec desc failed rc = %d", rc); 732 if (rc)
733 cERROR(1, "parse sec desc failed rc = %d", rc);
734 }
694 735
695 kfree(pntsd); 736 return rc;
696 return;
697} 737}
698 738
699/* Convert mode bits to an ACL so we can update the ACL on the server */ 739/* Convert mode bits to an ACL so we can update the ACL on the server */
700int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) 740int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
701{ 741{
702 int rc = 0; 742 int rc = 0;
703 __u32 secdesclen = 0; 743 __u32 secdesclen = 0;
@@ -712,7 +752,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
712 /* Add three ACEs for owner, group, everyone getting rid of 752 /* Add three ACEs for owner, group, everyone getting rid of
713 other ACEs as chmod disables ACEs and set the security descriptor */ 753 other ACEs as chmod disables ACEs and set the security descriptor */
714 754
715 if (pntsd) { 755 if (IS_ERR(pntsd)) {
756 rc = PTR_ERR(pntsd);
757 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
758 } else {
716 /* allocate memory for the smb header, 759 /* allocate memory for the smb header,
717 set security descriptor request security descriptor 760 set security descriptor request security descriptor
718 parameters, and security descriptor itself */ 761 parameters, and security descriptor itself */
@@ -742,4 +785,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
742 785
743 return rc; 786 return rc;
744} 787}
745#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf5155..c4ae7d036563 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
74 char sidname[SIDNAMELENGTH]; 74 char sidname[SIDNAMELENGTH];
75} __attribute__((packed)); 75} __attribute__((packed));
76 76
77#ifdef CONFIG_CIFS_EXPERIMENTAL
78
79extern int match_sid(struct cifs_sid *); 77extern int match_sid(struct cifs_sid *);
80extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); 78extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
81 79
82#endif /* CONFIG_CIFS_EXPERIMENTAL */
83
84#endif /* _CIFSACL_H */ 80#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35042d8f7338..a51585f9852b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,9 +24,9 @@
24#include "cifspdu.h" 24#include "cifspdu.h"
25#include "cifsglob.h" 25#include "cifsglob.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "md5.h"
28#include "cifs_unicode.h" 27#include "cifs_unicode.h"
29#include "cifsproto.h" 28#include "cifsproto.h"
29#include "ntlmssp.h"
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/random.h> 31#include <linux/random.h>
32 32
@@ -36,27 +36,37 @@
36/* Note that the smb header signature field on input contains the 36/* Note that the smb header signature field on input contains the
37 sequence number before this function is called */ 37 sequence number before this function is called */
38 38
39extern void mdfour(unsigned char *out, unsigned char *in, int n);
40extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
41extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
42 unsigned char *p24);
43
44static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
45 const struct mac_key *key, char *signature) 40 struct TCP_Server_Info *server, char *signature)
46{ 41{
47 struct MD5Context context; 42 int rc;
48 43
49 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 44 if (cifs_pdu == NULL || signature == NULL || server == NULL)
50 return -EINVAL; 45 return -EINVAL;
51 46
52 cifs_MD5_init(&context); 47 if (!server->secmech.sdescmd5) {
53 cifs_MD5_update(&context, (char *)&key->data, key->len); 48 cERROR(1, "%s: Can't generate signature\n", __func__);
54 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 49 return -1;
50 }
51
52 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
53 if (rc) {
54 cERROR(1, "%s: Could not init md5\n", __func__);
55 return rc;
56 }
57
58 crypto_shash_update(&server->secmech.sdescmd5->shash,
59 server->session_key.response, server->session_key.len);
60
61 crypto_shash_update(&server->secmech.sdescmd5->shash,
62 cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
63
64 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
55 65
56 cifs_MD5_final(signature, &context);
57 return 0; 66 return 0;
58} 67}
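The conversion above moves signing from the private cifs_MD5_* helpers to the kernel crypto shash API, which always follows the same init/update/final sequence over a preallocated descriptor. A minimal sketch of that sequence, with error checking on the update calls (the helper name smb_md5_digest is illustrative, not part of this patch):

#include <crypto/hash.h>

/* Hypothetical helper: MD5 over key material followed by an SMB PDU,
 * using a shash descriptor such as server->secmech.sdescmd5->shash. */
static int smb_md5_digest(struct shash_desc *desc,
			  const u8 *key, unsigned int klen,
			  const u8 *pdu, unsigned int plen, u8 *out)
{
	int rc;

	rc = crypto_shash_init(desc);		/* reset the hash state */
	if (rc)
		return rc;
	rc = crypto_shash_update(desc, key, klen);	/* session key first */
	if (rc)
		return rc;
	rc = crypto_shash_update(desc, pdu, plen);	/* then the PDU bytes */
	if (rc)
		return rc;
	return crypto_shash_final(desc, out);	/* 16-byte MD5 digest */
}

Note that the patch itself ignores the return value of crypto_shash_update(); checking it, as above, would be slightly more robust.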
59 68
69/* must be called with server->srv_mutex held */
60int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 70int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
61 __u32 *pexpected_response_sequence_number) 71 __u32 *pexpected_response_sequence_number)
62{ 72{
@@ -69,17 +79,14 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
69 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 79 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
70 return rc; 80 return rc;
71 81
72 spin_lock(&GlobalMid_Lock);
73 cifs_pdu->Signature.Sequence.SequenceNumber = 82 cifs_pdu->Signature.Sequence.SequenceNumber =
74 cpu_to_le32(server->sequence_number); 83 cpu_to_le32(server->sequence_number);
75 cifs_pdu->Signature.Sequence.Reserved = 0; 84 cifs_pdu->Signature.Sequence.Reserved = 0;
76 85
77 *pexpected_response_sequence_number = server->sequence_number++; 86 *pexpected_response_sequence_number = server->sequence_number++;
78 server->sequence_number++; 87 server->sequence_number++;
79 spin_unlock(&GlobalMid_Lock);
80 88
81 rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, 89 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
82 smb_signature);
83 if (rc) 90 if (rc)
84 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 91 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
85 else 92 else
@@ -89,16 +96,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
89} 96}
90 97
91static int cifs_calc_signature2(const struct kvec *iov, int n_vec, 98static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
92 const struct mac_key *key, char *signature) 99 struct TCP_Server_Info *server, char *signature)
93{ 100{
94 struct MD5Context context;
95 int i; 101 int i;
102 int rc;
96 103
97 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 104 if (iov == NULL || signature == NULL || server == NULL)
98 return -EINVAL; 105 return -EINVAL;
99 106
100 cifs_MD5_init(&context); 107 if (!server->secmech.sdescmd5) {
101 cifs_MD5_update(&context, (char *)&key->data, key->len); 108 cERROR(1, "%s: Can't generate signature\n", __func__);
109 return -1;
110 }
111
112 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
113 if (rc) {
114 cERROR(1, "%s: Could not init md5\n", __func__);
115 return rc;
116 }
117
118 crypto_shash_update(&server->secmech.sdescmd5->shash,
119 server->session_key.response, server->session_key.len);
120
102 for (i = 0; i < n_vec; i++) { 121 for (i = 0; i < n_vec; i++) {
103 if (iov[i].iov_len == 0) 122 if (iov[i].iov_len == 0)
104 continue; 123 continue;
@@ -111,18 +130,19 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
111 if (i == 0) { 130 if (i == 0) {
112 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 131 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
113 break; /* nothing to sign or corrupt header */ 132 break; /* nothing to sign or corrupt header */
114 cifs_MD5_update(&context, iov[0].iov_base+4, 133 crypto_shash_update(&server->secmech.sdescmd5->shash,
115 iov[0].iov_len-4); 134 iov[i].iov_base + 4, iov[i].iov_len - 4);
116 } else 135 } else
117 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); 136 crypto_shash_update(&server->secmech.sdescmd5->shash,
137 iov[i].iov_base, iov[i].iov_len);
118 } 138 }
119 139
120 cifs_MD5_final(signature, &context); 140 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
121 141
122 return 0; 142 return rc;
123} 143}
124 144
125 145/* must be called with server->srv_mutex held */
126int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 146int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
127 __u32 *pexpected_response_sequence_number) 147 __u32 *pexpected_response_sequence_number)
128{ 148{
@@ -136,17 +156,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
136 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 156 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
137 return rc; 157 return rc;
138 158
139 spin_lock(&GlobalMid_Lock);
140 cifs_pdu->Signature.Sequence.SequenceNumber = 159 cifs_pdu->Signature.Sequence.SequenceNumber =
141 cpu_to_le32(server->sequence_number); 160 cpu_to_le32(server->sequence_number);
142 cifs_pdu->Signature.Sequence.Reserved = 0; 161 cifs_pdu->Signature.Sequence.Reserved = 0;
143 162
144 *pexpected_response_sequence_number = server->sequence_number++; 163 *pexpected_response_sequence_number = server->sequence_number++;
145 server->sequence_number++; 164 server->sequence_number++;
146 spin_unlock(&GlobalMid_Lock);
147 165
148 rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, 166 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
149 smb_signature);
150 if (rc) 167 if (rc)
151 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 168 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
152 else 169 else
@@ -156,14 +173,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
156} 173}
157 174
158int cifs_verify_signature(struct smb_hdr *cifs_pdu, 175int cifs_verify_signature(struct smb_hdr *cifs_pdu,
159 const struct mac_key *mac_key, 176 struct TCP_Server_Info *server,
160 __u32 expected_sequence_number) 177 __u32 expected_sequence_number)
161{ 178{
162 unsigned int rc; 179 unsigned int rc;
163 char server_response_sig[8]; 180 char server_response_sig[8];
164 char what_we_think_sig_should_be[20]; 181 char what_we_think_sig_should_be[20];
165 182
166 if ((cifs_pdu == NULL) || (mac_key == NULL)) 183 if (cifs_pdu == NULL || server == NULL)
167 return -EINVAL; 184 return -EINVAL;
168 185
169 if (cifs_pdu->Command == SMB_COM_NEGOTIATE) 186 if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +209,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
192 cpu_to_le32(expected_sequence_number); 209 cpu_to_le32(expected_sequence_number);
193 cifs_pdu->Signature.Sequence.Reserved = 0; 210 cifs_pdu->Signature.Sequence.Reserved = 0;
194 211
195 rc = cifs_calculate_signature(cifs_pdu, mac_key, 212 rc = cifs_calculate_signature(cifs_pdu, server,
196 what_we_think_sig_should_be); 213 what_we_think_sig_should_be);
197 214
198 if (rc) 215 if (rc)
@@ -208,19 +225,43 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
208 225
209} 226}
210 227
211/* We fill in key by putting in 40 byte array which was allocated by caller */ 228 /* first calculate the 24-byte ntlm response and then the 16-byte session key */
212int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 229int setup_ntlm_response(struct cifsSesInfo *ses)
213 const char *password)
214{ 230{
215 char temp_key[16]; 231 int rc = 0;
216 if ((key == NULL) || (rn == NULL)) 232 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
233 char temp_key[CIFS_SESS_KEY_SIZE];
234
235 if (!ses)
217 return -EINVAL; 236 return -EINVAL;
218 237
219 E_md4hash(password, temp_key); 238 ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
220 mdfour(key->data.ntlm, temp_key, 16); 239 if (!ses->auth_key.response) {
221 memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE); 240 cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
222 key->len = 40; 241 return -ENOMEM;
223 return 0; 242 }
243 ses->auth_key.len = temp_len;
244
245 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
246 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
247 if (rc) {
248 cFYI(1, "%s Can't generate NTLM response, error: %d",
249 __func__, rc);
250 return rc;
251 }
252
253 rc = E_md4hash(ses->password, temp_key);
254 if (rc) {
255 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
256 return rc;
257 }
258
259 rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
260 if (rc)
261 cFYI(1, "%s Can't generate NTLM session key, error: %d",
262 __func__, rc);
263
264 return rc;
224} 265}
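After setup_ntlm_response() the auth_key.response buffer carries the session key first and the challenge response after it. A sketch of the layout, assuming the sizes in use at the time (CIFS_SESS_KEY_SIZE = 16, CIFS_AUTH_RESP_SIZE = 24):

/*
 * ses->auth_key.response after setup_ntlm_response():
 *
 *   offset 0                          16                          40
 *   +---------------------------------+----------------------------+
 *   | NTLM session key                | NTLM challenge response    |
 *   | mdfour(E_md4hash(password))     | SMBNTencrypt(password,     |
 *   |                                 |              cryptkey)     |
 *   | CIFS_SESS_KEY_SIZE (16) bytes   | CIFS_AUTH_RESP_SIZE (24)   |
 *   +---------------------------------+----------------------------+
 */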
225 266
226#ifdef CONFIG_CIFS_WEAK_PW_HASH 267#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -262,109 +303,457 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
262} 303}
263#endif /* CIFS_WEAK_PW_HASH */ 304#endif /* CIFS_WEAK_PW_HASH */
264 305
265static int calc_ntlmv2_hash(struct cifsSesInfo *ses, 306/* Build a proper attribute value/target info pairs blob.
307 * Fill in the netbios and dns domain names and the workstation name,
308 * and the client time (total: five av pairs plus one end-of-fields indicator).
309 * Allocate domain name which gets freed when session struct is deallocated.
310 */
311static int
312build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
313{
314 unsigned int dlen;
315 unsigned int wlen;
316 unsigned int size = 6 * sizeof(struct ntlmssp2_name);
317 __le64 curtime;
318 char *defdmname = "WORKGROUP";
319 unsigned char *blobptr;
320 struct ntlmssp2_name *attrptr;
321
322 if (!ses->domainName) {
323 ses->domainName = kstrdup(defdmname, GFP_KERNEL);
324 if (!ses->domainName)
325 return -ENOMEM;
326 }
327
328 dlen = strlen(ses->domainName);
329 wlen = strlen(ses->server->hostname);
330
331 /* The length of this blob is:
332 * six times the size of the attribute header (name/size) structure +
333 * twice the unicode length of the domain name (which appears twice) +
334 * twice the unicode length of the server name (which appears twice) +
335 * the size of a timestamp (8 bytes).
336 */
337 ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
338 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
339 if (!ses->auth_key.response) {
340 ses->auth_key.len = 0;
341 cERROR(1, "Challenge target info allocation failure");
342 return -ENOMEM;
343 }
344
345 blobptr = ses->auth_key.response;
346 attrptr = (struct ntlmssp2_name *) blobptr;
347
348 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
349 attrptr->length = cpu_to_le16(2 * dlen);
350 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
351 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
352
353 blobptr += 2 * dlen;
354 attrptr = (struct ntlmssp2_name *) blobptr;
355
356 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
357 attrptr->length = cpu_to_le16(2 * wlen);
358 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
359 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
360
361 blobptr += 2 * wlen;
362 attrptr = (struct ntlmssp2_name *) blobptr;
363
364 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
365 attrptr->length = cpu_to_le16(2 * dlen);
366 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
367 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
368
369 blobptr += 2 * dlen;
370 attrptr = (struct ntlmssp2_name *) blobptr;
371
372 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
373 attrptr->length = cpu_to_le16(2 * wlen);
374 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
375 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
376
377 blobptr += 2 * wlen;
378 attrptr = (struct ntlmssp2_name *) blobptr;
379
380 attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
381 attrptr->length = cpu_to_le16(sizeof(__le64));
382 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
383 curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
384 memcpy(blobptr, &curtime, sizeof(__le64));
385
386 return 0;
387}
388
389/* Server has provided av pairs/target info in the type 2 challenge
390 * packet, and we have plucked it and stored it within the smb session.
391 * We parse that blob here to find the netbios domain name to be used
392 * as part of ntlmv2 authentication (in the Target String), if not already
393 * specified on the command line.
394 * If this function returns without error but without fetching a
395 * domain name, authentication may fail against some servers but
396 * succeed against others (those that are not particular about the
397 * target string; for some, just the user name might suffice).
398 */
399static int
400find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
401{
402 unsigned int attrsize;
403 unsigned int type;
404 unsigned int onesize = sizeof(struct ntlmssp2_name);
405 unsigned char *blobptr;
406 unsigned char *blobend;
407 struct ntlmssp2_name *attrptr;
408
409 if (!ses->auth_key.len || !ses->auth_key.response)
410 return 0;
411
412 blobptr = ses->auth_key.response;
413 blobend = blobptr + ses->auth_key.len;
414
415 while (blobptr + onesize < blobend) {
416 attrptr = (struct ntlmssp2_name *) blobptr;
417 type = le16_to_cpu(attrptr->type);
418 if (type == NTLMSSP_AV_EOL)
419 break;
420 blobptr += 2; /* advance attr type */
421 attrsize = le16_to_cpu(attrptr->length);
422 blobptr += 2; /* advance attr size */
423 if (blobptr + attrsize > blobend)
424 break;
425 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
426 if (!attrsize)
427 break;
428 if (!ses->domainName) {
429 ses->domainName =
430 kmalloc(attrsize + 1, GFP_KERNEL);
431 if (!ses->domainName)
432 return -ENOMEM;
433 cifs_from_ucs2(ses->domainName,
434 (__le16 *)blobptr, attrsize, attrsize,
435 nls_cp, false);
436 break;
437 }
438 }
439 blobptr += attrsize; /* advance attr value */
440 }
441
442 return 0;
443}
444
445static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
266 const struct nls_table *nls_cp) 446 const struct nls_table *nls_cp)
267{ 447{
268 int rc = 0; 448 int rc = 0;
269 int len; 449 int len;
270 char nt_hash[16]; 450 char nt_hash[CIFS_NTHASH_SIZE];
271 struct HMACMD5Context *pctxt;
272 wchar_t *user; 451 wchar_t *user;
273 wchar_t *domain; 452 wchar_t *domain;
453 wchar_t *server;
274 454
275 pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); 455 if (!ses->server->secmech.sdeschmacmd5) {
276 456 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
277 if (pctxt == NULL) 457 return -1;
278 return -ENOMEM; 458 }
279 459
280 /* calculate md4 hash of password */ 460 /* calculate md4 hash of password */
281 E_md4hash(ses->password, nt_hash); 461 E_md4hash(ses->password, nt_hash);
282 462
283 /* convert Domainname to unicode and uppercase */ 463 crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
284 hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); 464 CIFS_NTHASH_SIZE);
465
466 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
467 if (rc) {
468 cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
469 return rc;
470 }
285 471
286 /* convert ses->userName to unicode and uppercase */ 472 /* convert ses->userName to unicode and uppercase */
287 len = strlen(ses->userName); 473 len = strlen(ses->userName);
288 user = kmalloc(2 + (len * 2), GFP_KERNEL); 474 user = kmalloc(2 + (len * 2), GFP_KERNEL);
289 if (user == NULL) 475 if (user == NULL) {
476 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
477 rc = -ENOMEM;
290 goto calc_exit_2; 478 goto calc_exit_2;
479 }
291 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 480 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
292 UniStrupr(user); 481 UniStrupr(user);
293 hmac_md5_update((char *)user, 2*len, pctxt); 482
483 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
484 (char *)user, 2 * len);
294 485
295 /* convert ses->domainName to unicode and uppercase */ 486 /* convert ses->domainName to unicode and uppercase */
296 if (ses->domainName) { 487 if (ses->domainName) {
297 len = strlen(ses->domainName); 488 len = strlen(ses->domainName);
298 489
299 domain = kmalloc(2 + (len * 2), GFP_KERNEL); 490 domain = kmalloc(2 + (len * 2), GFP_KERNEL);
300 if (domain == NULL) 491 if (domain == NULL) {
492 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
493 rc = -ENOMEM;
301 goto calc_exit_1; 494 goto calc_exit_1;
495 }
302 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, 496 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
303 nls_cp); 497 nls_cp);
304 /* the following line was removed since it didn't work well 498 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
305 with lower cased domain name that passed as an option. 499 (char *)domain, 2 * len);
306 Maybe converting the domain name earlier makes sense */
307 /* UniStrupr(domain); */
308
309 hmac_md5_update((char *)domain, 2*len, pctxt);
310
311 kfree(domain); 500 kfree(domain);
501 } else if (ses->serverName) {
502 len = strlen(ses->serverName);
503
504 server = kmalloc(2 + (len * 2), GFP_KERNEL);
505 if (server == NULL) {
506 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
507 rc = -ENOMEM;
508 goto calc_exit_1;
509 }
510 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
511 nls_cp);
512 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
513 (char *)server, 2 * len);
514 kfree(server);
312 } 515 }
516
517 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
518 ntlmv2_hash);
519
313calc_exit_1: 520calc_exit_1:
314 kfree(user); 521 kfree(user);
315calc_exit_2: 522calc_exit_2:
316 /* BB FIXME what about bytes 24 through 40 of the signing key? 523 return rc;
317 compare with the NTLM example */ 524}
318 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 525
526static int
527CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
528{
529 int rc;
530 unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
531
532 if (!ses->server->secmech.sdeschmacmd5) {
533 cERROR(1, "CalcNTLMv2_response: can't generate ntlmv2 hash\n");
534 return -1;
535 }
536
537 crypto_shash_setkey(ses->server->secmech.hmacmd5,
538 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
539
540 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
541 if (rc) {
542 cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
543 return rc;
544 }
545
546 if (ses->server->secType == RawNTLMSSP)
547 memcpy(ses->auth_key.response + offset,
548 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
549 else
550 memcpy(ses->auth_key.response + offset,
551 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
552 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
553 ses->auth_key.response + offset, ses->auth_key.len - offset);
554
555 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
556 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
319 557
320 kfree(pctxt);
321 return rc; 558 return rc;
322} 559}
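In MS-NLMP terms, CalcNTLMv2_response() computes NTProofStr = HMAC-MD5(ntlmv2_hash, server_challenge || blob). The code stages the 8-byte challenge in the scratch space just ahead of the blob, hashes from there to the end of auth_key, and writes the 16-byte result back over the scratch. A sketch of the hashed region, assuming CIFS_SESS_KEY_SIZE = 16 and that struct ntlmv2_resp leads with the 16-byte NTProofStr field, as the offset arithmetic here implies:

/*
 *   auth_key.response:
 *
 *   0         16          24         32                        len
 *   +---------+-----------+----------+--------------------------+
 *   | session |  NTProofStr field    | blob: signature, time,   |
 *   | key     |           | 8-byte   | client challenge,        |
 *   | (later) |           | challenge| target info ...          |
 *   +---------+-----------+----------+--------------------------+
 *                          \___ HMAC-MD5(ntlmv2_hash, ...) ____/
 *
 *   The hash runs over [24, len): challenge || blob.  Its 16-byte
 *   result (NTProofStr) is then written at [16, 32), overwriting
 *   the staged challenge.
 */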
323 560
324void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, 561
325 const struct nls_table *nls_cp) 562int
563setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
326{ 564{
327 int rc; 565 int rc;
328 struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; 566 int baselen;
329 struct HMACMD5Context context; 567 unsigned int tilen;
568 struct ntlmv2_resp *buf;
569 char ntlmv2_hash[16];
570 unsigned char *tiblob = NULL; /* target info blob */
571
572 if (ses->server->secType == RawNTLMSSP) {
573 if (!ses->domainName) {
574 rc = find_domain_name(ses, nls_cp);
575 if (rc) {
576 cERROR(1, "error %d finding domain name", rc);
577 goto setup_ntlmv2_rsp_ret;
578 }
579 }
580 } else {
581 rc = build_avpair_blob(ses, nls_cp);
582 if (rc) {
583 cERROR(1, "error %d building av pair blob", rc);
584 goto setup_ntlmv2_rsp_ret;
585 }
586 }
587
588 baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
589 tilen = ses->auth_key.len;
590 tiblob = ses->auth_key.response;
330 591
592 ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
593 if (!ses->auth_key.response) {
594 rc = -ENOMEM;
595 ses->auth_key.len = 0;
596 cERROR(1, "%s: Can't allocate auth blob", __func__);
597 goto setup_ntlmv2_rsp_ret;
598 }
599 ses->auth_key.len += baselen;
600
601 buf = (struct ntlmv2_resp *)
602 (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
331 buf->blob_signature = cpu_to_le32(0x00000101); 603 buf->blob_signature = cpu_to_le32(0x00000101);
332 buf->reserved = 0; 604 buf->reserved = 0;
333 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 605 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
334 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 606 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
335 buf->reserved2 = 0; 607 buf->reserved2 = 0;
336 buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
337 buf->names[0].length = 0;
338 buf->names[1].type = 0;
339 buf->names[1].length = 0;
340 608
341 /* calculate buf->ntlmv2_hash */ 609 memcpy(ses->auth_key.response + baselen, tiblob, tilen);
342 rc = calc_ntlmv2_hash(ses, nls_cp); 610
343 if (rc) 611 /* calculate ntlmv2_hash */
612 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
613 if (rc) {
344 cERROR(1, "could not get v2 hash rc %d", rc); 614 cERROR(1, "could not get v2 hash rc %d", rc);
345 CalcNTLMv2_response(ses, resp_buf); 615 goto setup_ntlmv2_rsp_ret;
616 }
617
618 /* calculate first part of the client response (CR1) */
619 rc = CalcNTLMv2_response(ses, ntlmv2_hash);
620 if (rc) {
621 cERROR(1, "Could not calculate CR1 rc: %d", rc);
622 goto setup_ntlmv2_rsp_ret;
623 }
624
625 /* now calculate the session key for NTLMv2 */
626 crypto_shash_setkey(ses->server->secmech.hmacmd5,
627 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
628
629 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
630 if (rc) {
631 cERROR(1, "%s: Could not init hmacmd5\n", __func__);
632 goto setup_ntlmv2_rsp_ret;
633 }
634
635 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
636 ses->auth_key.response + CIFS_SESS_KEY_SIZE,
637 CIFS_HMAC_MD5_HASH_SIZE);
638
639 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
640 ses->auth_key.response);
641
642setup_ntlmv2_rsp_ret:
643 kfree(tiblob);
644
645 return rc;
646}
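Putting the pieces together, the final auth_key built by setup_ntlmv2_rsp() is the session key, then the ntlmv2_resp header carrying NTProofStr, then the target info blob, with the session key derived last. A sketch, under the same size assumptions as above:

/*
 *   0          16                     baselen                  len
 *   +----------+----------------------+-------------------------+
 *   | NTLMv2   | ntlmv2_resp:         | target info (tiblob):   |
 *   | session  | NTProofStr, blob     | av pairs built locally  |
 *   | key      | signature, time,     | or taken from the       |
 *   |          | client challenge     | server's type 2 packet  |
 *   +----------+----------------------+-------------------------+
 *
 *   NTProofStr  = HMAC-MD5(ntlmv2_hash, server challenge || blob)
 *   session key = HMAC-MD5(ntlmv2_hash, NTProofStr)
 */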
647
648int
649calc_seckey(struct cifsSesInfo *ses)
650{
651 int rc;
652 struct crypto_blkcipher *tfm_arc4;
653 struct scatterlist sgin, sgout;
654 struct blkcipher_desc desc;
655 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
656
657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
658
659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
660 if (IS_ERR(tfm_arc4)) {
661 rc = PTR_ERR(tfm_arc4);
662 cERROR(1, "could not allocate crypto API arc4\n");
663 return rc;
664 }
665
666 desc.tfm = tfm_arc4;
667
668 crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
669 CIFS_SESS_KEY_SIZE);
670
671 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
672 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
673
674 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
675 if (rc) {
676 cERROR(1, "could not encrypt session key rc: %d\n", rc);
677 crypto_free_blkcipher(tfm_arc4);
678 return rc;
679 }
680
681 /* make secondary_key/nonce as session key */
682 memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
683 /* and make len as that of session key only */
684 ses->auth_key.len = CIFS_SESS_KEY_SIZE;
685
686 crypto_free_blkcipher(tfm_arc4);
687
688 return 0;
689}
690
691void
692cifs_crypto_shash_release(struct TCP_Server_Info *server)
693{
694 if (server->secmech.md5)
695 crypto_free_shash(server->secmech.md5);
346 696
347 /* now calculate the MAC key for NTLMv2 */ 697 if (server->secmech.hmacmd5)
348 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 698 crypto_free_shash(server->secmech.hmacmd5);
349 hmac_md5_update(resp_buf, 16, &context);
350 hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
351 699
352 memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, 700 kfree(server->secmech.sdeschmacmd5);
353 sizeof(struct ntlmv2_resp)); 701
354 ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); 702 kfree(server->secmech.sdescmd5);
355} 703}
356 704
357void CalcNTLMv2_response(const struct cifsSesInfo *ses, 705int
358 char *v2_session_response) 706cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
359{ 707{
360 struct HMACMD5Context context; 708 int rc;
361 /* rest of v2 struct already generated */ 709 unsigned int size;
362 memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
363 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
364 710
365 hmac_md5_update(v2_session_response+8, 711 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
366 sizeof(struct ntlmv2_resp) - 8, &context); 712 if (IS_ERR(server->secmech.hmacmd5)) {
713 cERROR(1, "could not allocate crypto hmacmd5\n");
714 return PTR_ERR(server->secmech.hmacmd5);
715 }
716
717 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
718 if (IS_ERR(server->secmech.md5)) {
719 cERROR(1, "could not allocate crypto md5\n");
720 rc = PTR_ERR(server->secmech.md5);
721 goto crypto_allocate_md5_fail;
722 }
723
724 size = sizeof(struct shash_desc) +
725 crypto_shash_descsize(server->secmech.hmacmd5);
726 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
727 if (!server->secmech.sdeschmacmd5) {
728 cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
729 rc = -ENOMEM;
730 goto crypto_allocate_hmacmd5_sdesc_fail;
731 }
732 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
733 server->secmech.sdeschmacmd5->shash.flags = 0x0;
367 734
368 hmac_md5_final(v2_session_response, &context); 735
369/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ 736 size = sizeof(struct shash_desc) +
737 crypto_shash_descsize(server->secmech.md5);
738 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
739 if (!server->secmech.sdescmd5) {
740 cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
741 rc = -ENOMEM;
742 goto crypto_allocate_md5_sdesc_fail;
743 }
744 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
745 server->secmech.sdescmd5->shash.flags = 0x0;
746
747 return 0;
748
749crypto_allocate_md5_sdesc_fail:
750 kfree(server->secmech.sdeschmacmd5);
751
752crypto_allocate_hmacmd5_sdesc_fail:
753 crypto_free_shash(server->secmech.md5);
754
755crypto_allocate_md5_fail:
756 crypto_free_shash(server->secmech.hmacmd5);
757
758 return rc;
370} 759}
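The allocate routine repeats the same tfm-plus-descriptor pairing for each algorithm. A hypothetical helper making the pattern explicit (not part of this patch; later kernels added a similar cifs_alloc_hash()):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Illustrative only: allocate a shash tfm and its struct sdesc wrapper. */
static int alloc_shash_sdesc(const char *alg, struct crypto_shash **tfm,
			     struct sdesc **sdesc)
{
	unsigned int size;

	*tfm = crypto_alloc_shash(alg, 0, 0);
	if (IS_ERR(*tfm))
		return PTR_ERR(*tfm);

	size = sizeof(struct shash_desc) + crypto_shash_descsize(*tfm);
	*sdesc = kmalloc(size, GFP_KERNEL);
	if (!*sdesc) {
		crypto_free_shash(*tfm);
		*tfm = NULL;
		return -ENOMEM;
	}
	(*sdesc)->shash.tfm = *tfm;
	(*sdesc)->shash.flags = 0x0;
	return 0;
}

With such a helper, cifs_crypto_shash_allocate() would reduce to two calls plus a single unwind path.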
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec006474..000000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * fs/cifs/cifsencrypt.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * Externs for misc. small encryption routines
8 * so we do not have to put them in cifsproto.h
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published
12 * by the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
18 * the GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25/* md4.c */
26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
31
32
33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..f2970136d17d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h> 38#include <net/ipv6.h>
39#include "cifsfs.h" 39#include "cifsfs.h"
40#include "cifspdu.h" 40#include "cifspdu.h"
41#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -77,11 +77,33 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80 80unsigned short echo_retries = 5;
81module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
83 "reconnecting server. Default: 5. 0 means "
84 "never reconnect.");
81extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
84 88
89void
90cifs_sb_active(struct super_block *sb)
91{
92 struct cifs_sb_info *server = CIFS_SB(sb);
93
94 if (atomic_inc_return(&server->active) == 1)
95 atomic_inc(&sb->s_active);
96}
97
98void
99cifs_sb_deactive(struct super_block *sb)
100{
101 struct cifs_sb_info *server = CIFS_SB(sb);
102
103 if (atomic_dec_and_test(&server->active))
104 deactivate_super(sb);
105}
106
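cifs_sb_active()/cifs_sb_deactive() let deferred work pin the superblock so it cannot be torn down underneath an outstanding work item. A sketch of the intended pairing (the work item shown is hypothetical):

/* Illustrative only: keep the sb mounted while work on an inode is queued. */
static void cifs_defer_inode_work(struct inode *inode,
				  struct work_struct *work)
{
	cifs_sb_active(inode->i_sb);	/* first user takes an s_active ref */
	queue_work(system_wq, work);	/* handler must end by calling      */
	/* cifs_sb_deactive(inode->i_sb) once the inode is no longer used   */
}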
85static int 107static int
86cifs_read_super(struct super_block *sb, void *data, 108cifs_read_super(struct super_block *sb, void *data,
87 const char *devname, int silent) 109 const char *devname, int silent)
@@ -97,6 +119,9 @@ cifs_read_super(struct super_block *sb, void *data,
97 if (cifs_sb == NULL) 119 if (cifs_sb == NULL)
98 return -ENOMEM; 120 return -ENOMEM;
99 121
122 spin_lock_init(&cifs_sb->tlink_tree_lock);
123 cifs_sb->tlink_tree = RB_ROOT;
124
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 125 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) { 126 if (rc) {
102 kfree(cifs_sb); 127 kfree(cifs_sb);
@@ -136,9 +161,6 @@ cifs_read_super(struct super_block *sb, void *data,
136 sb->s_magic = CIFS_MAGIC_NUMBER; 161 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 162 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi; 163 sb->s_bdi = &cifs_sb->bdi;
139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
140 sb->s_blocksize =
141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
142 sb->s_blocksize = CIFS_MAX_MSGSIZE; 164 sb->s_blocksize = CIFS_MAX_MSGSIZE;
143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 165 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
144 inode = cifs_root_iget(sb, ROOT_I); 166 inode = cifs_root_iget(sb, ROOT_I);
@@ -156,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
156 goto out_no_root; 178 goto out_no_root;
157 } 179 }
158 180
181 /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
182 if (cifs_sb_master_tcon(cifs_sb)->nocase)
183 sb->s_d_op = &cifs_ci_dentry_ops;
184 else
185 sb->s_d_op = &cifs_dentry_ops;
186
159#ifdef CONFIG_CIFS_EXPERIMENTAL 187#ifdef CONFIG_CIFS_EXPERIMENTAL
160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 188 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
161 cFYI(1, "export ops supported"); 189 cFYI(1, "export ops supported");
@@ -200,8 +228,6 @@ cifs_put_super(struct super_block *sb)
200 return; 228 return;
201 } 229 }
202 230
203 lock_kernel();
204
205 rc = cifs_umount(sb, cifs_sb); 231 rc = cifs_umount(sb, cifs_sb);
206 if (rc) 232 if (rc)
207 cERROR(1, "cifs_umount failed with return code %d", rc); 233 cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +241,6 @@ cifs_put_super(struct super_block *sb)
215 unload_nls(cifs_sb->local_nls); 241 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi); 242 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 243 kfree(cifs_sb);
218
219 unlock_kernel();
220} 244}
221 245
222static int 246static int
@@ -224,7 +248,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
224{ 248{
225 struct super_block *sb = dentry->d_sb; 249 struct super_block *sb = dentry->d_sb;
226 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 250 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
227 struct cifsTconInfo *tcon = cifs_sb->tcon; 251 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
228 int rc = -EOPNOTSUPP; 252 int rc = -EOPNOTSUPP;
229 int xid; 253 int xid;
230 254
@@ -269,10 +293,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
269 return 0; 293 return 0;
270} 294}
271 295
272static int cifs_permission(struct inode *inode, int mask) 296static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
273{ 297{
274 struct cifs_sb_info *cifs_sb; 298 struct cifs_sb_info *cifs_sb;
275 299
300 if (flags & IPERM_FLAG_RCU)
301 return -ECHILD;
302
276 cifs_sb = CIFS_SB(inode->i_sb); 303 cifs_sb = CIFS_SB(inode->i_sb);
277 304
278 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { 305 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -284,7 +311,7 @@ static int cifs_permission(struct inode *inode, int mask)
284 on the client (above and beyond ACL on servers) for 311 on the client (above and beyond ACL on servers) for
285 servers which do not support setting and viewing mode bits, 312 servers which do not support setting and viewing mode bits,
286 so allowing client to check permissions is useful */ 313 so allowing client to check permissions is useful */
287 return generic_permission(inode, mask, NULL); 314 return generic_permission(inode, mask, flags, NULL);
288} 315}
289 316
290static struct kmem_cache *cifs_inode_cachep; 317static struct kmem_cache *cifs_inode_cachep;
@@ -304,16 +331,16 @@ cifs_alloc_inode(struct super_block *sb)
304 return NULL; 331 return NULL;
305 cifs_inode->cifsAttrs = 0x20; /* default */ 332 cifs_inode->cifsAttrs = 0x20; /* default */
306 cifs_inode->time = 0; 333 cifs_inode->time = 0;
307 cifs_inode->write_behind_rc = 0;
308 /* Until the file is open and we have gotten oplock 334 /* Until the file is open and we have gotten oplock
309 info back from the server, can not assume caching of 335 info back from the server, can not assume caching of
310 file data or metadata */ 336 file data or metadata */
311 cifs_inode->clientCanCacheRead = false; 337 cifs_set_oplock_level(cifs_inode, 0);
312 cifs_inode->clientCanCacheAll = false;
313 cifs_inode->delete_pending = false; 338 cifs_inode->delete_pending = false;
314 cifs_inode->invalid_mapping = false; 339 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 340 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 341 cifs_inode->server_eof = 0;
342 cifs_inode->uniqueid = 0;
343 cifs_inode->createtime = 0;
317 344
318 /* Can not set i_flags here - they get immediately overwritten 345 /* Can not set i_flags here - they get immediately overwritten
319 to zero by the VFS */ 346 to zero by the VFS */
@@ -322,10 +349,17 @@ cifs_alloc_inode(struct super_block *sb)
322 return &cifs_inode->vfs_inode; 349 return &cifs_inode->vfs_inode;
323} 350}
324 351
352static void cifs_i_callback(struct rcu_head *head)
353{
354 struct inode *inode = container_of(head, struct inode, i_rcu);
355 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
357}
358
325static void 359static void
326cifs_destroy_inode(struct inode *inode) 360cifs_destroy_inode(struct inode *inode)
327{ 361{
328 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 362 call_rcu(&inode->i_rcu, cifs_i_callback);
329} 363}
330 364
331static void 365static void
@@ -339,18 +373,19 @@ cifs_evict_inode(struct inode *inode)
339static void 373static void
340cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 374cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
341{ 375{
376 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
377 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
378
342 seq_printf(s, ",addr="); 379 seq_printf(s, ",addr=");
343 380
344 switch (server->addr.sockAddr.sin_family) { 381 switch (server->dstaddr.ss_family) {
345 case AF_INET: 382 case AF_INET:
346 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); 383 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
347 break; 384 break;
348 case AF_INET6: 385 case AF_INET6:
349 seq_printf(s, "%pI6", 386 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
350 &server->addr.sockAddr6.sin6_addr.s6_addr); 387 if (sa6->sin6_scope_id)
351 if (server->addr.sockAddr6.sin6_scope_id) 388 seq_printf(s, "%%%u", sa6->sin6_scope_id);
352 seq_printf(s, "%%%u",
353 server->addr.sockAddr6.sin6_scope_id);
354 break; 389 break;
355 default: 390 default:
356 seq_printf(s, "(unknown)"); 391 seq_printf(s, "(unknown)");
@@ -366,14 +401,36 @@ static int
366cifs_show_options(struct seq_file *s, struct vfsmount *m) 401cifs_show_options(struct seq_file *s, struct vfsmount *m)
367{ 402{
368 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); 403 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
369 struct cifsTconInfo *tcon = cifs_sb->tcon; 404 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
405 struct sockaddr *srcaddr;
406 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
370 407
371 seq_printf(s, ",unc=%s", tcon->treeName); 408 seq_printf(s, ",unc=%s", tcon->treeName);
372 if (tcon->ses->userName) 409
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
411 seq_printf(s, ",multiuser");
412 else if (tcon->ses->userName)
373 seq_printf(s, ",username=%s", tcon->ses->userName); 413 seq_printf(s, ",username=%s", tcon->ses->userName);
414
374 if (tcon->ses->domainName) 415 if (tcon->ses->domainName)
375 seq_printf(s, ",domain=%s", tcon->ses->domainName); 416 seq_printf(s, ",domain=%s", tcon->ses->domainName);
376 417
418 if (srcaddr->sa_family != AF_UNSPEC) {
419 struct sockaddr_in *saddr4;
420 struct sockaddr_in6 *saddr6;
421 saddr4 = (struct sockaddr_in *)srcaddr;
422 saddr6 = (struct sockaddr_in6 *)srcaddr;
423 if (srcaddr->sa_family == AF_INET6)
424 seq_printf(s, ",srcaddr=%pI6c",
425 &saddr6->sin6_addr);
426 else if (srcaddr->sa_family == AF_INET)
427 seq_printf(s, ",srcaddr=%pI4",
428 &saddr4->sin_addr.s_addr);
429 else
430 seq_printf(s, ",srcaddr=BAD-AF:%i",
431 (int)(srcaddr->sa_family));
432 }
433
377 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); 434 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
378 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 435 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
379 seq_printf(s, ",forceuid"); 436 seq_printf(s, ",forceuid");
@@ -422,9 +479,15 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
422 seq_printf(s, ",dynperm"); 479 seq_printf(s, ",dynperm");
423 if (m->mnt_sb->s_flags & MS_POSIXACL) 480 if (m->mnt_sb->s_flags & MS_POSIXACL)
424 seq_printf(s, ",acl"); 481 seq_printf(s, ",acl");
482 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
483 seq_printf(s, ",mfsymlinks");
484 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
485 seq_printf(s, ",fsc");
425 486
426 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 487 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
427 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 488 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
489 /* convert actimeo and display it in seconds */
490 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
428 491
429 return 0; 492 return 0;
430} 493}
@@ -437,20 +500,18 @@ static void cifs_umount_begin(struct super_block *sb)
437 if (cifs_sb == NULL) 500 if (cifs_sb == NULL)
438 return; 501 return;
439 502
440 tcon = cifs_sb->tcon; 503 tcon = cifs_sb_master_tcon(cifs_sb);
441 if (tcon == NULL)
442 return;
443 504
444 read_lock(&cifs_tcp_ses_lock); 505 spin_lock(&cifs_tcp_ses_lock);
445 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { 506 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
446 /* we have other mounts to same share or we have 507 /* we have other mounts to same share or we have
447 already tried to force umount this and woken up 508 already tried to force umount this and woken up
448 all waiting network requests, nothing to do */ 509 all waiting network requests, nothing to do */
449 read_unlock(&cifs_tcp_ses_lock); 510 spin_unlock(&cifs_tcp_ses_lock);
450 return; 511 return;
451 } else if (tcon->tc_count == 1) 512 } else if (tcon->tc_count == 1)
452 tcon->tidStatus = CifsExiting; 513 tcon->tidStatus = CifsExiting;
453 read_unlock(&cifs_tcp_ses_lock); 514 spin_unlock(&cifs_tcp_ses_lock);
454 515
455 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 516 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
456 /* cancel_notify_requests(tcon); */ 517 /* cancel_notify_requests(tcon); */
@@ -509,28 +570,29 @@ static const struct super_operations cifs_super_ops = {
509#endif 570#endif
510}; 571};
511 572
512static int 573static struct dentry *
513cifs_get_sb(struct file_system_type *fs_type, 574cifs_do_mount(struct file_system_type *fs_type,
514 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 575 int flags, const char *dev_name, void *data)
515{ 576{
516 int rc; 577 int rc;
517 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 578 struct super_block *sb;
579
580 sb = sget(fs_type, NULL, set_anon_super, NULL);
518 581
519 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 582 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
520 583
521 if (IS_ERR(sb)) 584 if (IS_ERR(sb))
522 return PTR_ERR(sb); 585 return ERR_CAST(sb);
523 586
524 sb->s_flags = flags; 587 sb->s_flags = flags;
525 588
526 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 589 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
527 if (rc) { 590 if (rc) {
528 deactivate_locked_super(sb); 591 deactivate_locked_super(sb);
529 return rc; 592 return ERR_PTR(rc);
530 } 593 }
531 sb->s_flags |= MS_ACTIVE; 594 sb->s_flags |= MS_ACTIVE;
532 simple_set_mnt(mnt, sb); 595 return dget(sb->s_root);
533 return 0;
534} 596}
535 597
536static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 598static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -538,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
538{ 600{
539 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
540 ssize_t written; 602 ssize_t written;
603 int rc;
541 604
542 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 605 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
543 if (!CIFS_I(inode)->clientCanCacheAll) 606
544 filemap_fdatawrite(inode->i_mapping); 607 if (CIFS_I(inode)->clientCanCacheAll)
608 return written;
609
610 rc = filemap_fdatawrite(inode->i_mapping);
611 if (rc)
612 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
613
545 return written; 614 return written;
546} 615}
547 616
@@ -565,9 +634,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
565 634
566static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 635static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
567{ 636{
568 /* note that this is called by vfs setlease with the BKL held 637 /* note that this is called by vfs setlease with lock_flocks held
569 although I doubt that BKL is needed here in cifs */ 638 to protect *lease from going away */
570 struct inode *inode = file->f_path.dentry->d_inode; 639 struct inode *inode = file->f_path.dentry->d_inode;
640 struct cifsFileInfo *cfile = file->private_data;
571 641
572 if (!(S_ISREG(inode->i_mode))) 642 if (!(S_ISREG(inode->i_mode)))
573 return -EINVAL; 643 return -EINVAL;
@@ -578,8 +648,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
578 ((arg == F_WRLCK) && 648 ((arg == F_WRLCK) &&
579 (CIFS_I(inode)->clientCanCacheAll))) 649 (CIFS_I(inode)->clientCanCacheAll)))
580 return generic_setlease(file, arg, lease); 650 return generic_setlease(file, arg, lease);
581 else if (CIFS_SB(inode->i_sb)->tcon->local_lease && 651 else if (tlink_tcon(cfile->tlink)->local_lease &&
582 !CIFS_I(inode)->clientCanCacheRead) 652 !CIFS_I(inode)->clientCanCacheRead)
583 /* If the server claims to support oplock on this 653 /* If the server claims to support oplock on this
584 file, then we still need to check oplock even 654 file, then we still need to check oplock even
585 if the local_lease mount option is set, but there 655 if the local_lease mount option is set, but there
@@ -595,7 +665,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
595struct file_system_type cifs_fs_type = { 665struct file_system_type cifs_fs_type = {
596 .owner = THIS_MODULE, 666 .owner = THIS_MODULE,
597 .name = "cifs", 667 .name = "cifs",
598 .get_sb = cifs_get_sb, 668 .mount = cifs_do_mount,
599 .kill_sb = kill_anon_super, 669 .kill_sb = kill_anon_super,
600 /* .fs_flags */ 670 /* .fs_flags */
601}; 671};
@@ -670,6 +740,25 @@ const struct file_operations cifs_file_ops = {
670 .setlease = cifs_setlease, 740 .setlease = cifs_setlease,
671}; 741};
672 742
743const struct file_operations cifs_file_strict_ops = {
744 .read = do_sync_read,
745 .write = do_sync_write,
746 .aio_read = cifs_strict_readv,
747 .aio_write = cifs_strict_writev,
748 .open = cifs_open,
749 .release = cifs_close,
750 .lock = cifs_lock,
751 .fsync = cifs_strict_fsync,
752 .flush = cifs_flush,
753 .mmap = cifs_file_strict_mmap,
754 .splice_read = generic_file_splice_read,
755 .llseek = cifs_llseek,
756#ifdef CONFIG_CIFS_POSIX
757 .unlocked_ioctl = cifs_ioctl,
758#endif /* CONFIG_CIFS_POSIX */
759 .setlease = cifs_setlease,
760};
761
673const struct file_operations cifs_file_direct_ops = { 762const struct file_operations cifs_file_direct_ops = {
674 /* no aio, no readv - 763 /* no aio, no readv -
675 BB reevaluate whether they can be done with directio, no cache */ 764 BB reevaluate whether they can be done with directio, no cache */
@@ -688,6 +777,7 @@ const struct file_operations cifs_file_direct_ops = {
688 .llseek = cifs_llseek, 777 .llseek = cifs_llseek,
689 .setlease = cifs_setlease, 778 .setlease = cifs_setlease,
690}; 779};
780
691const struct file_operations cifs_file_nobrl_ops = { 781const struct file_operations cifs_file_nobrl_ops = {
692 .read = do_sync_read, 782 .read = do_sync_read,
693 .write = do_sync_write, 783 .write = do_sync_write,
@@ -706,6 +796,24 @@ const struct file_operations cifs_file_nobrl_ops = {
706 .setlease = cifs_setlease, 796 .setlease = cifs_setlease,
707}; 797};
708 798
799const struct file_operations cifs_file_strict_nobrl_ops = {
800 .read = do_sync_read,
801 .write = do_sync_write,
802 .aio_read = cifs_strict_readv,
803 .aio_write = cifs_strict_writev,
804 .open = cifs_open,
805 .release = cifs_close,
806 .fsync = cifs_strict_fsync,
807 .flush = cifs_flush,
808 .mmap = cifs_file_strict_mmap,
809 .splice_read = generic_file_splice_read,
810 .llseek = cifs_llseek,
811#ifdef CONFIG_CIFS_POSIX
812 .unlocked_ioctl = cifs_ioctl,
813#endif /* CONFIG_CIFS_POSIX */
814 .setlease = cifs_setlease,
815};
816
709const struct file_operations cifs_file_direct_nobrl_ops = { 817const struct file_operations cifs_file_direct_nobrl_ops = {
710 /* no mmap, no aio, no readv - 818 /* no mmap, no aio, no readv -
711 BB reevaluate whether they can be done with directio, no cache */ 819 BB reevaluate whether they can be done with directio, no cache */
@@ -897,9 +1005,8 @@ init_cifs(void)
897 GlobalCurrentXid = 0; 1005 GlobalCurrentXid = 0;
898 GlobalTotalActiveXid = 0; 1006 GlobalTotalActiveXid = 0;
899 GlobalMaxActiveXid = 0; 1007 GlobalMaxActiveXid = 0;
900 memset(Local_System_Name, 0, 15); 1008 spin_lock_init(&cifs_tcp_ses_lock);
901 rwlock_init(&GlobalSMBSeslock); 1009 spin_lock_init(&cifs_file_list_lock);
902 rwlock_init(&cifs_tcp_ses_lock);
903 spin_lock_init(&GlobalMid_Lock); 1010 spin_lock_init(&GlobalMid_Lock);
904 1011
905 if (cifs_max_pending < 2) { 1012 if (cifs_max_pending < 2) {
@@ -912,11 +1019,11 @@ init_cifs(void)
912 1019
913 rc = cifs_fscache_register(); 1020 rc = cifs_fscache_register();
914 if (rc) 1021 if (rc)
915 goto out; 1022 goto out_clean_proc;
916 1023
917 rc = cifs_init_inodecache(); 1024 rc = cifs_init_inodecache();
918 if (rc) 1025 if (rc)
919 goto out_clean_proc; 1026 goto out_unreg_fscache;
920 1027
921 rc = cifs_init_mids(); 1028 rc = cifs_init_mids();
922 if (rc) 1029 if (rc)
@@ -938,19 +1045,19 @@ init_cifs(void)
938 return 0; 1045 return 0;
939 1046
940#ifdef CONFIG_CIFS_UPCALL 1047#ifdef CONFIG_CIFS_UPCALL
941 out_unregister_filesystem: 1048out_unregister_filesystem:
942 unregister_filesystem(&cifs_fs_type); 1049 unregister_filesystem(&cifs_fs_type);
943#endif 1050#endif
944 out_destroy_request_bufs: 1051out_destroy_request_bufs:
945 cifs_destroy_request_bufs(); 1052 cifs_destroy_request_bufs();
946 out_destroy_mids: 1053out_destroy_mids:
947 cifs_destroy_mids(); 1054 cifs_destroy_mids();
948 out_destroy_inodecache: 1055out_destroy_inodecache:
949 cifs_destroy_inodecache(); 1056 cifs_destroy_inodecache();
950 out_clean_proc: 1057out_unreg_fscache:
951 cifs_proc_clean();
952 cifs_fscache_unregister(); 1058 cifs_fscache_unregister();
953 out: 1059out_clean_proc:
1060 cifs_proc_clean();
954 return rc; 1061 return rc;
955} 1062}
956 1063
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d82f5fb4761e..4a3330235d55 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */ 44/* Functions related to super block operations */
45/* extern const struct super_operations cifs_super_ops;*/ 45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_read_inode(struct inode *); 46extern void cifs_sb_deactive(struct super_block *sb);
47/*extern void cifs_delete_inode(struct inode *);*/ /* BB not needed yet */
48/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
49 47
50/* Functions related to inodes */ 48/* Functions related to inodes */
51extern const struct inode_operations cifs_dir_inode_ops; 49extern const struct inode_operations cifs_dir_inode_ops;
@@ -63,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 61 struct dentry *);
64extern int cifs_revalidate_file(struct file *filp); 62extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *); 63extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode);
66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
67extern int cifs_setattr(struct dentry *, struct iattr *); 66extern int cifs_setattr(struct dentry *, struct iattr *);
68 67
@@ -74,19 +73,27 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
74/* Functions related to files and directories */ 73/* Functions related to files and directories */
75extern const struct file_operations cifs_file_ops; 74extern const struct file_operations cifs_file_ops;
76extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ 75extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
77extern const struct file_operations cifs_file_nobrl_ops; 76extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
78extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ 77extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
78extern const struct file_operations cifs_file_direct_nobrl_ops;
79extern const struct file_operations cifs_file_strict_nobrl_ops;
79extern int cifs_open(struct inode *inode, struct file *file); 80extern int cifs_open(struct inode *inode, struct file *file);
80extern int cifs_close(struct inode *inode, struct file *file); 81extern int cifs_close(struct inode *inode, struct file *file);
81extern int cifs_closedir(struct inode *inode, struct file *file); 82extern int cifs_closedir(struct inode *inode, struct file *file);
82extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 83extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
83 size_t read_size, loff_t *poffset); 84 size_t read_size, loff_t *poffset);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos);
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos);
86extern int cifs_lock(struct file *, int, struct file_lock *); 91extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, int); 92extern int cifs_fsync(struct file *, int);
93extern int cifs_strict_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 94extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 95extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
96extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 97extern const struct file_operations cifs_dir_ops;
91extern int cifs_dir_open(struct inode *inode, struct file *file); 98extern int cifs_dir_open(struct inode *inode, struct file *file);
92extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 99extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -95,6 +102,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
95extern const struct dentry_operations cifs_dentry_ops; 102extern const struct dentry_operations cifs_dentry_ops;
96extern const struct dentry_operations cifs_ci_dentry_ops; 103extern const struct dentry_operations cifs_ci_dentry_ops;
97 104
105#ifdef CONFIG_CIFS_DFS_UPCALL
106extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
107#else
108#define cifs_dfs_d_automount NULL
109#endif
110
98/* Functions related to symlinks */ 111/* Functions related to symlinks */
99extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); 112extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
100extern void cifs_put_link(struct dentry *direntry, 113extern void cifs_put_link(struct dentry *direntry,
@@ -104,7 +117,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
104extern int cifs_symlink(struct inode *inode, struct dentry *direntry, 117extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
105 const char *symname); 118 const char *symname);
106extern int cifs_removexattr(struct dentry *, const char *); 119extern int cifs_removexattr(struct dentry *, const char *);
107extern int cifs_setxattr(struct dentry *, const char *, const void *, 120extern int cifs_setxattr(struct dentry *, const char *, const void *,
108 size_t, int); 121 size_t, int);
109extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); 122extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
110extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 123extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
116 129
117#define CIFS_VERSION "1.65" 130#define CIFS_VERSION "1.70"
118#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..edd5b29b53c9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include "cifs_fs_sb.h" 26#include "cifs_fs_sb.h"
27#include "cifsacl.h" 27#include "cifsacl.h"
28#include <crypto/internal/hash.h>
29#include <linux/scatterlist.h>
30
28/* 31/*
29 * The sizes of various internal tables and strings 32 * The sizes of various internal tables and strings
30 */ 33 */
@@ -42,6 +45,16 @@
42#define CIFS_MIN_RCV_POOL 4 45#define CIFS_MIN_RCV_POOL 4
43 46
44/* 47/*
48 * default attribute cache timeout (jiffies)
49 */
50#define CIFS_DEF_ACTIMEO (1 * HZ)
51
52/*
53 * max attribute cache timeout (jiffies) - 2^30
54 */
55#define CIFS_MAX_ACTIMEO (1 << 30)
56
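These defines bound how long cached inode attributes may be trusted before revalidation. A minimal sketch of the jiffies-based check such a timeout implies, assuming a cifsInodeInfo-like structure whose `time` field records the jiffies of the last attribute update (the helper name and the actimeo parameter are illustrative; mount code would clamp actimeo between CIFS_DEF_ACTIMEO and CIFS_MAX_ACTIMEO):

    /* Sketch: true while cached attributes are still within the timeout. */
    static bool cifs_attrs_fresh(struct cifsInodeInfo *cinode,
                                 unsigned long actimeo)
    {
            /* time_before() copes with jiffies wraparound */
            return time_before(jiffies, cinode->time + actimeo);
    }
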
57/*
45 * MAX_REQ is the maximum number of requests that WE will send 58 * MAX_REQ is the maximum number of requests that WE will send
46 * on one socket concurrently. It also matches the most common 59 * on one socket concurrently. It also matches the most common
47 * value of max multiplex returned by servers. We may 60 * value of max multiplex returned by servers. We may
@@ -74,7 +87,7 @@
74 * CIFS vfs client Status information (based on what we know.) 87 * CIFS vfs client Status information (based on what we know.)
75 */ 88 */
76 89
77 /* associated with each tcp and smb session */ 90/* associated with each tcp and smb session */
78enum statusEnum { 91enum statusEnum {
79 CifsNew = 0, 92 CifsNew = 0,
80 CifsGood, 93 CifsGood,
@@ -97,16 +110,31 @@ enum protocolEnum {
97 /* Netbios frames protocol not supported at this time */ 110 /* Netbios frames protocol not supported at this time */
98}; 111};
99 112
100struct mac_key { 113struct session_key {
101 unsigned int len; 114 unsigned int len;
102 union { 115 char *response;
103 char ntlm[CIFS_SESS_KEY_SIZE + 16]; 116};
104 char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */ 117
105 struct { 118/* crypto security descriptor definition */
106 char key[16]; 119struct sdesc {
107 struct ntlmv2_resp resp; 120 struct shash_desc shash;
108 } ntlmv2; 121 char ctx[];
109 } data; 122};
123
124/* crypto hashing related structure/fields, not specific to a sec mech */
125struct cifs_secmech {
126 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
127 struct crypto_shash *md5; /* md5 hash function */
128 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
129 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
130};
131
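The sdesc layout above is the usual idiom for carrying a shash_desc and the transform's per-request context in a single allocation. A minimal sketch of how such a descriptor is typically set up (helper name illustrative, error paths trimmed):

    /* Sketch: size the allocation from the tfm's descsize so that
     * ctx[] can hold the request context. */
    static struct sdesc *cifs_alloc_sdesc(struct crypto_shash *tfm)
    {
            struct sdesc *sdesc;

            sdesc = kmalloc(sizeof(*sdesc) + crypto_shash_descsize(tfm),
                            GFP_KERNEL);
            if (!sdesc)
                    return NULL;
            sdesc->shash.tfm = tfm;
            sdesc->shash.flags = 0x0;
            return sdesc;
    }
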
132/* per smb session structure/fields */
133struct ntlmssp_auth {
134 __u32 client_flags; /* sent by client in type 1 ntlmssp exchange */
135 __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
136 unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
137 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
110}; 138};
111 139
112struct cifs_cred { 140struct cifs_cred {
@@ -133,34 +161,27 @@ struct TCP_Server_Info {
133 int srv_count; /* reference counter */ 161 int srv_count; /* reference counter */
134 /* 15 character server name + 0x20 16th byte indicating type = srv */ 162 /* 15 character server name + 0x20 16th byte indicating type = srv */
135 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
164 enum statusEnum tcpStatus; /* what we think the status is */
136 char *hostname; /* hostname portion of UNC string */ 165 char *hostname; /* hostname portion of UNC string */
137 struct socket *ssocket; 166 struct socket *ssocket;
138 union { 167 struct sockaddr_storage dstaddr;
139 struct sockaddr_in sockAddr; 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
140 struct sockaddr_in6 sockAddr6; 169#ifdef CONFIG_NET_NS
141 } addr; 170 struct net *net;
171#endif
142 wait_queue_head_t response_q; 172 wait_queue_head_t response_q;
143 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 173 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
144 struct list_head pending_mid_q; 174 struct list_head pending_mid_q;
145 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
146 unsigned short server_codepage; /* codepage for the server */
147 enum protocolEnum protocolType;
148 char versionMajor;
149 char versionMinor;
150 bool svlocal:1; /* local server or remote */
151 bool noblocksnd; /* use blocking sendmsg */ 175 bool noblocksnd; /* use blocking sendmsg */
152 bool noautotune; /* do not autotune send buf sizes */ 176 bool noautotune; /* do not autotune send buf sizes */
153 bool tcp_nodelay; 177 bool tcp_nodelay;
154 atomic_t inFlight; /* number of requests on the wire to server */ 178 atomic_t inFlight; /* number of requests on the wire to server */
155#ifdef CONFIG_CIFS_STATS2
156 atomic_t inSend; /* requests trying to send */
157 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
158#endif
159 enum statusEnum tcpStatus; /* what we think the status is */
160 struct mutex srv_mutex; 179 struct mutex srv_mutex;
161 struct task_struct *tsk; 180 struct task_struct *tsk;
162 char server_GUID[16]; 181 char server_GUID[16];
163 char secMode; 182 char secMode;
183 bool session_estab; /* mark when very first sess is established */
184 u16 dialect; /* dialect index that server chose */
164 enum securityEnum secType; 185 enum securityEnum secType;
165 unsigned int maxReq; /* Clients should submit no more */ 186 unsigned int maxReq; /* Clients should submit no more */
166 /* than maxReq distinct unanswered SMBs to the server when using */ 187 /* than maxReq distinct unanswered SMBs to the server when using */
@@ -173,30 +194,62 @@ struct TCP_Server_Info {
173 unsigned int max_vcs; /* maximum number of smb sessions, at least 194 unsigned int max_vcs; /* maximum number of smb sessions, at least
174 those that can be specified uniquely with 195 those that can be specified uniquely with
175 vcnumbers */ 196 vcnumbers */
176 char sessid[4]; /* unique token id for this session */
177 /* (returned on Negotiate */
178 int capabilities; /* allow selective disabling of caps by smb sess */ 197 int capabilities; /* allow selective disabling of caps by smb sess */
179 int timeAdj; /* Adjust for difference in server time zone in sec */ 198 int timeAdj; /* Adjust for difference in server time zone in sec */
180 __u16 CurrentMid; /* multiplex id - rotating counter */ 199 __u16 CurrentMid; /* multiplex id - rotating counter */
181 char cryptKey[CIFS_CRYPTO_KEY_SIZE]; 200 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
182 /* 16th byte of RFC1001 workstation name is always null */ 201 /* 16th byte of RFC1001 workstation name is always null */
183 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 202 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
184 __u32 sequence_number; /* needed for CIFS PDU signature */ 203 __u32 sequence_number; /* for signing, protected by srv_mutex */
185 struct mac_key mac_signing_key; 204 struct session_key session_key;
186 char ntlmv2_hash[16];
187 unsigned long lstrp; /* when we got last response from this server */ 205 unsigned long lstrp; /* when we got last response from this server */
188 u16 dialect; /* dialect index that server chose */ 206 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
189 /* extended security flavors that server supports */ 207 /* extended security flavors that server supports */
208 bool sec_ntlmssp; /* supports NTLMSSP */
209 bool sec_kerberosu2u; /* supports U2U Kerberos */
190 bool sec_kerberos; /* supports plain Kerberos */ 210 bool sec_kerberos; /* supports plain Kerberos */
191 bool sec_mskerberos; /* supports legacy MS Kerberos */ 211 bool sec_mskerberos; /* supports legacy MS Kerberos */
192 bool sec_kerberosu2u; /* supports U2U Kerberos */ 212 struct delayed_work echo; /* echo ping workqueue job */
193 bool sec_ntlmssp; /* supports NTLMSSP */
194#ifdef CONFIG_CIFS_FSCACHE 213#ifdef CONFIG_CIFS_FSCACHE
195 struct fscache_cookie *fscache; /* client index cache cookie */ 214 struct fscache_cookie *fscache; /* client index cache cookie */
196#endif 215#endif
216#ifdef CONFIG_CIFS_STATS2
217 atomic_t inSend; /* requests trying to send */
218 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
219#endif
197}; 220};
198 221
199/* 222/*
223 * Macros to allow the TCP_Server_Info->net field and related code to drop out
224 * when CONFIG_NET_NS isn't set.
225 */
226
227#ifdef CONFIG_NET_NS
228
229static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
230{
231 return srv->net;
232}
233
234static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
235{
236 srv->net = net;
237}
238
239#else
240
241static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
242{
243 return &init_net;
244}
245
246static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
247{
248}
249
250#endif
251
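These accessors let the rest of the client reference the owning namespace unconditionally; without CONFIG_NET_NS they collapse to &init_net and a no-op store. An illustrative use in a socket-creation path (the call site shown is a sketch, not the actual connect code):

    /* Sketch: create the transport socket in the server's namespace. */
    rc = __sock_create(cifs_net_ns(server), PF_INET, SOCK_STREAM,
                       IPPROTO_TCP, &server->ssocket, 1);
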
252/*
200 * Session structure. One of these for each uid session with a particular host 253 * Session structure. One of these for each uid session with a particular host
201 */ 254 */
202struct cifsSesInfo { 255struct cifsSesInfo {
@@ -222,6 +275,8 @@ struct cifsSesInfo {
222 char userName[MAX_USERNAME_SIZE + 1]; 275 char userName[MAX_USERNAME_SIZE + 1];
223 char *domainName; 276 char *domainName;
224 char *password; 277 char *password;
278 struct session_key auth_key;
279 struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
225 bool need_reconnect:1; /* connection reset, uid now invalid */ 280 bool need_reconnect:1; /* connection reset, uid now invalid */
226}; 281};
227/* no more than one of the following three session flags may be set */ 282/* no more than one of the following three session flags may be set */
@@ -308,6 +363,45 @@ struct cifsTconInfo {
308}; 363};
309 364
310/* 365/*
366 * This is a refcounted and timestamped container for a tcon pointer. The
367 * container holds a tcon reference. It is considered safe to free one of
368 * these when the tl_count goes to 0. The tl_time is the time of the last
369 * "get" on the container.
370 */
371struct tcon_link {
372 struct rb_node tl_rbnode;
373 uid_t tl_uid;
374 unsigned long tl_flags;
375#define TCON_LINK_MASTER 0
376#define TCON_LINK_PENDING 1
377#define TCON_LINK_IN_TREE 2
378 unsigned long tl_time;
379 atomic_t tl_count;
380 struct cifsTconInfo *tl_tcon;
381};
382
383extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
384
385static inline struct cifsTconInfo *
386tlink_tcon(struct tcon_link *tlink)
387{
388 return tlink->tl_tcon;
389}
390
391extern void cifs_put_tlink(struct tcon_link *tlink);
392
393static inline struct tcon_link *
394cifs_get_tlink(struct tcon_link *tlink)
395{
396 if (tlink && !IS_ERR(tlink))
397 atomic_inc(&tlink->tl_count);
398 return tlink;
399}
400
401/* This function is always expected to succeed */
402extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
403
404/*
311 * This info hangs off the cifsFileInfo structure, pointed to by llist. 405 * This info hangs off the cifsFileInfo structure, pointed to by llist.
312 * This is used to track byte stream locks on the file 406 * This is used to track byte stream locks on the file
313 */ 407 */
@@ -345,34 +439,29 @@ struct cifsFileInfo {
345 __u16 netfid; /* file id from remote */ 439 __u16 netfid; /* file id from remote */
346 /* BB add lock scope info here if needed */ ; 440 /* BB add lock scope info here if needed */ ;
347 /* lock scope id (0 if none) */ 441 /* lock scope id (0 if none) */
348 struct file *pfile; /* needed for writepage */ 442 struct dentry *dentry;
349 struct inode *pInode; /* needed for oplock break */ 443 unsigned int f_flags;
350 struct vfsmount *mnt; 444 struct tcon_link *tlink;
351 struct mutex lock_mutex; 445 struct mutex lock_mutex;
352 struct list_head llist; /* list of byte range locks we have. */ 446 struct list_head llist; /* list of byte range locks we have. */
353 bool closePend:1; /* file is marked to close */
354 bool invalidHandle:1; /* file closed via session abend */ 447 bool invalidHandle:1; /* file closed via session abend */
355 bool oplock_break_cancelled:1; 448 bool oplock_break_cancelled:1;
356 atomic_t count; /* reference count */ 449 int count; /* refcount protected by cifs_file_list_lock */
357 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 450 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
358 struct cifs_search_info srch_inf; 451 struct cifs_search_info srch_inf;
359 struct work_struct oplock_break; /* work for oplock breaks */ 452 struct work_struct oplock_break; /* work for oplock breaks */
360}; 453};
361 454
362/* Take a reference on the file private data */ 455/*
456 * Take a reference on the file private data. Must be called with
457 * cifs_file_list_lock held.
458 */
363static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) 459static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
364{ 460{
365 atomic_inc(&cifs_file->count); 461 ++cifs_file->count;
366} 462}
367 463
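With the refcount now a plain int, the get side must run under cifs_file_list_lock (declared further down in this header), while the put side, which may need to issue a close on the wire, is out-of-lined. A sketch of the resulting calling convention:

    /* Sketch: take a reference under the list lock ... */
    spin_lock(&cifs_file_list_lock);
    cifsFileInfo_get(open_file);
    spin_unlock(&cifs_file_list_lock);

    /* ... and drop it later from sleepable context. */
    cifsFileInfo_put(open_file);
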
368/* Release a reference on the file private data */ 464void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
369static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
370{
371 if (atomic_dec_and_test(&cifs_file->count)) {
372 iput(cifs_file->pInode);
373 kfree(cifs_file);
374 }
375}
376 465
377/* 466/*
378 * One of these for each file inode 467 * One of these for each file inode
@@ -382,15 +471,15 @@ struct cifsInodeInfo {
382 struct list_head lockList; 471 struct list_head lockList;
383 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 472 /* BB add in lists for dirty pages i.e. write caching info for oplock */
384 struct list_head openFileList; 473 struct list_head openFileList;
385 int write_behind_rc;
386 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 474 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
387 unsigned long time; /* jiffies of last update/check of inode */ 475 bool clientCanCacheRead; /* read oplock */
388 bool clientCanCacheRead:1; /* read oplock */ 476 bool clientCanCacheAll; /* read and writebehind oplock */
389 bool clientCanCacheAll:1; /* read and writebehind oplock */ 477 bool delete_pending; /* DELETE_ON_CLOSE is set */
390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 478 bool invalid_mapping; /* pagecache is invalid */
391 bool invalid_mapping:1; /* pagecache is invalid */ 479 unsigned long time; /* jiffies of last update of inode */
392 u64 server_eof; /* current file size on server */ 480 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 481 u64 uniqueid; /* server inode number */
482 u64 createtime; /* creation time on server */
394#ifdef CONFIG_CIFS_FSCACHE 483#ifdef CONFIG_CIFS_FSCACHE
395 struct fscache_cookie *fscache; 484 struct fscache_cookie *fscache;
396#endif 485#endif
@@ -445,6 +534,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
445 534
446#endif 535#endif
447 536
537struct mid_q_entry;
538
539/*
540 * This is the prototype for the mid callback function. When creating one,
541 * take special care to avoid deadlocks. Things to bear in mind:
542 *
543 * - it will be called by cifsd
544 * - the GlobalMid_Lock will be held
545 * - the mid will be removed from the pending_mid_q list
546 */
547typedef void (mid_callback_t)(struct mid_q_entry *mid);
548
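Given those constraints — atomic context, GlobalMid_Lock held, mid already unlinked — a callback should limit itself to non-sleeping bookkeeping. A hypothetical minimal callback in the spirit of the echo callback later in this diff (the name and the use of callback_data are illustrative):

    /* Sketch: wake the task stashed in callback_data; no sleeping,
     * since we run in cifsd with GlobalMid_Lock held. */
    static void cifs_wake_callback(struct mid_q_entry *mid)
    {
            struct task_struct *tsk = mid->callback_data;

            wake_up_process(tsk);
    }
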
448/* one of these for every pending CIFS request to the server */ 549/* one of these for every pending CIFS request to the server */
449struct mid_q_entry { 550struct mid_q_entry {
450 struct list_head qhead; /* mids waiting on reply from this server */ 551 struct list_head qhead; /* mids waiting on reply from this server */
@@ -456,7 +557,8 @@ struct mid_q_entry {
456 unsigned long when_sent; /* time when smb send finished */ 557 unsigned long when_sent; /* time when smb send finished */
457 unsigned long when_received; /* when demux complete (taken off wire) */ 558 unsigned long when_received; /* when demux complete (taken off wire) */
458#endif 559#endif
459 struct task_struct *tsk; /* task waiting for response */ 560 mid_callback_t *callback; /* call completion callback */
561 void *callback_data; /* general purpose pointer for callback */
460 struct smb_hdr *resp_buf; /* response buffer */ 562 struct smb_hdr *resp_buf; /* response buffer */
461 int midState; /* wish this were enum but can not pass to wait_event */ 563 int midState; /* wish this were enum but can not pass to wait_event */
462 __u8 command; /* smb command code */ 564 __u8 command; /* smb command code */
@@ -474,16 +576,16 @@ struct oplock_q_entry {
474 576
475/* for pending dnotify requests */ 577/* for pending dnotify requests */
476struct dir_notify_req { 578struct dir_notify_req {
477 struct list_head lhead; 579 struct list_head lhead;
478 __le16 Pid; 580 __le16 Pid;
479 __le16 PidHigh; 581 __le16 PidHigh;
480 __u16 Mid; 582 __u16 Mid;
481 __u16 Tid; 583 __u16 Tid;
482 __u16 Uid; 584 __u16 Uid;
483 __u16 netfid; 585 __u16 netfid;
484 __u32 filter; /* CompletionFilter (for multishot) */ 586 __u32 filter; /* CompletionFilter (for multishot) */
485 int multishot; 587 int multishot;
486 struct file *pfile; 588 struct file *pfile;
487}; 589};
488 590
489struct dfs_info3_param { 591struct dfs_info3_param {
@@ -511,6 +613,7 @@ struct cifs_fattr {
511 u64 cf_uniqueid; 613 u64 cf_uniqueid;
512 u64 cf_eof; 614 u64 cf_eof;
513 u64 cf_bytes; 615 u64 cf_bytes;
616 u64 cf_createtime;
514 uid_t cf_uid; 617 uid_t cf_uid;
515 gid_t cf_gid; 618 gid_t cf_gid;
516 umode_t cf_mode; 619 umode_t cf_mode;
@@ -558,12 +661,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
558#define CIFS_IOVEC 4 /* array of response buffers */ 661#define CIFS_IOVEC 4 /* array of response buffers */
559 662
560/* Type of Request to SendReceive2 */ 663/* Type of Request to SendReceive2 */
561#define CIFS_STD_OP 0 /* normal request timeout */ 664#define CIFS_BLOCKING_OP 1 /* operation can block */
562#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ 665#define CIFS_ASYNC_OP 2 /* do not wait for response */
563#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ 666#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
564#define CIFS_BLOCKING_OP 4 /* operation can block */
565#define CIFS_ASYNC_OP 8 /* do not wait for response */
566#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
567#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ 667#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
568#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ 668#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
569#define CIFS_NO_RESP 0x040 /* no response buffer required */ 669#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -633,7 +733,7 @@ require use of the stronger protocol */
633 * GlobalMid_Lock protects: 733 * GlobalMid_Lock protects:
634 * list operations on pending_mid_q and oplockQ 734 * list operations on pending_mid_q and oplockQ
635 * updates to XID counters, multiplex id and SMB sequence numbers 735 * updates to XID counters, multiplex id and SMB sequence numbers
636 * GlobalSMBSesLock protects: 736 * cifs_file_list_lock protects:
637 * list operations on tcp and SMB session lists and tCon lists 737 * list operations on tcp and SMB session lists and tCon lists
638 * f_owner.lock protects certain per file struct operations 738 * f_owner.lock protects certain per file struct operations
639 * mapping->page_lock protects certain per page operations 739 * mapping->page_lock protects certain per page operations
@@ -667,7 +767,7 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
667 * the reference counters for the server, smb session, and tcon. Finally, 767 * the reference counters for the server, smb session, and tcon. Finally,
668 * changes to the tcon->tidStatus should be done while holding this lock. 768 * changes to the tcon->tidStatus should be done while holding this lock.
669 */ 769 */
670GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; 770GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
671 771
672/* 772/*
673 * This lock protects the cifs_file->llist and cifs_file->flist 773 * This lock protects the cifs_file->llist and cifs_file->flist
@@ -676,7 +776,7 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
676 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then 776 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
677 * the cifs_tcp_ses_lock must be grabbed first and released last. 777 * the cifs_tcp_ses_lock must be grabbed first and released last.
678 */ 778 */
679GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 779GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
680 780
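The ordering rule above is worth spelling out. A sketch of the only legal nesting when both spinlocks are required:

    /* Sketch: cifs_tcp_ses_lock is always the outer lock. */
    spin_lock(&cifs_tcp_ses_lock);
    spin_lock(&cifs_file_list_lock);
    /* ... walk session, tcon and per-file lists ... */
    spin_unlock(&cifs_file_list_lock);
    spin_unlock(&cifs_tcp_ses_lock);
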
681/* Outstanding dir notify requests */ 781/* Outstanding dir notify requests */
682GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 782GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
@@ -691,8 +791,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
691GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ 791GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
692GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ 792GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
693 /* on midQ entries */ 793 /* on midQ entries */
694GLOBAL_EXTERN char Local_System_Name[15];
695
696/* 794/*
697 * Global counters, updated atomically 795 * Global counters, updated atomically
698 */ 796 */
@@ -728,6 +826,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
728GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 826GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
729GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 827GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
730 828
829/* reconnect after this many failed echo attempts */
830GLOBAL_EXTERN unsigned short echo_retries;
831
731void cifs_oplock_break(struct work_struct *work); 832void cifs_oplock_break(struct work_struct *work);
732void cifs_oplock_break_get(struct cifsFileInfo *cfile); 833void cifs_oplock_break_get(struct cifsFileInfo *cfile);
733void cifs_oplock_break_put(struct cifsFileInfo *cfile); 834void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..b5c8cc5d7a7f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/unaligned.h>
26#include "smbfsctl.h" 27#include "smbfsctl.h"
27 28
28#ifdef CONFIG_CIFS_WEAK_PW_HASH 29#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
50#define SMB_COM_SETATTR 0x09 /* trivial response */ 51#define SMB_COM_SETATTR 0x09 /* trivial response */
51#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ 52#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
52#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ 53#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
54#define SMB_COM_ECHO 0x2B /* echo request */
53#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ 55#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
54#define SMB_COM_READ_ANDX 0x2E 56#define SMB_COM_READ_ANDX 0x2E
55#define SMB_COM_WRITE_ANDX 0x2F 57#define SMB_COM_WRITE_ANDX 0x2F
@@ -131,9 +133,20 @@
131#define CIFS_CRYPTO_KEY_SIZE (8) 133#define CIFS_CRYPTO_KEY_SIZE (8)
132 134
133/* 135/*
136 * Size of the ntlm client response
137 */
138#define CIFS_AUTH_RESP_SIZE (24)
139
140/*
134 * Size of the session key (crypto key encrypted with the password) 141
135 */ 142 */
136#define CIFS_SESS_KEY_SIZE (24) 143#define CIFS_SESS_KEY_SIZE (16)
144
145#define CIFS_CLIENT_CHALLENGE_SIZE (8)
146#define CIFS_SERVER_CHALLENGE_SIZE (8)
147#define CIFS_HMAC_MD5_HASH_SIZE (16)
148#define CIFS_CPHTXT_SIZE (16)
149#define CIFS_NTHASH_SIZE (16)
137 150
138/* 151/*
139 * Maximum user name length 152 * Maximum user name length
@@ -414,11 +427,49 @@ struct smb_hdr {
414 __u16 Mid; 427 __u16 Mid;
415 __u8 WordCount; 428 __u8 WordCount;
416} __attribute__((packed)); 429} __attribute__((packed));
417/* given a pointer to an smb_hdr retrieve the value of byte count */ 430
418#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
419#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
433 (2 * (smb_var)->WordCount))
434
420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
421#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) 436#define pByteArea(smb_var) (BCC(smb_var) + 2)
437
438/* get the converted ByteCount for a SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for a SMB packet and return it */
448static inline __u16
449get_bcc_le(struct smb_hdr *hdr)
450{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452
453 return get_unaligned_le16(bc_ptr);
454}
455
456/* set the ByteCount for a SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for a SMB packet in little-endian */
466static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr)
468{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470
471 put_unaligned_le16(count, bc_ptr);
472}
422 473
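With BCC() now yielding a pointer rather than an lvalue, byte-count access goes through the unaligned helpers above: put_bcc_le() when building a request (as CIFSSMBEcho does later in this diff) and get_bcc() once the demux thread has converted the response. A sketch of the read side (helper name illustrative):

    /* Sketch: reject a converted response that carries too few bytes. */
    static int check_rsp_bcc(struct smb_hdr *hdr, __u16 expected)
    {
            return get_bcc(hdr) >= expected ? 0 : -EIO;
    }
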
423/* 474/*
424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 475 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -663,7 +714,6 @@ struct ntlmv2_resp {
663 __le64 time; 714 __le64 time;
664 __u64 client_chal; /* random */ 715 __u64 client_chal; /* random */
665 __u32 reserved2; 716 __u32 reserved2;
666 struct ntlmssp2_name names[2];
667 /* array of name entries could follow ending in minimum 4 byte struct */ 717 /* array of name entries could follow ending in minimum 4 byte struct */
668} __attribute__((packed)); 718} __attribute__((packed));
669 719
@@ -750,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
750 * 800 *
751 */ 801 */
752 802
803typedef struct smb_com_echo_req {
804 struct smb_hdr hdr;
805 __le16 EchoCount;
806 __le16 ByteCount;
807 char Data[1];
808} __attribute__((packed)) ECHO_REQ;
809
810typedef struct smb_com_echo_rsp {
811 struct smb_hdr hdr;
812 __le16 SequenceNumber;
813 __le16 ByteCount;
814 char Data[1];
815} __attribute__((packed)) ECHO_RSP;
816
753typedef struct smb_com_logoff_andx_req { 817typedef struct smb_com_logoff_andx_req {
754 struct smb_hdr hdr; /* wct = 2 */ 818 struct smb_hdr hdr; /* wct = 2 */
755 __u8 AndXCommand; 819 __u8 AndXCommand;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1d60c655e3e0..8096f27ad9a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,12 +54,19 @@ do { \
54 __func__, curr_xid, (int)rc); \ 54 __func__, curr_xid, (int)rc); \
55} while (0) 55} while (0)
56extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
58 struct cifsTconInfo *tcon);
58extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 59extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
59extern char *cifs_compose_mount_options(const char *sb_mountdata, 60extern char *cifs_compose_mount_options(const char *sb_mountdata,
60 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
61 char **devname); 62 char **devname);
62/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 63/* extern void renew_parental_timestamps(struct dentry *direntry);*/
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server,
68 struct smb_hdr *in_buf, mid_callback_t *callback,
69 void *cbdata);
63extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
64 struct smb_hdr * /* input */ , 71 struct smb_hdr * /* input */ ,
65 struct smb_hdr * /* out */ , 72 struct smb_hdr * /* out */ ,
@@ -78,10 +85,10 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
78extern bool is_valid_oplock_break(struct smb_hdr *smb, 85extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 86 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); 88extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
82#ifdef CONFIG_CIFS_EXPERIMENTAL 89 unsigned int bytes_written);
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *); 90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
84#endif 91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 92extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 93extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 94extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,13 +111,14 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
104extern u64 cifs_UnixTimeToNT(struct timespec); 111extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 112extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 113 int offset);
114extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
107 115
108extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, 116extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 __u16 fileHandle, struct file *file, 117 struct file *file, struct tcon_link *tlink,
110 struct vfsmount *mnt, unsigned int oflags); 118 __u32 oplock);
111extern int cifs_posix_open(char *full_path, struct inode **pinode, 119extern int cifs_posix_open(char *full_path, struct inode **pinode,
112 struct super_block *sb, 120 struct super_block *sb,
113 int mode, int oflags, 121 int mode, unsigned int f_flags,
114 __u32 *poplock, __u16 *pnetfid, int xid); 122 __u32 *poplock, __u16 *pnetfid, int xid);
115void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); 123void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
116extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 124extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -129,10 +137,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
129extern int cifs_get_inode_info_unix(struct inode **pinode, 137extern int cifs_get_inode_info_unix(struct inode **pinode,
130 const unsigned char *search_path, 138 const unsigned char *search_path,
131 struct super_block *sb, int xid); 139 struct super_block *sb, int xid);
132extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 140extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
133 struct cifs_fattr *fattr, struct inode *inode, 141 struct cifs_fattr *fattr, struct inode *inode,
134 const char *path, const __u16 *pfid); 142 const char *path, const __u16 *pfid);
135extern int mode_to_acl(struct inode *inode, const char *path, __u64); 143extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
144extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
145 const char *, u32 *);
136 146
137extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 147extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
138 const char *); 148 const char *);
@@ -345,12 +355,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
345 const __u16 netfid, const __u64 len, 355 const __u16 netfid, const __u64 len,
346 const __u64 offset, const __u32 numUnlock, 356 const __u64 offset, const __u32 numUnlock,
347 const __u32 numLock, const __u8 lockType, 357 const __u32 numLock, const __u8 lockType,
348 const bool waitFlag); 358 const bool waitFlag, const __u8 oplock_level);
349extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 359extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
350 const __u16 smb_file_id, const int get_flag, 360 const __u16 smb_file_id, const int get_flag,
351 const __u64 len, struct file_lock *, 361 const __u64 len, struct file_lock *,
352 const __u16 lock_type, const bool waitFlag); 362 const __u16 lock_type, const bool waitFlag);
353extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 363extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
364extern int CIFSSMBEcho(struct TCP_Server_Info *server);
354extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 365extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
355 366
356extern struct cifsSesInfo *sesInfoAlloc(void); 367extern struct cifsSesInfo *sesInfoAlloc(void);
@@ -362,13 +373,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
362extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 373extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
363 __u32 *); 374 __u32 *);
364extern int cifs_verify_signature(struct smb_hdr *, 375extern int cifs_verify_signature(struct smb_hdr *,
365 const struct mac_key *mac_key, 376 struct TCP_Server_Info *server,
366 __u32 expected_sequence_number); 377 __u32 expected_sequence_number);
367extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
368 const char *pass); 379extern int setup_ntlm_response(struct cifsSesInfo *);
369extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); 380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
370extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
371 const struct nls_table *); 382extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
383extern int calc_seckey(struct cifsSesInfo *);
384
372#ifdef CONFIG_CIFS_WEAK_PW_HASH 385#ifdef CONFIG_CIFS_WEAK_PW_HASH
373extern void calc_lanman_hash(const char *password, const char *cryptkey, 386extern void calc_lanman_hash(const char *password, const char *cryptkey,
374 bool encrypt, char *lnm_session_key); 387 bool encrypt, char *lnm_session_key);
@@ -408,4 +421,15 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
408extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 421extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
409 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 422 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
410extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); 423extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
424extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
425extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
426 const unsigned char *path,
427 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24);
411#endif /* _CIFSPROTO_H */ 435#endif /* _CIFSPROTO_H */
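The newly exported primitives form the classic NT response chain. A sketch of how they compose, with buffer sizes taken from the defines in cifspdu.h (casts shown because the session fields are plain char; the snippet is illustrative, not the actual session-setup code):

    /* Sketch: password-derived values for NTLM authentication. */
    unsigned char p16[CIFS_NTHASH_SIZE];     /* MD4 hash of the password */
    unsigned char p24[CIFS_AUTH_RESP_SIZE];  /* 24-byte challenge response */

    E_md4hash((unsigned char *)ses->password, p16);
    SMBNTencrypt((unsigned char *)ses->password,
                 (unsigned char *)ses->server->cryptkey, p24);
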
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7e83b356cc9e..904aa47e3515 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
91 struct list_head *tmp1; 91 struct list_head *tmp1;
92 92
93/* list all files open on tree connection and mark them invalid */ 93/* list all files open on tree connection and mark them invalid */
94 write_lock(&GlobalSMBSeslock); 94 spin_lock(&cifs_file_list_lock);
95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
96 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 96 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
97 open_file->invalidHandle = true; 97 open_file->invalidHandle = true;
98 open_file->oplock_break_cancelled = true; 98 open_file->oplock_break_cancelled = true;
99 } 99 }
100 write_unlock(&GlobalSMBSeslock); 100 spin_unlock(&cifs_file_list_lock);
101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
102 to this tcon */ 102 to this tcon */
103} 103}
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -331,37 +328,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
331 328
332static int validate_t2(struct smb_t2_rsp *pSMB) 329static int validate_t2(struct smb_t2_rsp *pSMB)
333{ 330{
334 int rc = -EINVAL; 331 unsigned int total_size;
335 int total_size; 332
336 char *pBCC; 333 /* check for plausible wct */
334 if (pSMB->hdr.WordCount < 10)
335 goto vt2_err;
337 336
338 /* check for plausible wct, bcc and t2 data and parm sizes */
339 /* check for parm and data offset going beyond end of smb */ 337 /* check for parm and data offset going beyond end of smb */
340 if (pSMB->hdr.WordCount >= 10) { 338 if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
341 if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && 339 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
342 (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { 340 goto vt2_err;
343 /* check that bcc is at least as big as parms + data */ 341
344 /* check that bcc is less than negotiated smb buffer */ 342 /* check that bcc is at least as big as parms + data */
345 total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); 343 /* check that bcc is less than negotiated smb buffer */
346 if (total_size < 512) { 344 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
347 total_size += 345 if (total_size >= 512)
348 le16_to_cpu(pSMB->t2_rsp.DataCount); 346 goto vt2_err;
349 /* BCC le converted in SendReceive */ 347
350 pBCC = (pSMB->hdr.WordCount * 2) + 348 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
351 sizeof(struct smb_hdr) + 349 if (total_size > get_bcc(&pSMB->hdr) ||
352 (char *)pSMB; 350 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
353 if ((total_size <= (*(u16 *)pBCC)) && 351 goto vt2_err;
354 (total_size < 352
355 CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { 353 return 0;
356 return 0; 354vt2_err:
357 }
358 }
359 }
360 }
361 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, 355 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
362 sizeof(struct smb_t2_rsp) + 16); 356 sizeof(struct smb_t2_rsp) + 16);
363 return rc; 357 return -EINVAL;
364} 358}
359
365int 360int
366CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 361CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
367{ 362{
@@ -401,15 +396,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 396 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
402 cFYI(1, "Kerberos only mechanism, enable extended security"); 397 cFYI(1, "Kerberos only mechanism, enable extended security");
403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 398 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
404 } 399 } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
405#ifdef CONFIG_CIFS_EXPERIMENTAL
406 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
407 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 400 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
408 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
409 cFYI(1, "NTLMSSP only mechanism, enable extended security"); 402 cFYI(1, "NTLMSSP only mechanism, enable extended security");
410 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
411 } 404 }
412#endif
413 405
414 count = 0; 406 count = 0;
415 for (i = 0; i < CIFS_NUM_PROT; i++) { 407 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -455,7 +447,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
455 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 447 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
456 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 448 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
457 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 449 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
458 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
459 /* even though we do not use raw we might as well set this 450 /* even though we do not use raw we might as well set this
460 accurately, in case we ever find a need for it */ 451 accurately, in case we ever find a need for it */
461 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 452 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -503,7 +494,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
503 494
504 if (rsp->EncryptionKeyLength == 495 if (rsp->EncryptionKeyLength ==
505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { 496 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
506 memcpy(server->cryptKey, rsp->EncryptionKey, 497 memcpy(ses->server->cryptkey, rsp->EncryptionKey,
507 CIFS_CRYPTO_KEY_SIZE); 498 CIFS_CRYPTO_KEY_SIZE);
508 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 499 } else if (server->secMode & SECMODE_PW_ENCRYPT) {
509 rc = -EIO; /* need cryptkey unless plain text */ 500 rc = -EIO; /* need cryptkey unless plain text */
@@ -569,12 +560,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
569 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 560 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
570 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 561 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
571 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 562 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
572 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
573 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 563 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 564 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
575 server->timeAdj *= 60; 565 server->timeAdj *= 60;
576 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 566 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
577 memcpy(server->cryptKey, pSMBr->u.EncryptionKey, 567 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
578 CIFS_CRYPTO_KEY_SIZE); 568 CIFS_CRYPTO_KEY_SIZE);
579 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) 569 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
580 && (pSMBr->EncryptionKeyLength == 0)) { 570 && (pSMBr->EncryptionKeyLength == 0)) {
@@ -593,9 +583,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
593 rc = -EIO; 583 rc = -EIO;
594 goto neg_err_exit; 584 goto neg_err_exit;
595 } 585 }
596 read_lock(&cifs_tcp_ses_lock); 586 spin_lock(&cifs_tcp_ses_lock);
597 if (server->srv_count > 1) { 587 if (server->srv_count > 1) {
598 read_unlock(&cifs_tcp_ses_lock); 588 spin_unlock(&cifs_tcp_ses_lock);
599 if (memcmp(server->server_GUID, 589 if (memcmp(server->server_GUID,
600 pSMBr->u.extended_response. 590 pSMBr->u.extended_response.
601 GUID, 16) != 0) { 591 GUID, 16) != 0) {
@@ -605,7 +595,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
605 16); 595 16);
606 } 596 }
607 } else { 597 } else {
608 read_unlock(&cifs_tcp_ses_lock); 598 spin_unlock(&cifs_tcp_ses_lock);
609 memcpy(server->server_GUID, 599 memcpy(server->server_GUID,
610 pSMBr->u.extended_response.GUID, 16); 600 pSMBr->u.extended_response.GUID, 16);
611 } 601 }
@@ -620,13 +610,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 rc = 0; 610 rc = 0;
621 else 611 else
622 rc = -EINVAL; 612 rc = -EINVAL;
623 613 if (server->secType == Kerberos) {
624 if (server->sec_kerberos || server->sec_mskerberos) 614 if (!server->sec_kerberos &&
625 server->secType = Kerberos; 615 !server->sec_mskerberos)
626 else if (server->sec_ntlmssp) 616 rc = -EOPNOTSUPP;
627 server->secType = RawNTLMSSP; 617 } else if (server->secType == RawNTLMSSP) {
628 else 618 if (!server->sec_ntlmssp)
629 rc = -EOPNOTSUPP; 619 rc = -EOPNOTSUPP;
620 } else
621 rc = -EOPNOTSUPP;
630 } 622 }
631 } else 623 } else
632 server->capabilities &= ~CAP_EXTENDED_SECURITY; 624 server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -707,6 +699,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
707 return rc; 699 return rc;
708} 700}
709 701
702/*
703 * This is a no-op for now. We're not really interested in the reply, but
704 * rather in the fact that the server sent one and that server->lstrp
705 * gets updated.
706 *
707 * FIXME: maybe we should consider checking that the reply matches the request?
708 */
709static void
710cifs_echo_callback(struct mid_q_entry *mid)
711{
712 struct TCP_Server_Info *server = mid->callback_data;
713
714 DeleteMidQEntry(mid);
715 atomic_dec(&server->inFlight);
716 wake_up(&server->request_q);
717}
718
719int
720CIFSSMBEcho(struct TCP_Server_Info *server)
721{
722 ECHO_REQ *smb;
723 int rc = 0;
724
725 cFYI(1, "In echo request");
726
727 rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
728 if (rc)
729 return rc;
730
731 /* set up echo request */
732 smb->hdr.Tid = cpu_to_le16(0xffff);
733 smb->hdr.WordCount = 1;
734 put_unaligned_le16(1, &smb->EchoCount);
735 put_bcc_le(1, &smb->hdr);
736 smb->Data[0] = 'a';
737 smb->hdr.smb_buf_length += 3;
738
739 rc = cifs_call_async(server, (struct smb_hdr *)smb,
740 cifs_echo_callback, server);
741 if (rc)
742 cFYI(1, "Echo request failed: %d", rc);
743
744 cifs_small_buf_release(smb);
745
746 return rc;
747}
748
710int 749int
711CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) 750CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
712{ 751{
@@ -1194,7 +1233,7 @@ OldOpenRetry:
1194 pSMB->ByteCount = cpu_to_le16(count); 1233 pSMB->ByteCount = cpu_to_le16(count);
1195 /* long_op set to 1 to allow for oplock break timeouts */ 1234 /* long_op set to 1 to allow for oplock break timeouts */
1196 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1235 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1197 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1236 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1198 cifs_stats_inc(&tcon->num_opens); 1237 cifs_stats_inc(&tcon->num_opens);
1199 if (rc) { 1238 if (rc) {
1200 cFYI(1, "Error in Open = %d", rc); 1239 cFYI(1, "Error in Open = %d", rc);
@@ -1307,7 +1346,7 @@ openRetry:
1307 pSMB->ByteCount = cpu_to_le16(count); 1346 pSMB->ByteCount = cpu_to_le16(count);
1308 /* long_op set to 1 to allow for oplock break timeouts */ 1347 /* long_op set to 1 to allow for oplock break timeouts */
1309 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1348 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1310 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1349 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1311 cifs_stats_inc(&tcon->num_opens); 1350 cifs_stats_inc(&tcon->num_opens);
1312 if (rc) { 1351 if (rc) {
1313 cFYI(1, "Error in Open = %d", rc); 1352 cFYI(1, "Error in Open = %d", rc);
@@ -1389,7 +1428,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1389 iov[0].iov_base = (char *)pSMB; 1428 iov[0].iov_base = (char *)pSMB;
1390 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 1429 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1391 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, 1430 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
1392 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); 1431 &resp_buf_type, CIFS_LOG_ERROR);
1393 cifs_stats_inc(&tcon->num_reads); 1432 cifs_stats_inc(&tcon->num_reads);
1394 pSMBr = (READ_RSP *)iov[0].iov_base; 1433 pSMBr = (READ_RSP *)iov[0].iov_base;
1395 if (rc) { 1434 if (rc) {
@@ -1664,7 +1703,8 @@ int
1664CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1703CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1665 const __u16 smb_file_id, const __u64 len, 1704 const __u16 smb_file_id, const __u64 len,
1666 const __u64 offset, const __u32 numUnlock, 1705 const __u64 offset, const __u32 numUnlock,
1667 const __u32 numLock, const __u8 lockType, const bool waitFlag) 1706 const __u32 numLock, const __u8 lockType,
1707 const bool waitFlag, const __u8 oplock_level)
1668{ 1708{
1669 int rc = 0; 1709 int rc = 0;
1670 LOCK_REQ *pSMB = NULL; 1710 LOCK_REQ *pSMB = NULL;
@@ -1692,6 +1732,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1692 pSMB->NumberOfLocks = cpu_to_le16(numLock); 1732 pSMB->NumberOfLocks = cpu_to_le16(numLock);
1693 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); 1733 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
1694 pSMB->LockType = lockType; 1734 pSMB->LockType = lockType;
1735 pSMB->OplockLevel = oplock_level;
1695 pSMB->AndXCommand = 0xFF; /* none */ 1736 pSMB->AndXCommand = 0xFF; /* none */
1696 pSMB->Fid = smb_file_id; /* netfid stays le */ 1737 pSMB->Fid = smb_file_id; /* netfid stays le */
1697 1738
@@ -2476,95 +2517,6 @@ querySymLinkRetry:
2476} 2517}
2477 2518
2478#ifdef CONFIG_CIFS_EXPERIMENTAL 2519#ifdef CONFIG_CIFS_EXPERIMENTAL
2479/* Initialize NT TRANSACT SMB into small smb request buffer.
2480 This assumes that all NT TRANSACTS that we init here have
2481 total parm and data under about 400 bytes (to fit in small cifs
2482 buffer size), which is the case so far, it easily fits. NB:
2483 Setup words themselves and ByteCount
2484 MaxSetupCount (size of returned setup area) and
2485 MaxParameterCount (returned parms size) must be set by caller */
2486static int
2487smb_init_nttransact(const __u16 sub_command, const int setup_count,
2488 const int parm_len, struct cifsTconInfo *tcon,
2489 void **ret_buf)
2490{
2491 int rc;
2492 __u32 temp_offset;
2493 struct smb_com_ntransact_req *pSMB;
2494
2495 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2496 (void **)&pSMB);
2497 if (rc)
2498 return rc;
2499 *ret_buf = (void *)pSMB;
2500 pSMB->Reserved = 0;
2501 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2502 pSMB->TotalDataCount = 0;
2503 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2504 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2505 pSMB->ParameterCount = pSMB->TotalParameterCount;
2506 pSMB->DataCount = pSMB->TotalDataCount;
2507 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
2508 (setup_count * 2) - 4 /* for rfc1001 length itself */;
2509 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
2510 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
2511 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
2512 pSMB->SubCommand = cpu_to_le16(sub_command);
2513 return 0;
2514}
2515
2516static int
2517validate_ntransact(char *buf, char **ppparm, char **ppdata,
2518 __u32 *pparmlen, __u32 *pdatalen)
2519{
2520 char *end_of_smb;
2521 __u32 data_count, data_offset, parm_count, parm_offset;
2522 struct smb_com_ntransact_rsp *pSMBr;
2523
2524 *pdatalen = 0;
2525 *pparmlen = 0;
2526
2527 if (buf == NULL)
2528 return -EINVAL;
2529
2530 pSMBr = (struct smb_com_ntransact_rsp *)buf;
2531
2532 /* ByteCount was converted from little endian in SendReceive */
2533 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
2534 (char *)&pSMBr->ByteCount;
2535
2536 data_offset = le32_to_cpu(pSMBr->DataOffset);
2537 data_count = le32_to_cpu(pSMBr->DataCount);
2538 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
2539 parm_count = le32_to_cpu(pSMBr->ParameterCount);
2540
2541 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
2542 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
2543
2544 /* should we also check that parm and data areas do not overlap? */
2545 if (*ppparm > end_of_smb) {
2546 cFYI(1, "parms start after end of smb");
2547 return -EINVAL;
2548 } else if (parm_count + *ppparm > end_of_smb) {
2549 cFYI(1, "parm end after end of smb");
2550 return -EINVAL;
2551 } else if (*ppdata > end_of_smb) {
2552 cFYI(1, "data starts after end of smb");
2553 return -EINVAL;
2554 } else if (data_count + *ppdata > end_of_smb) {
2555 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2556 *ppdata, data_count, (data_count + *ppdata),
2557 end_of_smb, pSMBr);
2558 return -EINVAL;
2559 } else if (parm_count + data_count > pSMBr->ByteCount) {
2560 cFYI(1, "parm count and data count larger than SMB");
2561 return -EINVAL;
2562 }
2563 *pdatalen = data_count;
2564 *pparmlen = parm_count;
2565 return 0;
2566}
2567
2568int 2520int
2569CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2521CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2570 const unsigned char *searchName, 2522 const unsigned char *searchName,
@@ -3054,7 +3006,97 @@ GetExtAttrOut:
3054 3006
3055#endif /* CONFIG_POSIX */ 3007#endif /* CONFIG_POSIX */
3056 3008
3057#ifdef CONFIG_CIFS_EXPERIMENTAL 3009#ifdef CONFIG_CIFS_ACL
3010/*
3011 * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
3012 * all NT TRANSACTS that we init here have total parm and data under about 400
3013 * bytes (to fit in the small cifs buffer size), which is the case so far; it
3014 * easily fits. NB: the setup words themselves, the ByteCount, MaxSetupCount
3015 * (size of the returned setup area) and MaxParameterCount (returned parms
3016 * size) must be set by the caller
3017 */
3018static int
3019smb_init_nttransact(const __u16 sub_command, const int setup_count,
3020 const int parm_len, struct cifsTconInfo *tcon,
3021 void **ret_buf)
3022{
3023 int rc;
3024 __u32 temp_offset;
3025 struct smb_com_ntransact_req *pSMB;
3026
3027 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
3028 (void **)&pSMB);
3029 if (rc)
3030 return rc;
3031 *ret_buf = (void *)pSMB;
3032 pSMB->Reserved = 0;
3033 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
3034 pSMB->TotalDataCount = 0;
3035 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
3036 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3037 pSMB->ParameterCount = pSMB->TotalParameterCount;
3038 pSMB->DataCount = pSMB->TotalDataCount;
3039 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
3040 (setup_count * 2) - 4 /* for rfc1001 length itself */;
3041 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
3042 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
3043 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
3044 pSMB->SubCommand = cpu_to_le16(sub_command);
3045 return 0;
3046}
3047
3048static int
3049validate_ntransact(char *buf, char **ppparm, char **ppdata,
3050 __u32 *pparmlen, __u32 *pdatalen)
3051{
3052 char *end_of_smb;
3053 __u32 data_count, data_offset, parm_count, parm_offset;
3054 struct smb_com_ntransact_rsp *pSMBr;
3055
3056 *pdatalen = 0;
3057 *pparmlen = 0;
3058
3059 if (buf == NULL)
3060 return -EINVAL;
3061
3062 pSMBr = (struct smb_com_ntransact_rsp *)buf;
3063
3064 /* ByteCount was converted from little endian in SendReceive */
3065 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
3066 (char *)&pSMBr->ByteCount;
3067
3068 data_offset = le32_to_cpu(pSMBr->DataOffset);
3069 data_count = le32_to_cpu(pSMBr->DataCount);
3070 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
3071 parm_count = le32_to_cpu(pSMBr->ParameterCount);
3072
3073 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
3074 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
3075
3076 /* should we also check that parm and data areas do not overlap? */
3077 if (*ppparm > end_of_smb) {
3078 cFYI(1, "parms start after end of smb");
3079 return -EINVAL;
3080 } else if (parm_count + *ppparm > end_of_smb) {
3081 cFYI(1, "parm end after end of smb");
3082 return -EINVAL;
3083 } else if (*ppdata > end_of_smb) {
3084 cFYI(1, "data starts after end of smb");
3085 return -EINVAL;
3086 } else if (data_count + *ppdata > end_of_smb) {
3087 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
3088 *ppdata, data_count, (data_count + *ppdata),
3089 end_of_smb, pSMBr);
3090 return -EINVAL;
3091 } else if (parm_count + data_count > pSMBr->ByteCount) {
3092 cFYI(1, "parm count and data count larger than SMB");
3093 return -EINVAL;
3094 }
3095 *pdatalen = data_count;
3096 *pparmlen = parm_count;
3097 return 0;
3098}
3099
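
The "should we also check that parm and data areas do not overlap?" question left open above would take one more comparison; a sketch of the missing test (not part of this patch):

	/* hypothetical overlap check for validate_ntransact() */
	if (*ppparm < *ppdata) {
		if (*ppparm + parm_count > *ppdata)
			return -EINVAL;	/* parms run into data area */
	} else {
		if (*ppdata + data_count > *ppparm)
			return -EINVAL;	/* data runs into parm area */
	}
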
3058/* Get Security Descriptor (by handle) from remote server for a file or dir */
3059int
3060CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3087,7 +3129,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3087	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
3088
3089	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
3090			 CIFS_STD_OP);
3132			 0);
3091	cifs_stats_inc(&tcon->num_acl_get);
3092	if (rc) {
3093		cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -3212,7 +3254,7 @@ setCifsAclRetry:
3212	return (rc);
3213}
3214
3215#endif /* CONFIG_CIFS_EXPERIMENTAL */
3257#endif /* CONFIG_CIFS_ACL */
3216
3217/* Legacy Query Path Information call for lookup to old servers such
3218   as Win9x/WinME */
@@ -4869,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4869		   __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4870{
4871	struct smb_com_transaction2_sfi_req *pSMB = NULL;
4872	char *data_offset;
4873	struct file_end_of_file_info *parm_data;
4874	int rc = 0;
4875	__u16 params, param_offset, offset, byte_count, count;
@@ -4893,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4893	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4894	offset = param_offset + params;
4895
4896	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4897
4898	count = sizeof(struct file_end_of_file_info);
4899	pSMB->MaxParameterCount = cpu_to_le16(2);
4900	/* BB find exact max SMB PDU from sess structure BB */
@@ -5562,7 +5601,7 @@ QAllEAsRetry:
5562	}
5563
5564	/* make sure list_len doesn't go past end of SMB */
5565	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5604	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
5566	if ((char *)ea_response_data + list_len > end_of_smb) {
5567		cFYI(1, "EA list appears to go beyond SMB");
5568		rc = -EIO;
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb1..000000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
1/*
2 * fs/cifs/cn_cifs.h
3 *
4 * Copyright (c) International Business Machines Corp., 2002
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _CN_CIFS_H
23#define _CN_CIFS_H
24#ifdef CONFIG_CIFS_UPCALL
25#include <linux/types.h>
26#include <linux/connector.h>
27
28struct cifs_upcall {
29 char signature[4]; /* CIFS */
30 enum command {
31 CIFS_GET_IP = 0x00000001, /* get ip address for hostname */
32 CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
33 } command;
34 /* union cifs upcall data follows */
35};
36#endif /* CIFS_UPCALL */
37#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88c84a38bccb..257b6d895e20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,14 +47,13 @@
47#include "ntlmssp.h"
48#include "nterr.h"
49#include "rfc1002pdu.h"
50#include "cn_cifs.h"
51#include "fscache.h"
52
53#define CIFS_PORT 445
54#define RFC1001_PORT 139
55
56extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
57			 unsigned char *p24);
55/* SMB echo "timeout" -- FIXME: tunable? */
56#define SMB_ECHO_INTERVAL (60 * HZ)
58
59extern mempool_t *cifs_req_poolp;
60
@@ -65,8 +64,8 @@ struct smb_vol {
65	char *UNC;
66	char *UNCip;
67	char *iocharset;  /* local code page for mapping to and from Unicode */
68	char source_rfc1001_name[16]; /* netbios name of client */
67	char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
69	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
68	char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
70	uid_t cred_uid;
71	uid_t linux_uid;
72	gid_t linux_gid;
@@ -85,6 +84,7 @@ struct smb_vol {
85	bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
86	bool server_ino:1; /* use inode numbers from server ie UniqueId */
87	bool direct_io:1;
87	bool strict_io:1; /* strict cache behavior */
88	bool remap:1;      /* set to remap seven reserved chars in filenames */
89	bool posix_paths:1; /* unset to not ask for posix pathnames. */
90	bool no_linux_ext:1;
@@ -100,16 +100,26 @@ struct smb_vol {
100	bool noautotune:1;
101	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102	bool fsc:1;	/* enable fscache */
103	bool mfsymlinks:1; /* use Minshall+French Symlinks */
104	bool multiuser:1;
103	unsigned int rsize;
104	unsigned int wsize;
105	bool sockopt_tcp_nodelay:1;
106	unsigned short int port;
109	unsigned long actimeo; /* attribute cache timeout (jiffies) */
107	char *prepath;
111	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
108	struct nls_table *local_nls;
109};
110
111static int ipv4_connect(struct TCP_Server_Info *server);
112static int ipv6_connect(struct TCP_Server_Info *server);
115/* FIXME: should these be tunable? */
116#define TLINK_ERROR_EXPIRE	(1 * HZ)
117#define TLINK_IDLE_EXPIRE	(600 * HZ)
118
119static int ip_connect(struct TCP_Server_Info *server);
120static int generic_ip_connect(struct TCP_Server_Info *server);
121static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
122static void cifs_prune_tlinks(struct work_struct *work);
113
114/*
115 * cifs tcp session reconnection
@@ -143,7 +153,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
143
144	/* before reconnecting the tcp session, mark the smb session (uid)
145	   and the tid bad so they are not used until reconnected */
146	read_lock(&cifs_tcp_ses_lock);
156	cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
157	spin_lock(&cifs_tcp_ses_lock);
147	list_for_each(tmp, &server->smb_ses_list) {
148		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
149		ses->need_reconnect = true;
@@ -153,8 +164,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
153			tcon->need_reconnect = true;
154		}
155	}
156	read_unlock(&cifs_tcp_ses_lock);
167	spin_unlock(&cifs_tcp_ses_lock);
168
157	/* do not want to be sending data on a socket we are freeing */
170	cFYI(1, "%s: tearing down socket", __func__);
158	mutex_lock(&server->srv_mutex);
159	if (server->ssocket) {
160		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -166,30 +179,32 @@ cifs_reconnect(struct TCP_Server_Info *server)
166		sock_release(server->ssocket);
167		server->ssocket = NULL;
168	}
182	server->sequence_number = 0;
183	server->session_estab = false;
184	kfree(server->session_key.response);
185	server->session_key.response = NULL;
186	server->session_key.len = 0;
187	server->lstrp = jiffies;
188	mutex_unlock(&server->srv_mutex);
169
190	/* mark submitted MIDs for retry and issue callback */
191	cFYI(1, "%s: issuing mid callbacks", __func__);
170	spin_lock(&GlobalMid_Lock);
171	list_for_each(tmp, &server->pending_mid_q) {
193	list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
172		mid_entry = list_entry(tmp, struct
173				       mid_q_entry,
174				       qhead);
194		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
175		if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
176			/* Mark other intransit requests as needing
177			   retry so we do not immediately mark the
178			   session bad again (ie after we reconnect
179			   below) as they timeout too */
195		if (mid_entry->midState == MID_REQUEST_SUBMITTED)
180			mid_entry->midState = MID_RETRY_NEEDED;
181		}
197		list_del_init(&mid_entry->qhead);
198		mid_entry->callback(mid_entry);
182	}
183	spin_unlock(&GlobalMid_Lock);
184	mutex_unlock(&server->srv_mutex);
185
186	while ((server->tcpStatus != CifsExiting) &&
187	       (server->tcpStatus != CifsGood)) {
188		try_to_freeze();
189		if (server->addr.sockAddr6.sin6_family == AF_INET6)
190			rc = ipv6_connect(server);
191		else
192			rc = ipv4_connect(server);
205
206		/* we should try only the port we connected to before */
207		rc = generic_ip_connect(server);
193		if (rc) {
194			cFYI(1, "reconnect error %d", rc);
195			msleep(3000);
@@ -198,12 +213,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
198			spin_lock(&GlobalMid_Lock);
199			if (server->tcpStatus != CifsExiting)
200				server->tcpStatus = CifsGood;
201			server->sequence_number = 0;
202			spin_unlock(&GlobalMid_Lock);
203			/* atomic_set(&server->inFlight,0);*/
204			wake_up(&server->response_q);
205		}
206	}
219
207	return rc;
208}
209
@@ -217,9 +230,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
217static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
218{
219	struct smb_t2_rsp *pSMBt;
220	int total_data_size;
221	int data_in_this_rsp;
222	int remaining;
234	__u16 total_data_size, data_in_this_rsp;
223
224	if (pSMB->Command != SMB_COM_TRANSACTION2)
225		return 0;
@@ -233,8 +245,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
233
234	pSMBt = (struct smb_t2_rsp *)pSMB;
235
236	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
237	data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount);
248	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
249	data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
238
239	remaining = total_data_size - data_in_this_rsp;
240
@@ -260,21 +272,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
260{
261	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
262	struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
263	int total_data_size;
264	int total_in_buf;
265	int remaining;
266	int total_in_buf2;
267	char *data_area_of_target;
268	char *data_area_of_buf2;
269	__u16 byte_count;
277	int remaining;
278	__u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
270
271	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
280	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
272
273	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
282	if (total_data_size !=
283	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
274		cFYI(1, "total data size of primary and secondary t2 differ");
275	}
276
277	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
286	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
278
279	remaining = total_data_size - total_in_buf;
280
@@ -284,28 +293,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
284	if (remaining == 0) /* nothing to do, ignore */
285		return 0;
286
287	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
296	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
288	if (remaining < total_in_buf2) {
289		cFYI(1, "transact2 2nd response contains too much data");
290	}
291
292	/* find end of first SMB data area */
293	data_area_of_target = (char *)&pSMBt->hdr.Protocol +
294				le16_to_cpu(pSMBt->t2_rsp.DataOffset);
303				get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
295	/* validate target area */
296
297	data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol +
306	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
298				le16_to_cpu(pSMB2->t2_rsp.DataOffset);
307				get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
299
300	data_area_of_target += total_in_buf;
301
302	/* copy second buffer into end of first buffer */
303	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
304	total_in_buf += total_in_buf2;
305	pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf);
314	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
306	byte_count = le16_to_cpu(BCC_LE(pTargetSMB));
315	byte_count = get_bcc_le(pTargetSMB);
307	byte_count += total_in_buf2;
308	BCC_LE(pTargetSMB) = cpu_to_le16(byte_count);
317	put_bcc_le(byte_count, pTargetSMB);
309
310	byte_count = pTargetSMB->smb_buf_length;
311	byte_count += total_in_buf2;
@@ -319,7 +328,30 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
319		return 0; /* we are done */
320	} else /* more responses to go */
321		return 1;
331}
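
The le16_to_cpu() to get_unaligned_le16() conversions in this hunk matter because the t2_rsp counters sit at odd byte offsets inside the packed SMB response, where a plain 16-bit load can fault or silently misread on strict-alignment architectures. A self-contained illustration of the helper pair (generic kernel API, shown out of context):

#include <asm/unaligned.h>

/* bump a little-endian __u16 that may live at any byte offset */
static inline void le16_add_unaligned(void *field, u16 delta)
{
	u16 v = get_unaligned_le16(field);	/* safe unaligned LE read */

	put_unaligned_le16(v + delta, field);	/* safe unaligned LE write */
}
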
332
333static void
334cifs_echo_request(struct work_struct *work)
335{
336 int rc;
337 struct TCP_Server_Info *server = container_of(work,
338 struct TCP_Server_Info, echo.work);
339
340 /*
341 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
342 * Also, no need to ping if we got a response recently
343 */
344 if (server->tcpStatus != CifsGood ||
345 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
346 goto requeue_echo;
322
348 rc = CIFSSMBEcho(server);
349 if (rc)
350 cFYI(1, "Unable to send echo request to server: %s",
351 server->hostname);
352
353requeue_echo:
354 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
323}
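
In cadence terms: cifs_echo_request() always re-arms itself every SMB_ECHO_INTERVAL, but only transmits when the connection is up and has been quiet, so a busy mount generates no extra traffic. A sketch restating the send/skip predicate in isolation (illustrative, not tree code):

static bool echo_due(struct TCP_Server_Info *server)
{
	/* the "- HZ" leaves a one-second slack so a reply racing the
	 * timer does not push the next ping a whole interval out */
	return server->tcpStatus == CifsGood &&
	       !time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ);
}
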
324
325static int
@@ -333,8 +365,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
333	struct msghdr smb_msg;
334	struct kvec iov;
335	struct socket *csocket = server->ssocket;
336	struct list_head *tmp;
368	struct list_head *tmp, *tmp2;
337	struct cifsSesInfo *ses;
338	struct task_struct *task_to_wake = NULL;
339	struct mid_q_entry *mid_entry;
340	char temp;
@@ -387,7 +418,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
387		smb_msg.msg_control = NULL;
388		smb_msg.msg_controllen = 0;
389		pdu_length = 4; /* enough to get RFC1001 header */
421
390incomplete_rcv:
423 if (echo_retries > 0 &&
424 time_after(jiffies, server->lstrp +
425 (echo_retries * SMB_ECHO_INTERVAL))) {
426 cERROR(1, "Server %s has not responded in %d seconds. "
427 "Reconnecting...", server->hostname,
428 (echo_retries * SMB_ECHO_INTERVAL / HZ));
429 cifs_reconnect(server);
430 csocket = server->ssocket;
431 wake_up(&server->response_q);
432 continue;
433 }
434
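
The watchdog above is plain jiffies arithmetic: the connection is declared dead after echo_retries unanswered echo intervals. With assumed values of echo_retries = 5 and the 60-second SMB_ECHO_INTERVAL defined earlier, that is roughly 300 seconds of silence:

/* assumed numbers, for illustration only */
unsigned long deadline = server->lstrp + 5 * SMB_ECHO_INTERVAL;

if (time_after(jiffies, deadline))	/* ~300 s with no response */
	cifs_reconnect(server);
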
391		length =
392		    kernel_recvmsg(csocket, &smb_msg,
393				&iov, 1, pdu_length, 0 /* BB other flags? */);
@@ -464,7 +508,7 @@ incomplete_rcv:
464			 * initialize frame)
465			 */
466			cifs_set_port((struct sockaddr *)
467					&server->addr.sockAddr, CIFS_PORT);
511					&server->dstaddr, CIFS_PORT);
468			cifs_reconnect(server);
469			csocket = server->ssocket;
470			wake_up(&server->response_q);
@@ -538,19 +582,20 @@ incomplete_rcv:
538		else if (reconnect == 1)
539			continue;
540
541		length += 4; /* account for rfc1002 hdr */
585		total_read += 4; /* account for rfc1002 hdr */
542
543
544		dump_smb(smb_buffer, length);
587		dump_smb(smb_buffer, total_read);
545		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) {
588		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
546			cifs_dump_mem("Bad SMB: ", smb_buffer, 48);
589			cifs_dump_mem("Bad SMB: ", smb_buffer,
590				      total_read < 48 ? total_read : 48);
547			continue;
548		}
549
594		mid_entry = NULL;
595		server->lstrp = jiffies;
550
551		task_to_wake = NULL;
552		spin_lock(&GlobalMid_Lock);
553		list_for_each(tmp, &server->pending_mid_q) {
598		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
554			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
555
556			if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -591,20 +636,19 @@ incomplete_rcv:
591				mid_entry->resp_buf = smb_buffer;
592				mid_entry->largeBuf = isLargeBuf;
593multi_t2_fnd:
594				task_to_wake = mid_entry->tsk;
595				mid_entry->midState = MID_RESPONSE_RECEIVED;
596#ifdef CONFIG_CIFS_STATS2
597				mid_entry->when_received = jiffies;
598#endif
599				/* so we do not time out requests to server
600				which is still responding (since server could
601				be busy but not dead) */
602				server->lstrp = jiffies;
643				list_del_init(&mid_entry->qhead);
644				mid_entry->callback(mid_entry);
603				break;
604			}
647			mid_entry = NULL;
605		}
606		spin_unlock(&GlobalMid_Lock);
607		if (task_to_wake) {
650
651		if (mid_entry != NULL) {
608			/* Was previous buf put in mpx struct for multi-rsp? */
609			if (!isMultiRsp) {
610				/* smb buffer will be freed by user thread */
@@ -613,11 +657,10 @@ multi_t2_fnd:
613				else
614					smallbuf = NULL;
615			}
616			wake_up_process(task_to_wake);
617		} else if (!is_valid_oplock_break(smb_buffer, server) &&
618			   !isMultiRsp) {
619			cERROR(1, "No task to wake, unknown frame received! "
620				   "NumMids %d", midCount.counter);
663				   "NumMids %d", atomic_read(&midCount));
621			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
622				      sizeof(struct smb_hdr));
623#ifdef CONFIG_CIFS_DEBUG2
@@ -629,9 +672,9 @@ multi_t2_fnd:
629	} /* end while !EXITING */
630
631	/* take it off the list, if it's not already */
632	write_lock(&cifs_tcp_ses_lock);
675	spin_lock(&cifs_tcp_ses_lock);
633	list_del_init(&server->tcp_ses_list);
634	write_unlock(&cifs_tcp_ses_lock);
677	spin_unlock(&cifs_tcp_ses_lock);
635
636	spin_lock(&GlobalMid_Lock);
637	server->tcpStatus = CifsExiting;
@@ -665,44 +708,16 @@ multi_t2_fnd:
665	if (smallbuf) /* no sense logging a debug message if NULL */
666		cifs_small_buf_release(smallbuf);
667
668	/*
711	if (!list_empty(&server->pending_mid_q)) {
669 * BB: we shouldn't have to do any of this. It shouldn't be
670 * possible to exit from the thread with active SMB sessions
671 */
672 read_lock(&cifs_tcp_ses_lock);
673 if (list_empty(&server->pending_mid_q)) {
674 /* loop through server session structures attached to this and
675 mark them dead */
676 list_for_each(tmp, &server->smb_ses_list) {
677 ses = list_entry(tmp, struct cifsSesInfo,
678 smb_ses_list);
679 ses->status = CifsExiting;
680 ses->server = NULL;
681 }
682 read_unlock(&cifs_tcp_ses_lock);
683 } else {
684 /* although we can not zero the server struct pointer yet,
685 since there are active requests which may depnd on them,
686 mark the corresponding SMB sessions as exiting too */
687 list_for_each(tmp, &server->smb_ses_list) {
688 ses = list_entry(tmp, struct cifsSesInfo,
689 smb_ses_list);
690 ses->status = CifsExiting;
691 }
692
693		spin_lock(&GlobalMid_Lock);
694		list_for_each(tmp, &server->pending_mid_q) {
713		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
695			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
696			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
697				cFYI(1, "Clearing Mid 0x%x - waking up ",
715			cFYI(1, "Clearing Mid 0x%x - issuing callback",
698					 mid_entry->mid);
699				task_to_wake = mid_entry->tsk;
700				if (task_to_wake)
701					wake_up_process(task_to_wake);
702			}
717			list_del_init(&mid_entry->qhead);
718			mid_entry->callback(mid_entry);
703		}
704		spin_unlock(&GlobalMid_Lock);
705		read_unlock(&cifs_tcp_ses_lock);
706		/* 1/8th of sec is more than enough time for them to exit */
707		msleep(125);
@@ -720,18 +735,6 @@ multi_t2_fnd:
720	   coming home not much else we can do but free the memory */
721	}
722
723 /* last chance to mark ses pointers invalid
724 if there are any pointing to this (e.g
725 if a crazy root user tried to kill cifsd
726 kernel thread explicitly this might happen) */
727 /* BB: This shouldn't be necessary, see above */
728 read_lock(&cifs_tcp_ses_lock);
729 list_for_each(tmp, &server->smb_ses_list) {
730 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
731 ses->server = NULL;
732 }
733 read_unlock(&cifs_tcp_ses_lock);
734
735	kfree(server->hostname);
736	task_to_wake = xchg(&server->tsk, NULL);
737	kfree(server);
@@ -794,24 +797,21 @@ cifs_parse_mount_options(char *options, const char *devname,
794	short int override_gid = -1;
795	bool uid_specified = false;
796	bool gid_specified = false;
800	char *nodename = utsname()->nodename;
797
798	separator[0] = ',';
799	separator[1] = 0;
800
801	if (Local_System_Name[0] != 0)
802		memcpy(vol->source_rfc1001_name, Local_System_Name, 15);
803	else {
804		char *nodename = utsname()->nodename;
805		int n = strnlen(nodename, 15);
806		memset(vol->source_rfc1001_name, 0x20, 15);
807		for (i = 0; i < n; i++) {
808			/* does not have to be perfect mapping since field is
809			informational, only used for servers that do not support
810			port 445 and it can be overridden at mount time */
811			vol->source_rfc1001_name[i] = toupper(nodename[i]);
812		}
813	}
814	vol->source_rfc1001_name[15] = 0;
805	/*
806	 * does not have to be perfect mapping since field is
807	 * informational, only used for servers that do not support
808	 * port 445 and it can be overridden at mount time
809	 */
810	memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
811	for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
812		vol->source_rfc1001_name[i] = toupper(nodename[i]);
813
814	vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
815	/* null target name indicates to use *SMBSERVR default called name
816	   if we end up sending RFC1001 session initialize */
817	vol->target_rfc1001_name[0] = 0;
@@ -828,6 +828,8 @@ cifs_parse_mount_options(char *options, const char *devname,
828	/* default to using server inode numbers where available */
829	vol->server_ino = 1;
830
831	vol->actimeo = CIFS_DEF_ACTIMEO;
832
831	if (!options)
832		return 1;
833
@@ -973,13 +975,11 @@ cifs_parse_mount_options(char *options, const char *devname,
973				return 1;
974			} else if (strnicmp(value, "krb5", 4) == 0) {
975				vol->secFlg |= CIFSSEC_MAY_KRB5;
976#ifdef CONFIG_CIFS_EXPERIMENTAL
977			} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
978				vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
979					CIFSSEC_MUST_SIGN;
980			} else if (strnicmp(value, "ntlmssp", 7) == 0) {
981				vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
982#endif
983			} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
984				vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
985					CIFSSEC_MUST_SIGN;
@@ -1046,6 +1046,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1046					"long\n");
1047				return 1;
1048			}
1049 } else if (strnicmp(data, "srcaddr", 7) == 0) {
1050 vol->srcaddr.ss_family = AF_UNSPEC;
1051
1052 if (!value || !*value) {
1053 printk(KERN_WARNING "CIFS: srcaddr value"
1054 " not specified.\n");
1055 return 1; /* needs_arg; */
1056 }
1057 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
1058 value, strlen(value));
1059 if (i == 0) {
1060 printk(KERN_WARNING "CIFS: Could not parse"
1061 " srcaddr: %s\n",
1062 value);
1063 return 1;
1064 }
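
The srcaddr= parsing above delegates all address-format work to cifs_convert_address(), which accepts both IPv4 and IPv6 text. A hedged sketch of the same call outside the option loop (the literal address is invented for illustration):

struct sockaddr_storage src;
const char *val = "192.168.1.50";	/* e.g. srcaddr=192.168.1.50 */

if (cifs_convert_address((struct sockaddr *)&src, val, strlen(val)) == 0)
	printk(KERN_WARNING "CIFS: Could not parse srcaddr: %s\n", val);
/* on success, src is stored in the vol and later handed to
 * bind_socket() before the TCP connect */
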
1049		} else if (strnicmp(data, "prefixpath", 10) == 0) {
1050			if (!value || !*value) {
1051				printk(KERN_WARNING
@@ -1088,6 +1104,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1088		} else if (!strnicmp(data, "uid", 3) && value && *value) {
1089			vol->linux_uid = simple_strtoul(value, &value, 0);
1090			uid_specified = true;
1107		} else if (!strnicmp(data, "cruid", 5) && value && *value) {
1108			vol->cred_uid = simple_strtoul(value, &value, 0);
1091		} else if (!strnicmp(data, "forceuid", 8)) {
1092			override_uid = 1;
1093		} else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1140,22 +1158,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1140			if (!value || !*value || (*value == ' ')) {
1141				cFYI(1, "invalid (empty) netbiosname");
1142			} else {
1143				memset(vol->source_rfc1001_name, 0x20, 15);
1144				for (i = 0; i < 15; i++) {
1145				/* BB are there cases in which a comma can be
1146				valid in this workstation netbios name (and need
1147				special handling)? */
1148
1149				/* We do not uppercase netbiosname for user */
1161				memset(vol->source_rfc1001_name, 0x20,
1162					RFC1001_NAME_LEN);
1163				/*
1164				 * FIXME: are there cases in which a comma can
1165				 * be valid in workstation netbios name (and
1166				 * need special handling)?
1167				 */
1168				for (i = 0; i < RFC1001_NAME_LEN; i++) {
1169					/* don't ucase netbiosname for user */
1150					if (value[i] == 0)
1151						break;
1152					else
1153						vol->source_rfc1001_name[i] =
1154							value[i];
1172					vol->source_rfc1001_name[i] = value[i];
1155				}
1156				/* The string has 16th byte zero still from
1157				   set at top of the function  */
1158				if ((i == 15) && (value[i] != 0))
1176				if (i == RFC1001_NAME_LEN && value[i] != 0)
1159					printk(KERN_WARNING "CIFS: netbiosname"
1160						" longer than 15 truncated.\n");
1161			}
@@ -1165,7 +1183,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1165			cFYI(1, "empty server netbiosname specified");
1166		} else {
1167			/* last byte, type, is 0x20 for servr type */
1168			memset(vol->target_rfc1001_name, 0x20, 16);
1186			memset(vol->target_rfc1001_name, 0x20,
1187				RFC1001_NAME_LEN_WITH_NULL);
1169
1170			for (i = 0; i < 15; i++) {
1171				/* BB are there cases in which a comma can be
@@ -1182,10 +1201,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1182			}
1183			/* The string has 16th byte zero still from
1184			   set at top of the function  */
1185			if ((i == 15) && (value[i] != 0))
1204			if (i == RFC1001_NAME_LEN && value[i] != 0)
1186				printk(KERN_WARNING "CIFS: server net"
1187				"biosname longer than 15 truncated.\n");
1188		}
1208 } else if (strnicmp(data, "actimeo", 7) == 0) {
1209 if (value && *value) {
1210 vol->actimeo = HZ * simple_strtoul(value,
1211 &value, 0);
1212 if (vol->actimeo > CIFS_MAX_ACTIMEO) {
1213				cERROR(1, "CIFS: attribute cache "
1214					"timeout too large");
1215 return 1;
1216 }
1217 }
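
actimeo= is taken in whole seconds and stored scaled to jiffies, capped by CIFS_MAX_ACTIMEO; a small worked example (the option value is illustrative):

/* "actimeo=30" arrives as the string "30" */
unsigned long actimeo = HZ * simple_strtoul("30", NULL, 0);	/* 30 s */

/* cached attributes then count as fresh while
 * time_in_range(jiffies, stamp, stamp + actimeo) holds */
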
1189		} else if (strnicmp(data, "credentials", 4) == 0) {
1190			/* ignore */
1191		} else if (strnicmp(data, "version", 3) == 0) {
@@ -1303,10 +1332,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1303			vol->no_psx_acl = 0;
1304		} else if (strnicmp(data, "noacl", 5) == 0) {
1305			vol->no_psx_acl = 1;
1306#ifdef CONFIG_CIFS_EXPERIMENTAL
1307		} else if (strnicmp(data, "locallease", 6) == 0) {
1308			vol->local_lease = 1;
1309#endif
1310		} else if (strnicmp(data, "sign", 4) == 0) {
1311			vol->secFlg |= CIFSSEC_MUST_SIGN;
1312		} else if (strnicmp(data, "seal", 4) == 0) {
@@ -1319,12 +1346,23 @@ cifs_parse_mount_options(char *options, const char *devname,
1319			vol->direct_io = 1;
1320		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
1321			vol->direct_io = 1;
1349		} else if (strnicmp(data, "strictcache", 11) == 0) {
1350			vol->strict_io = 1;
1322		} else if (strnicmp(data, "noac", 4) == 0) {
1323			printk(KERN_WARNING "CIFS: Mount option noac not "
1324				"supported. Instead set "
1325				"/proc/fs/cifs/LookupCacheEnabled to 0\n");
1326		} else if (strnicmp(data, "fsc", 3) == 0) {
1356#ifndef CONFIG_CIFS_FSCACHE
1357			cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE "
1358				  "kernel config option set");
1359			return 1;
1360#endif
1327			vol->fsc = true;
1362		} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1363			vol->mfsymlinks = true;
1364		} else if (strnicmp(data, "multiuser", 8) == 0) {
1365			vol->multiuser = true;
1328		} else
1329			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1330				data);
@@ -1356,6 +1394,13 @@ cifs_parse_mount_options(char *options, const char *devname,
1356			return 1;
1357		}
1358	}
1397
1398 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
1399 cERROR(1, "Multiuser mounts currently require krb5 "
1400 "authentication!");
1401 return 1;
1402 }
1403
1359	if (vol->UNCip == NULL)
1360		vol->UNCip = &vol->UNC[2];
1361
@@ -1374,33 +1419,100 @@ cifs_parse_mount_options(char *options, const char *devname,
1374	return 0;
1375}
1376
1422/** Returns true if srcaddr isn't specified and rhs isn't
1423 * specified, or if srcaddr is specified and
1424 * matches the IP address of the rhs argument.
1425 */
1426static bool
1427srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1428{
1429 switch (srcaddr->sa_family) {
1430 case AF_UNSPEC:
1431 return (rhs->sa_family == AF_UNSPEC);
1432 case AF_INET: {
1433 struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
1434 struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
1435 return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
1436 }
1437 case AF_INET6: {
1438 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
1439		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
1440 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
1441 }
1442 default:
1443 WARN_ON(1);
1444 return false; /* don't expect to be here */
1445 }
1446}
1447
1448/*
1449 * If no port is specified in addr structure, we try to match with 445 port
1450 * and if it fails - with 139 ports. It should be called only if address
1451 * families of server and addr are equal.
1452 */
1377static bool
1378match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
1454match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1379{
1380	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1381	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1456	unsigned short int port, *sport;
1382
1383	switch (addr->sa_family) {
1384	case AF_INET:
1385		if (addr4->sin_addr.s_addr !=
1386		    server->addr.sockAddr.sin_addr.s_addr)
1387			return false;
1388		if (addr4->sin_port &&
1389		    addr4->sin_port != server->addr.sockAddr.sin_port)
1390			return false;
1460		sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
1461		port = ((struct sockaddr_in *) addr)->sin_port;
1391		break;
1392	case AF_INET6:
1393		if (!ipv6_addr_equal(&addr6->sin6_addr,
1394				     &server->addr.sockAddr6.sin6_addr))
1464		sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
1465		port = ((struct sockaddr_in6 *) addr)->sin6_port;
1466 break;
1467 default:
1468 WARN_ON(1);
1469 return false;
1470 }
1471
1472 if (!port) {
1473 port = htons(CIFS_PORT);
1474 if (port == *sport)
1475 return true;
1476
1477 port = htons(RFC1001_PORT);
1478 }
1479
1480 return port == *sport;
1481}
1482
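
A worked case for the fallback order in match_port(): a request that names no port matches an existing connection on 445 first, then one on 139; any explicit port must match exactly. Restated as a standalone predicate (illustration, not tree code):

static bool port_would_match(__be16 requested, __be16 existing)
{
	if (!requested)	/* unspecified: try the defaults in order */
		return existing == htons(445) || existing == htons(139);
	return requested == existing;
}
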
1483static bool
1484match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1485 struct sockaddr *srcaddr)
1486{
1487 switch (addr->sa_family) {
1488 case AF_INET: {
1489 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1490 struct sockaddr_in *srv_addr4 =
1491 (struct sockaddr_in *)&server->dstaddr;
1492
1493 if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
1395			return false;
1396		if (addr6->sin6_scope_id !=
1397		    server->addr.sockAddr6.sin6_scope_id)
1495		break;
1496	}
1497 case AF_INET6: {
1498 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1499 struct sockaddr_in6 *srv_addr6 =
1500 (struct sockaddr_in6 *)&server->dstaddr;
1501
1502 if (!ipv6_addr_equal(&addr6->sin6_addr,
1503 &srv_addr6->sin6_addr))
1398			return false;
1399		if (addr6->sin6_port &&
1400		    addr6->sin6_port != server->addr.sockAddr6.sin6_port)
1505		if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
1401			return false;
1402		break;
1403	}
1509 default:
1510 WARN_ON(1);
1511 return false; /* don't expect to be here */
1512 }
1513
1514 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1515 return false;
1404
1405	return true;
1406}
@@ -1458,29 +1570,27 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1458{
1459	struct TCP_Server_Info *server;
1460
1461	write_lock(&cifs_tcp_ses_lock);
1573	spin_lock(&cifs_tcp_ses_lock);
1462	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1463		/*
1575		if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1464 * the demux thread can exit on its own while still in CifsNew
1465 * so don't accept any sockets in that state. Since the
1466 * tcpStatus never changes back to CifsNew it's safe to check
1467 * for this without a lock.
1468 */
1469 if (server->tcpStatus == CifsNew)
1470			continue;
1471
1472		if (!match_address(server, addr))
1578		if (!match_address(server, addr,
1579 (struct sockaddr *)&vol->srcaddr))
1580 continue;
1581
1582 if (!match_port(server, addr))
1473			continue;
1474
1475		if (!match_security(server, vol))
1476			continue;
1477
1478		++server->srv_count;
1479		write_unlock(&cifs_tcp_ses_lock);
1589		spin_unlock(&cifs_tcp_ses_lock);
1480		cFYI(1, "Existing tcp session with server found");
1481		return server;
1482	}
1483	write_unlock(&cifs_tcp_ses_lock);
1593	spin_unlock(&cifs_tcp_ses_lock);
1484	return NULL;
1485}
1486
@@ -1489,21 +1599,30 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1489{
1490	struct task_struct *task;
1491
1492	write_lock(&cifs_tcp_ses_lock);
1602	spin_lock(&cifs_tcp_ses_lock);
1493	if (--server->srv_count > 0) {
1494		write_unlock(&cifs_tcp_ses_lock);
1604		spin_unlock(&cifs_tcp_ses_lock);
1495		return;
1496	}
1497
1608	put_net(cifs_net_ns(server));
1609
1498	list_del_init(&server->tcp_ses_list);
1499	write_unlock(&cifs_tcp_ses_lock);
1611	spin_unlock(&cifs_tcp_ses_lock);
1612
1613	cancel_delayed_work_sync(&server->echo);
1500
1501	spin_lock(&GlobalMid_Lock);
1502	server->tcpStatus = CifsExiting;
1503	spin_unlock(&GlobalMid_Lock);
1504
1619	cifs_crypto_shash_release(server);
1505	cifs_fscache_release_client_cookie(server);
1506
1622	kfree(server->session_key.response);
1623	server->session_key.response = NULL;
1624	server->session_key.len = 0;
1625
1507	task = xchg(&server->tsk, NULL);
1508	if (task)
1509		force_sig(SIGKILL, task);
@@ -1556,10 +1675,17 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1556		goto out_err;
1557	}
1558
1678 rc = cifs_crypto_shash_allocate(tcp_ses);
1679 if (rc) {
1680 cERROR(1, "could not setup hash structures rc %d", rc);
1681 goto out_err;
1682 }
1683
1684 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
1559	tcp_ses->hostname = extract_hostname(volume_info->UNC);
1560	if (IS_ERR(tcp_ses->hostname)) {
1561		rc = PTR_ERR(tcp_ses->hostname);
1562		goto out_err;
1688		goto out_err_crypto_release;
1563	}
1564
1565	tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1574,9 +1700,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1574		volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1575	memcpy(tcp_ses->server_RFC1001_name,
1576		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1703	tcp_ses->session_estab = false;
1577	tcp_ses->sequence_number = 0;
1705	tcp_ses->lstrp = jiffies;
1578	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1579	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1708	INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
1580
1581	/*
1582	 * at this point we are the only ones with the pointer
@@ -1584,23 +1713,24 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1584	 * no need to spinlock this init of tcpStatus or srv_count
1585	 */
1586	tcp_ses->tcpStatus = CifsNew;
1716	memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
1717	       sizeof(tcp_ses->srcaddr));
1587	++tcp_ses->srv_count;
1588
1589	if (addr.ss_family == AF_INET6) {
1590		cFYI(1, "attempting ipv6 connect");
1591		/* BB should we allow ipv6 on port 139? */
1592		/* other OS never observed in Wild doing 139 with v6 */
1593		memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
1594			sizeof(struct sockaddr_in6));
1595		rc = ipv6_connect(tcp_ses);
1596	} else {
1597		memcpy(&tcp_ses->addr.sockAddr, sin_server,
1598			sizeof(struct sockaddr_in));
1599		rc = ipv4_connect(tcp_ses);
1600	}
1724		memcpy(&tcp_ses->dstaddr, sin_server6,
1725			sizeof(struct sockaddr_in6));
1726	} else
1727		memcpy(&tcp_ses->dstaddr, sin_server,
1728			sizeof(struct sockaddr_in));
1729
1730	rc = ip_connect(tcp_ses);
1601	if (rc < 0) {
1602		cERROR(1, "Error connecting to socket. Aborting operation");
1603		goto out_err;
1733		goto out_err_crypto_release;
1604	}
1605
1606	/*
@@ -1614,18 +1744,26 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1614		rc = PTR_ERR(tcp_ses->tsk);
1615		cERROR(1, "error %d create cifsd thread", rc);
1616		module_put(THIS_MODULE);
1617		goto out_err;
1747		goto out_err_crypto_release;
1618	}
1619
1620	/* thread spawned, put it on the list */
1621	write_lock(&cifs_tcp_ses_lock);
1751	spin_lock(&cifs_tcp_ses_lock);
1622	list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
1623	write_unlock(&cifs_tcp_ses_lock);
1753	spin_unlock(&cifs_tcp_ses_lock);
1624
1625	cifs_fscache_get_client_cookie(tcp_ses);
1626
1757	/* queue echo request delayed work */
1758	queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
1759
1627	return tcp_ses;
1628
1762out_err_crypto_release:
1763	cifs_crypto_shash_release(tcp_ses);
1764
1765	put_net(cifs_net_ns(tcp_ses));
1766
1629out_err:
1630	if (tcp_ses) {
1631		if (!IS_ERR(tcp_ses->hostname))
@@ -1642,7 +1780,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1642{
1643	struct cifsSesInfo *ses;
1644
1645	write_lock(&cifs_tcp_ses_lock);
1783	spin_lock(&cifs_tcp_ses_lock);
1646	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
1647		switch (server->secType) {
1648		case Kerberos:
@@ -1662,10 +1800,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1662			continue;
1663		}
1664		++ses->ses_count;
1665		write_unlock(&cifs_tcp_ses_lock);
1803		spin_unlock(&cifs_tcp_ses_lock);
1666		return ses;
1667	}
1668	write_unlock(&cifs_tcp_ses_lock);
1806	spin_unlock(&cifs_tcp_ses_lock);
1669	return NULL;
1670}
1671
@@ -1676,14 +1814,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1676	struct TCP_Server_Info *server = ses->server;
1677
1678	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1679	write_lock(&cifs_tcp_ses_lock);
1817	spin_lock(&cifs_tcp_ses_lock);
1680	if (--ses->ses_count > 0) {
1681		write_unlock(&cifs_tcp_ses_lock);
1819		spin_unlock(&cifs_tcp_ses_lock);
1682		return;
1683	}
1684
1685	list_del_init(&ses->smb_ses_list);
1686	write_unlock(&cifs_tcp_ses_lock);
1824	spin_unlock(&cifs_tcp_ses_lock);
1687
1688	if (ses->status == CifsGood) {
1689		xid = GetXid();
@@ -1699,6 +1837,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1699{
1700	int rc = -ENOMEM, xid;
1701	struct cifsSesInfo *ses;
1840	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
1841	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
1702
1703	xid = GetXid();
1704
@@ -1742,12 +1882,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1742
1743	/* new SMB session uses our server ref */
1744	ses->server = server;
1745	if (server->addr.sockAddr6.sin6_family == AF_INET6)
1746		sprintf(ses->serverName, "%pI6",
1747			&server->addr.sockAddr6.sin6_addr);
1885	if (server->dstaddr.ss_family == AF_INET6)
1886		sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
1748	else
1749		sprintf(ses->serverName, "%pI4",
1750			&server->addr.sockAddr.sin_addr.s_addr);
1888		sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1751
1752	if (volume_info->username)
1753		strncpy(ses->userName, volume_info->username,
@@ -1760,10 +1898,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1760		goto get_ses_fail;
1761	}
1762	if (volume_info->domainname) {
1763		int len = strlen(volume_info->domainname);
1764		ses->domainName = kmalloc(len + 1, GFP_KERNEL);
1765		if (ses->domainName)
1766			strcpy(ses->domainName, volume_info->domainname);
1901		ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
1902		if (!ses->domainName)
1903			goto get_ses_fail;
1767	}
1768	ses->cred_uid = volume_info->cred_uid;
1769	ses->linux_uid = volume_info->linux_uid;
@@ -1778,9 +1915,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1778		goto get_ses_fail;
1779
1780	/* success, put it on the list */
1781	write_lock(&cifs_tcp_ses_lock);
1918	spin_lock(&cifs_tcp_ses_lock);
1782	list_add(&ses->smb_ses_list, &server->smb_ses_list);
1783	write_unlock(&cifs_tcp_ses_lock);
1920	spin_unlock(&cifs_tcp_ses_lock);
1784
1785	FreeXid(xid);
1786	return ses;
@@ -1797,7 +1934,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1797	struct list_head *tmp;
1798	struct cifsTconInfo *tcon;
1799
1800	write_lock(&cifs_tcp_ses_lock);
1937	spin_lock(&cifs_tcp_ses_lock);
1801	list_for_each(tmp, &ses->tcon_list) {
1802		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
1803		if (tcon->tidStatus == CifsExiting)
@@ -1806,10 +1943,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1806			continue;
1807
1808		++tcon->tc_count;
1809		write_unlock(&cifs_tcp_ses_lock);
1946		spin_unlock(&cifs_tcp_ses_lock);
1810		return tcon;
1811	}
1812	write_unlock(&cifs_tcp_ses_lock);
1949	spin_unlock(&cifs_tcp_ses_lock);
1813	return NULL;
1814}
1815
@@ -1820,14 +1957,14 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1820	struct cifsSesInfo *ses = tcon->ses;
1821
1822	cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1823	write_lock(&cifs_tcp_ses_lock);
1960	spin_lock(&cifs_tcp_ses_lock);
1824	if (--tcon->tc_count > 0) {
1825		write_unlock(&cifs_tcp_ses_lock);
1962		spin_unlock(&cifs_tcp_ses_lock);
1826		return;
1827	}
1828
1829	list_del_init(&tcon->tcon_list);
1830	write_unlock(&cifs_tcp_ses_lock);
1967	spin_unlock(&cifs_tcp_ses_lock);
1831
1832	xid = GetXid();
1833	CIFSSMBTDis(xid, tcon);
@@ -1900,9 +2037,9 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1900	tcon->nocase = volume_info->nocase;
1901	tcon->local_lease = volume_info->local_lease;
1902
1903	write_lock(&cifs_tcp_ses_lock);
2040	spin_lock(&cifs_tcp_ses_lock);
1904	list_add(&tcon->tcon_list, &ses->tcon_list);
1905	write_unlock(&cifs_tcp_ses_lock);
2042	spin_unlock(&cifs_tcp_ses_lock);
1906
1907	cifs_fscache_get_super_cookie(tcon);
1908
@@ -1913,6 +2050,23 @@ out_fail:
1913	return ERR_PTR(rc);
1914}
1915
2053void
2054cifs_put_tlink(struct tcon_link *tlink)
2055{
2056 if (!tlink || IS_ERR(tlink))
2057 return;
2058
2059 if (!atomic_dec_and_test(&tlink->tl_count) ||
2060 test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
2061 tlink->tl_time = jiffies;
2062 return;
2063 }
2064
2065 if (!IS_ERR(tlink_tcon(tlink)))
2066 cifs_put_tcon(tlink_tcon(tlink));
2067 kfree(tlink);
2068 return;
2069}
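
cifs_put_tlink() drops a reference but keeps any tlink still in the tree alive, merely stamping tl_time; reclaiming it is left to the cifs_prune_tlinks() worker declared earlier. A sketch of the expiry test that pruner presumably applies, built only from the constants above (an assumption, since its body is not in this hunk):

/* assumed pruning predicate, derived from TLINK_IDLE_EXPIRE */
static bool tlink_idle_expired(struct tcon_link *tlink)
{
	return atomic_read(&tlink->tl_count) == 0 &&
	       time_after(jiffies, tlink->tl_time + TLINK_IDLE_EXPIRE);
}
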
1916
1917int
1918get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -1997,21 +2151,135 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
1997 2151
1998} 2152}
1999 2153
2154static int
2155bind_socket(struct TCP_Server_Info *server)
2156{
2157 int rc = 0;
2158 if (server->srcaddr.ss_family != AF_UNSPEC) {
2159 /* Bind to the specified local IP address */
2160 struct socket *socket = server->ssocket;
2161 rc = socket->ops->bind(socket,
2162 (struct sockaddr *) &server->srcaddr,
2163 sizeof(server->srcaddr));
2164 if (rc < 0) {
2165 struct sockaddr_in *saddr4;
2166 struct sockaddr_in6 *saddr6;
2167 saddr4 = (struct sockaddr_in *)&server->srcaddr;
2168 saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
2169 if (saddr6->sin6_family == AF_INET6)
2170 cERROR(1, "cifs: "
2171 "Failed to bind to: %pI6c, error: %d\n",
2172 &saddr6->sin6_addr, rc);
2173 else
2174 cERROR(1, "cifs: "
2175 "Failed to bind to: %pI4, error: %d\n",
2176 &saddr4->sin_addr.s_addr, rc);
2177 }
2178 }
2179 return rc;
2180}
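
bind_socket() exists so a mount can pin its outgoing traffic to a chosen local address: binding the client socket before connect selects the source interface, and AF_UNSPEC means "no preference". A userspace analogue under the same convention, with error handling trimmed:

/* Userspace analogue of bind_socket(): bind the client socket to the
 * caller-supplied source address before connecting. */
#include <sys/socket.h>
#include <unistd.h>

int bound_tcp_socket(int family, const struct sockaddr *src, socklen_t slen)
{
	int fd = socket(family, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	if (src != NULL && src->sa_family != AF_UNSPEC &&
	    bind(fd, src, slen) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* caller connect()s as usual */
}
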
2000 2181
2001static int 2182static int
2002ipv4_connect(struct TCP_Server_Info *server) 2183ip_rfc1001_connect(struct TCP_Server_Info *server)
2003{ 2184{
2004 int rc = 0; 2185 int rc = 0;
2005 int val; 2186 /*
2006 bool connected = false; 2187 * some servers require RFC1001 sessinit before sending
2007 __be16 orig_port = 0; 2188 * negprot - BB check reconnection in case where second
2189 * sessinit is sent but no second negprot
2190 */
2191 struct rfc1002_session_packet *ses_init_buf;
2192 struct smb_hdr *smb_buf;
2193 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2194 GFP_KERNEL);
2195 if (ses_init_buf) {
2196 ses_init_buf->trailer.session_req.called_len = 32;
2197
2198 if (server->server_RFC1001_name &&
2199 server->server_RFC1001_name[0] != 0)
2200 rfc1002mangle(ses_init_buf->trailer.
2201 session_req.called_name,
2202 server->server_RFC1001_name,
2203 RFC1001_NAME_LEN_WITH_NULL);
2204 else
2205 rfc1002mangle(ses_init_buf->trailer.
2206 session_req.called_name,
2207 DEFAULT_CIFS_CALLED_NAME,
2208 RFC1001_NAME_LEN_WITH_NULL);
2209
2210 ses_init_buf->trailer.session_req.calling_len = 32;
2211
2212 /*
2213 * calling name ends in null (byte 16) from old smb
2214 * convention.
2215 */
2216 if (server->workstation_RFC1001_name &&
2217 server->workstation_RFC1001_name[0] != 0)
2218 rfc1002mangle(ses_init_buf->trailer.
2219 session_req.calling_name,
2220 server->workstation_RFC1001_name,
2221 RFC1001_NAME_LEN_WITH_NULL);
2222 else
2223 rfc1002mangle(ses_init_buf->trailer.
2224 session_req.calling_name,
2225 "LINUX_CIFS_CLNT",
2226 RFC1001_NAME_LEN_WITH_NULL);
2227
2228 ses_init_buf->trailer.session_req.scope1 = 0;
2229 ses_init_buf->trailer.session_req.scope2 = 0;
2230 smb_buf = (struct smb_hdr *)ses_init_buf;
2231
2232 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2233 smb_buf->smb_buf_length = 0x81000044;
2234 rc = smb_send(server, smb_buf, 0x44);
2235 kfree(ses_init_buf);
2236 /*
 2237 * The RFC1001 layer in at least one server
 2238 * requires a very short break before negprot,
 2239 * presumably because it does not expect negprot
 2240 * to follow so quickly. This is a simple
 2241 * solution that works without
 2242 * complicating the code and causes no
 2243 * significant slowdown on mount
 2244 * for everyone else.
2245 */
2246 usleep_range(1000, 2000);
2247 }
2248 /*
2249 * else the negprot may still work without this
2250 * even though malloc failed
2251 */
2252
2253 return rc;
2254}
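
The magic number 0x81000044 stored into smb_buf_length above is the 4-byte RFC 1002 session-service header as this code lays it out: packet type 0x81 (SESSION REQUEST) in the top octet and the payload length in the low bits, where 0x44 == 68 bytes covers the two 34-byte mangled NetBIOS names with no scope. A sketch of composing it explicitly rather than as a literal:

/* RFC 1002 session-request header composed field by field. */
#include <assert.h>
#include <stdint.h>

static uint32_t rfc1002_session_req_hdr(uint16_t payload_len)
{
	return ((uint32_t)0x81 << 24) | payload_len;
}

int main(void)
{
	assert(rfc1002_session_req_hdr(0x44) == 0x81000044);
	return 0;
}
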
2255
2256static int
2257generic_ip_connect(struct TCP_Server_Info *server)
2258{
2259 int rc = 0;
2260 unsigned short int sport;
2261 int slen, sfamily;
2008 struct socket *socket = server->ssocket; 2262 struct socket *socket = server->ssocket;
2263 struct sockaddr *saddr;
2264
2265 saddr = (struct sockaddr *) &server->dstaddr;
2266
2267 if (server->dstaddr.ss_family == AF_INET6) {
2268 sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
2269 slen = sizeof(struct sockaddr_in6);
2270 sfamily = AF_INET6;
2271 } else {
2272 sport = ((struct sockaddr_in *) saddr)->sin_port;
2273 slen = sizeof(struct sockaddr_in);
2274 sfamily = AF_INET;
2275 }
2009 2276
2010 if (socket == NULL) { 2277 if (socket == NULL) {
2011 rc = sock_create_kern(PF_INET, SOCK_STREAM, 2278 rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
2012 IPPROTO_TCP, &socket); 2279 IPPROTO_TCP, &socket, 1);
2013 if (rc < 0) { 2280 if (rc < 0) {
2014 cERROR(1, "Error %d creating socket", rc); 2281 cERROR(1, "Error %d creating socket", rc);
2282 server->ssocket = NULL;
2015 return rc; 2283 return rc;
2016 } 2284 }
2017 2285
@@ -2019,59 +2287,28 @@ ipv4_connect(struct TCP_Server_Info *server)
2019 cFYI(1, "Socket created"); 2287 cFYI(1, "Socket created");
2020 server->ssocket = socket; 2288 server->ssocket = socket;
2021 socket->sk->sk_allocation = GFP_NOFS; 2289 socket->sk->sk_allocation = GFP_NOFS;
2022 cifs_reclassify_socket4(socket); 2290 if (sfamily == AF_INET6)
2291 cifs_reclassify_socket6(socket);
2292 else
2293 cifs_reclassify_socket4(socket);
2023 } 2294 }
2024 2295
2025 /* user overrode default port */ 2296 rc = bind_socket(server);
2026 if (server->addr.sockAddr.sin_port) { 2297 if (rc < 0)
2027 rc = socket->ops->connect(socket, (struct sockaddr *) 2298 return rc;
2028 &server->addr.sockAddr,
2029 sizeof(struct sockaddr_in), 0);
2030 if (rc >= 0)
2031 connected = true;
2032 }
2033
2034 if (!connected) {
2035 /* save original port so we can retry user specified port
2036 later if fall back ports fail this time */
2037 orig_port = server->addr.sockAddr.sin_port;
2038
2039 /* do not retry on the same port we just failed on */
2040 if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
2041 server->addr.sockAddr.sin_port = htons(CIFS_PORT);
2042 rc = socket->ops->connect(socket,
2043 (struct sockaddr *)
2044 &server->addr.sockAddr,
2045 sizeof(struct sockaddr_in), 0);
2046 if (rc >= 0)
2047 connected = true;
2048 }
2049 }
2050 if (!connected) {
2051 server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
2052 rc = socket->ops->connect(socket, (struct sockaddr *)
2053 &server->addr.sockAddr,
2054 sizeof(struct sockaddr_in), 0);
2055 if (rc >= 0)
2056 connected = true;
2057 }
2058 2299
2059 /* give up here - unless we want to retry on different 2300 rc = socket->ops->connect(socket, saddr, slen, 0);
2060 protocol families some day */ 2301 if (rc < 0) {
2061 if (!connected) { 2302 cFYI(1, "Error %d connecting to server", rc);
2062 if (orig_port)
2063 server->addr.sockAddr.sin_port = orig_port;
2064 cFYI(1, "Error %d connecting to server via ipv4", rc);
2065 sock_release(socket); 2303 sock_release(socket);
2066 server->ssocket = NULL; 2304 server->ssocket = NULL;
2067 return rc; 2305 return rc;
2068 } 2306 }
2069 2307
2070
2071 /* 2308 /*
2072 * Eventually check for other socket options to change from 2309 * Eventually check for other socket options to change from
2073 * the default. sock_setsockopt not used because it expects 2310 * the default. sock_setsockopt not used because it expects
2074 * user space buffer 2311 * user space buffer
2075 */ 2312 */
2076 socket->sk->sk_rcvtimeo = 7 * HZ; 2313 socket->sk->sk_rcvtimeo = 7 * HZ;
2077 socket->sk->sk_sndtimeo = 5 * HZ; 2314 socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2085,7 +2322,7 @@ ipv4_connect(struct TCP_Server_Info *server)
2085 } 2322 }
2086 2323
2087 if (server->tcp_nodelay) { 2324 if (server->tcp_nodelay) {
2088 val = 1; 2325 int val = 1;
2089 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2326 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2090 (char *)&val, sizeof(val)); 2327 (char *)&val, sizeof(val));
2091 if (rc) 2328 if (rc)
@@ -2096,157 +2333,39 @@ ipv4_connect(struct TCP_Server_Info *server)
2096 socket->sk->sk_sndbuf, 2333 socket->sk->sk_sndbuf,
2097 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); 2334 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
2098 2335
2099 /* send RFC1001 sessinit */ 2336 if (sport == htons(RFC1001_PORT))
2100 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2337 rc = ip_rfc1001_connect(server);
2101 /* some servers require RFC1001 sessinit before sending
2102 negprot - BB check reconnection in case where second
2103 sessinit is sent but no second negprot */
2104 struct rfc1002_session_packet *ses_init_buf;
2105 struct smb_hdr *smb_buf;
2106 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2107 GFP_KERNEL);
2108 if (ses_init_buf) {
2109 ses_init_buf->trailer.session_req.called_len = 32;
2110 if (server->server_RFC1001_name &&
2111 server->server_RFC1001_name[0] != 0)
2112 rfc1002mangle(ses_init_buf->trailer.
2113 session_req.called_name,
2114 server->server_RFC1001_name,
2115 RFC1001_NAME_LEN_WITH_NULL);
2116 else
2117 rfc1002mangle(ses_init_buf->trailer.
2118 session_req.called_name,
2119 DEFAULT_CIFS_CALLED_NAME,
2120 RFC1001_NAME_LEN_WITH_NULL);
2121
2122 ses_init_buf->trailer.session_req.calling_len = 32;
2123
2124 /* calling name ends in null (byte 16) from old smb
2125 convention. */
2126 if (server->workstation_RFC1001_name &&
2127 server->workstation_RFC1001_name[0] != 0)
2128 rfc1002mangle(ses_init_buf->trailer.
2129 session_req.calling_name,
2130 server->workstation_RFC1001_name,
2131 RFC1001_NAME_LEN_WITH_NULL);
2132 else
2133 rfc1002mangle(ses_init_buf->trailer.
2134 session_req.calling_name,
2135 "LINUX_CIFS_CLNT",
2136 RFC1001_NAME_LEN_WITH_NULL);
2137
2138 ses_init_buf->trailer.session_req.scope1 = 0;
2139 ses_init_buf->trailer.session_req.scope2 = 0;
2140 smb_buf = (struct smb_hdr *)ses_init_buf;
2141 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2142 smb_buf->smb_buf_length = 0x81000044;
2143 rc = smb_send(server, smb_buf, 0x44);
2144 kfree(ses_init_buf);
2145 msleep(1); /* RFC1001 layer in at least one server
2146 requires very short break before negprot
2147 presumably because not expecting negprot
2148 to follow so fast. This is a simple
2149 solution that works without
2150 complicating the code and causes no
2151 significant slowing down on mount
2152 for everyone else */
2153 }
2154 /* else the negprot may still work without this
2155 even though malloc failed */
2156
2157 }
2158 2338
2159 return rc; 2339 return rc;
2160} 2340}
2161 2341
2162static int 2342static int
2163ipv6_connect(struct TCP_Server_Info *server) 2343ip_connect(struct TCP_Server_Info *server)
2164{ 2344{
2165 int rc = 0; 2345 unsigned short int *sport;
2166 int val; 2346 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2167 bool connected = false; 2347 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2168 __be16 orig_port = 0;
2169 struct socket *socket = server->ssocket;
2170 2348
2171 if (socket == NULL) { 2349 if (server->dstaddr.ss_family == AF_INET6)
2172 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2350 sport = &addr6->sin6_port;
2173 IPPROTO_TCP, &socket); 2351 else
2174 if (rc < 0) { 2352 sport = &addr->sin_port;
2175 cERROR(1, "Error %d creating ipv6 socket", rc);
2176 socket = NULL;
2177 return rc;
2178 }
2179
2180 /* BB other socket options to set KEEPALIVE, NODELAY? */
2181 cFYI(1, "ipv6 Socket created");
2182 server->ssocket = socket;
2183 socket->sk->sk_allocation = GFP_NOFS;
2184 cifs_reclassify_socket6(socket);
2185 }
2186 2353
2187 /* user overrode default port */ 2354 if (*sport == 0) {
2188 if (server->addr.sockAddr6.sin6_port) { 2355 int rc;
2189 rc = socket->ops->connect(socket,
2190 (struct sockaddr *) &server->addr.sockAddr6,
2191 sizeof(struct sockaddr_in6), 0);
2192 if (rc >= 0)
2193 connected = true;
2194 }
2195
2196 if (!connected) {
2197 /* save original port so we can retry user specified port
2198 later if fall back ports fail this time */
2199
2200 orig_port = server->addr.sockAddr6.sin6_port;
2201 /* do not retry on the same port we just failed on */
2202 if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
2203 server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
2204 rc = socket->ops->connect(socket, (struct sockaddr *)
2205 &server->addr.sockAddr6,
2206 sizeof(struct sockaddr_in6), 0);
2207 if (rc >= 0)
2208 connected = true;
2209 }
2210 }
2211 if (!connected) {
2212 server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
2213 rc = socket->ops->connect(socket, (struct sockaddr *)
2214 &server->addr.sockAddr6,
2215 sizeof(struct sockaddr_in6), 0);
2216 if (rc >= 0)
2217 connected = true;
2218 }
2219 2356
2220 /* give up here - unless we want to retry on different 2357 /* try with 445 port at first */
2221 protocol families some day */ 2358 *sport = htons(CIFS_PORT);
2222 if (!connected) {
2223 if (orig_port)
2224 server->addr.sockAddr6.sin6_port = orig_port;
2225 cFYI(1, "Error %d connecting to server via ipv6", rc);
2226 sock_release(socket);
2227 server->ssocket = NULL;
2228 return rc;
2229 }
2230 2359
2231 /* 2360 rc = generic_ip_connect(server);
2232 * Eventually check for other socket options to change from 2361 if (rc >= 0)
2233 * the default. sock_setsockopt not used because it expects 2362 return rc;
2234 * user space buffer
2235 */
2236 socket->sk->sk_rcvtimeo = 7 * HZ;
2237 socket->sk->sk_sndtimeo = 5 * HZ;
2238 2363
2239 if (server->tcp_nodelay) { 2364 /* if it failed, try with 139 port */
2240 val = 1; 2365 *sport = htons(RFC1001_PORT);
2241 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2242 (char *)&val, sizeof(val));
2243 if (rc)
2244 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2245 } 2366 }
2246 2367
2247 server->ssocket = socket; 2368 return generic_ip_connect(server);
2248
2249 return rc;
2250} 2369}
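
ip_connect() keeps the old port-fallback behavior but in one address-family-agnostic place: an explicit port is used as-is, otherwise port 445 is tried first and 139 second. The same control flow as a self-contained userspace sketch, where connect_to() is a hypothetical stand-in for generic_ip_connect() returning >= 0 on success:

/* Userspace sketch of ip_connect()'s port selection. */
#include <arpa/inet.h>

#define CIFS_PORT	445
#define RFC1001_PORT	139

int ip_connect_sketch(struct sockaddr_in *addr,
		      int (*connect_to)(struct sockaddr_in *))
{
	int rc;

	if (addr->sin_port != 0)		/* explicit port: no fallback */
		return connect_to(addr);

	addr->sin_port = htons(CIFS_PORT);	/* try 445 first */
	rc = connect_to(addr);
	if (rc >= 0)
		return rc;

	addr->sin_port = htons(RFC1001_PORT);	/* then fall back to 139 */
	return connect_to(addr);
}
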
2251 2370
2252void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 2371void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
@@ -2383,6 +2502,8 @@ convert_delimiter(char *path, char delim)
2383static void setup_cifs_sb(struct smb_vol *pvolume_info, 2502static void setup_cifs_sb(struct smb_vol *pvolume_info,
2384 struct cifs_sb_info *cifs_sb) 2503 struct cifs_sb_info *cifs_sb)
2385{ 2504{
2505 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
2506
2386 if (pvolume_info->rsize > CIFSMaxBufSize) { 2507 if (pvolume_info->rsize > CIFSMaxBufSize) {
2387 cERROR(1, "rsize %d too large, using MaxBufSize", 2508 cERROR(1, "rsize %d too large, using MaxBufSize",
2388 pvolume_info->rsize); 2509 pvolume_info->rsize);
@@ -2434,6 +2555,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2434 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2555 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2435 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); 2556 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2436 2557
2558 cifs_sb->actimeo = pvolume_info->actimeo;
2559
2437 if (pvolume_info->noperm) 2560 if (pvolume_info->noperm)
2438 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2561 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
2439 if (pvolume_info->setuids) 2562 if (pvolume_info->setuids)
@@ -2462,10 +2585,23 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2462 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2585 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2463 if (pvolume_info->fsc) 2586 if (pvolume_info->fsc)
2464 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; 2587 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
2588 if (pvolume_info->multiuser)
2589 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2590 CIFS_MOUNT_NO_PERM);
2591 if (pvolume_info->strict_io)
2592 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
2465 if (pvolume_info->direct_io) { 2593 if (pvolume_info->direct_io) {
2466 cFYI(1, "mounting share using direct i/o"); 2594 cFYI(1, "mounting share using direct i/o");
2467 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2595 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2468 } 2596 }
2597 if (pvolume_info->mfsymlinks) {
2598 if (pvolume_info->sfu_emul) {
2599 cERROR(1, "mount option mfsymlinks ignored if sfu "
2600 "mount option is used");
2601 } else {
2602 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
2603 }
2604 }
2469 2605
2470 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2606 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2471 cERROR(1, "mount option dynperm ignored if cifsacl " 2607 cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2552,6 +2688,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2552 struct TCP_Server_Info *srvTcp; 2688 struct TCP_Server_Info *srvTcp;
2553 char *full_path; 2689 char *full_path;
2554 char *mount_data = mount_data_global; 2690 char *mount_data = mount_data_global;
2691 struct tcon_link *tlink;
2555#ifdef CONFIG_CIFS_DFS_UPCALL 2692#ifdef CONFIG_CIFS_DFS_UPCALL
2556 struct dfs_info3_param *referrals = NULL; 2693 struct dfs_info3_param *referrals = NULL;
2557 unsigned int num_referrals = 0; 2694 unsigned int num_referrals = 0;
@@ -2563,6 +2700,7 @@ try_mount_again:
2563 pSesInfo = NULL; 2700 pSesInfo = NULL;
2564 srvTcp = NULL; 2701 srvTcp = NULL;
2565 full_path = NULL; 2702 full_path = NULL;
2703 tlink = NULL;
2566 2704
2567 xid = GetXid(); 2705 xid = GetXid();
2568 2706
@@ -2638,8 +2776,6 @@ try_mount_again:
2638 goto remote_path_check; 2776 goto remote_path_check;
2639 } 2777 }
2640 2778
2641 cifs_sb->tcon = tcon;
2642
2643 /* do not care if following two calls succeed - informational */ 2779 /* do not care if following two calls succeed - informational */
2644 if (!tcon->ipc) { 2780 if (!tcon->ipc) {
2645 CIFSSMBQFSDeviceInfo(xid, tcon); 2781 CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2673,13 +2809,13 @@ remote_path_check:
2673 /* check if a whole path (including prepath) is not remote */ 2809 /* check if a whole path (including prepath) is not remote */
2674 if (!rc && cifs_sb->prepathlen && tcon) { 2810 if (!rc && cifs_sb->prepathlen && tcon) {
2675 /* build_path_to_root works only when we have a valid tcon */ 2811 /* build_path_to_root works only when we have a valid tcon */
2676 full_path = cifs_build_path_to_root(cifs_sb); 2812 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2677 if (full_path == NULL) { 2813 if (full_path == NULL) {
2678 rc = -ENOMEM; 2814 rc = -ENOMEM;
2679 goto mount_fail_check; 2815 goto mount_fail_check;
2680 } 2816 }
2681 rc = is_path_accessible(xid, tcon, cifs_sb, full_path); 2817 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2682 if (rc != -EREMOTE) { 2818 if (rc != 0 && rc != -EREMOTE) {
2683 kfree(full_path); 2819 kfree(full_path);
2684 goto mount_fail_check; 2820 goto mount_fail_check;
2685 } 2821 }
@@ -2748,6 +2884,30 @@ remote_path_check:
2748#endif 2884#endif
2749 } 2885 }
2750 2886
2887 if (rc)
2888 goto mount_fail_check;
2889
2890 /* now, hang the tcon off of the superblock */
2891 tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
2892 if (tlink == NULL) {
2893 rc = -ENOMEM;
2894 goto mount_fail_check;
2895 }
2896
2897 tlink->tl_uid = pSesInfo->linux_uid;
2898 tlink->tl_tcon = tcon;
2899 tlink->tl_time = jiffies;
2900 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2901 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2902
2903 cifs_sb->master_tlink = tlink;
2904 spin_lock(&cifs_sb->tlink_tree_lock);
2905 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
2906 spin_unlock(&cifs_sb->tlink_tree_lock);
2907
2908 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2909 TLINK_IDLE_EXPIRE);
2910
2751mount_fail_check: 2911mount_fail_check:
2752 /* on error free sesinfo and tcon struct if needed */ 2912 /* on error free sesinfo and tcon struct if needed */
2753 if (rc) { 2913 if (rc) {
@@ -2786,8 +2946,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2786 TCONX_RSP *pSMBr; 2946 TCONX_RSP *pSMBr;
2787 unsigned char *bcc_ptr; 2947 unsigned char *bcc_ptr;
2788 int rc = 0; 2948 int rc = 0;
2789 int length, bytes_left; 2949 int length;
2790 __u16 count; 2950 __u16 bytes_left, count;
2791 2951
2792 if (ses == NULL) 2952 if (ses == NULL)
2793 return -EIO; 2953 return -EIO;
@@ -2815,7 +2975,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2815 bcc_ptr++; /* skip password */ 2975 bcc_ptr++; /* skip password */
2816 /* already aligned so no need to do it below */ 2976 /* already aligned so no need to do it below */
2817 } else { 2977 } else {
2818 pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 2978 pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
2819 /* BB FIXME add code to fail this if NTLMv2 or Kerberos 2979 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
2820 specified as required (when that support is added to 2980 specified as required (when that support is added to
2821 the vfs in the future) as only NTLM or the much 2981 the vfs in the future) as only NTLM or the much
@@ -2825,16 +2985,16 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2825#ifdef CONFIG_CIFS_WEAK_PW_HASH 2985#ifdef CONFIG_CIFS_WEAK_PW_HASH
2826 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 2986 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2827 (ses->server->secType == LANMAN)) 2987 (ses->server->secType == LANMAN))
2828 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2988 calc_lanman_hash(tcon->password, ses->server->cryptkey,
2829 ses->server->secMode & 2989 ses->server->secMode &
2830 SECMODE_PW_ENCRYPT ? true : false, 2990 SECMODE_PW_ENCRYPT ? true : false,
2831 bcc_ptr); 2991 bcc_ptr);
2832 else 2992 else
2833#endif /* CIFS_WEAK_PW_HASH */ 2993#endif /* CIFS_WEAK_PW_HASH */
2834 SMBNTencrypt(tcon->password, ses->server->cryptKey, 2994 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
2835 bcc_ptr); 2995 bcc_ptr);
2836 2996
2837 bcc_ptr += CIFS_SESS_KEY_SIZE; 2997 bcc_ptr += CIFS_AUTH_RESP_SIZE;
2838 if (ses->capabilities & CAP_UNICODE) { 2998 if (ses->capabilities & CAP_UNICODE) {
2839 /* must align unicode strings */ 2999 /* must align unicode strings */
2840 *bcc_ptr = 0; /* null byte password */ 3000 *bcc_ptr = 0; /* null byte password */
@@ -2872,7 +3032,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2872 pSMB->ByteCount = cpu_to_le16(count); 3032 pSMB->ByteCount = cpu_to_le16(count);
2873 3033
2874 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3034 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
2875 CIFS_STD_OP); 3035 0);
2876 3036
2877 /* above now done in SendReceive */ 3037 /* above now done in SendReceive */
2878 if ((rc == 0) && (tcon != NULL)) { 3038 if ((rc == 0) && (tcon != NULL)) {
@@ -2882,7 +3042,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2882 tcon->need_reconnect = false; 3042 tcon->need_reconnect = false;
2883 tcon->tid = smb_buffer_response->Tid; 3043 tcon->tid = smb_buffer_response->Tid;
2884 bcc_ptr = pByteArea(smb_buffer_response); 3044 bcc_ptr = pByteArea(smb_buffer_response);
2885 bytes_left = BCC(smb_buffer_response); 3045 bytes_left = get_bcc(smb_buffer_response);
2886 length = strnlen(bcc_ptr, bytes_left - 2); 3046 length = strnlen(bcc_ptr, bytes_left - 2);
2887 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) 3047 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
2888 is_unicode = true; 3048 is_unicode = true;
@@ -2934,19 +3094,32 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2934int 3094int
2935cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3095cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2936{ 3096{
2937 int rc = 0; 3097 struct rb_root *root = &cifs_sb->tlink_tree;
3098 struct rb_node *node;
3099 struct tcon_link *tlink;
2938 char *tmp; 3100 char *tmp;
2939 3101
2940 if (cifs_sb->tcon) 3102 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
2941 cifs_put_tcon(cifs_sb->tcon); 3103
3104 spin_lock(&cifs_sb->tlink_tree_lock);
3105 while ((node = rb_first(root))) {
3106 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3107 cifs_get_tlink(tlink);
3108 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3109 rb_erase(node, root);
3110
3111 spin_unlock(&cifs_sb->tlink_tree_lock);
3112 cifs_put_tlink(tlink);
3113 spin_lock(&cifs_sb->tlink_tree_lock);
3114 }
3115 spin_unlock(&cifs_sb->tlink_tree_lock);
2942 3116
2943 cifs_sb->tcon = NULL;
2944 tmp = cifs_sb->prepath; 3117 tmp = cifs_sb->prepath;
2945 cifs_sb->prepathlen = 0; 3118 cifs_sb->prepathlen = 0;
2946 cifs_sb->prepath = NULL; 3119 cifs_sb->prepath = NULL;
2947 kfree(tmp); 3120 kfree(tmp);
2948 3121
2949 return rc; 3122 return 0;
2950} 3123}
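
The rewritten cifs_umount() walks the tlink tree rather than putting a single tcon, and it must drop tlink_tree_lock around each cifs_put_tlink() because the put can issue a tree disconnect and sleep. The locking shape, reduced to a runnable list-drain example:

/* Detach one node under the lock, release it (which may sleep) with
 * the lock dropped, then retake the lock and restart from the head. */
#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

void drain(struct node **head)
{
	struct node *n;

	for (;;) {
		pthread_mutex_lock(&tree_lock);
		n = *head;
		if (n == NULL) {
			pthread_mutex_unlock(&tree_lock);
			return;
		}
		*head = n->next;		/* detached under the lock */
		pthread_mutex_unlock(&tree_lock);
		free(n);			/* "sleeping" release */
	}
}
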
2951 3124
2952int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses) 3125int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2997,6 +3170,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2997 if (rc) { 3170 if (rc) {
2998 cERROR(1, "Send error in SessSetup = %d", rc); 3171 cERROR(1, "Send error in SessSetup = %d", rc);
2999 } else { 3172 } else {
3173 mutex_lock(&ses->server->srv_mutex);
3174 if (!server->session_estab) {
3175 server->session_key.response = ses->auth_key.response;
3176 server->session_key.len = ses->auth_key.len;
3177 server->sequence_number = 0x2;
3178 server->session_estab = true;
3179 ses->auth_key.response = NULL;
3180 }
3181 mutex_unlock(&server->srv_mutex);
3182
3000 cFYI(1, "CIFS Session Established successfully"); 3183 cFYI(1, "CIFS Session Established successfully");
3001 spin_lock(&GlobalMid_Lock); 3184 spin_lock(&GlobalMid_Lock);
3002 ses->status = CifsGood; 3185 ses->status = CifsGood;
@@ -3004,6 +3187,263 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
3004 spin_unlock(&GlobalMid_Lock); 3187 spin_unlock(&GlobalMid_Lock);
3005 } 3188 }
3006 3189
3190 kfree(ses->auth_key.response);
3191 ses->auth_key.response = NULL;
3192 ses->auth_key.len = 0;
3193 kfree(ses->ntlmssp);
3194 ses->ntlmssp = NULL;
3195
3007 return rc; 3196 return rc;
3008} 3197}
3009 3198
3199static struct cifsTconInfo *
3200cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3201{
3202 struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
3203 struct cifsSesInfo *ses;
3204 struct cifsTconInfo *tcon = NULL;
3205 struct smb_vol *vol_info;
3206 char username[MAX_USERNAME_SIZE + 1];
3207
3208 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
3209 if (vol_info == NULL) {
3210 tcon = ERR_PTR(-ENOMEM);
3211 goto out;
3212 }
3213
3214 snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
3215 vol_info->username = username;
3216 vol_info->local_nls = cifs_sb->local_nls;
3217 vol_info->linux_uid = fsuid;
3218 vol_info->cred_uid = fsuid;
3219 vol_info->UNC = master_tcon->treeName;
3220 vol_info->retry = master_tcon->retry;
3221 vol_info->nocase = master_tcon->nocase;
3222 vol_info->local_lease = master_tcon->local_lease;
3223 vol_info->no_linux_ext = !master_tcon->unix_ext;
3224
3225 /* FIXME: allow for other secFlg settings */
3226 vol_info->secFlg = CIFSSEC_MUST_KRB5;
3227
3228 /* get a reference for the same TCP session */
3229 spin_lock(&cifs_tcp_ses_lock);
3230 ++master_tcon->ses->server->srv_count;
3231 spin_unlock(&cifs_tcp_ses_lock);
3232
3233 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
3234 if (IS_ERR(ses)) {
3235 tcon = (struct cifsTconInfo *)ses;
3236 cifs_put_tcp_session(master_tcon->ses->server);
3237 goto out;
3238 }
3239
3240 tcon = cifs_get_tcon(ses, vol_info);
3241 if (IS_ERR(tcon)) {
3242 cifs_put_smb_ses(ses);
3243 goto out;
3244 }
3245
3246 if (ses->capabilities & CAP_UNIX)
3247 reset_cifs_unix_caps(0, tcon, NULL, vol_info);
3248out:
3249 kfree(vol_info);
3250
3251 return tcon;
3252}
3253
3254static inline struct tcon_link *
3255cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3256{
3257 return cifs_sb->master_tlink;
3258}
3259
3260struct cifsTconInfo *
3261cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3262{
3263 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3264}
3265
3266static int
3267cifs_sb_tcon_pending_wait(void *unused)
3268{
3269 schedule();
3270 return signal_pending(current) ? -ERESTARTSYS : 0;
3271}
3272
3273/* find and return a tlink with given uid */
3274static struct tcon_link *
3275tlink_rb_search(struct rb_root *root, uid_t uid)
3276{
3277 struct rb_node *node = root->rb_node;
3278 struct tcon_link *tlink;
3279
3280 while (node) {
3281 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3282
3283 if (tlink->tl_uid > uid)
3284 node = node->rb_left;
3285 else if (tlink->tl_uid < uid)
3286 node = node->rb_right;
3287 else
3288 return tlink;
3289 }
3290 return NULL;
3291}
3292
3293/* insert a tcon_link into the tree */
3294static void
3295tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
3296{
3297 struct rb_node **new = &(root->rb_node), *parent = NULL;
3298 struct tcon_link *tlink;
3299
3300 while (*new) {
3301 tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
3302 parent = *new;
3303
3304 if (tlink->tl_uid > new_tlink->tl_uid)
3305 new = &((*new)->rb_left);
3306 else
3307 new = &((*new)->rb_right);
3308 }
3309
3310 rb_link_node(&new_tlink->tl_rbnode, parent, new);
3311 rb_insert_color(&new_tlink->tl_rbnode, root);
3312}
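
tlink_rb_search()/tlink_rb_insert() above are a conventional uid-keyed rbtree pair; the rb_entry/rb_link_node/rb_insert_color calls only add rebalancing. The same ordering logic as a plain (unbalanced) binary search tree, which may be easier to read in isolation:

/* Standalone BST version of the uid-keyed search and insert above. */
#include <stddef.h>
#include <sys/types.h>

struct tl {
	struct tl *left, *right;
	uid_t uid;
};

struct tl *tl_search(struct tl *node, uid_t uid)
{
	while (node) {
		if (node->uid > uid)
			node = node->left;
		else if (node->uid < uid)
			node = node->right;
		else
			return node;
	}
	return NULL;
}

void tl_insert(struct tl **root, struct tl *new)
{
	while (*root) {
		if ((*root)->uid > new->uid)
			root = &(*root)->left;
		else
			root = &(*root)->right;
	}
	new->left = new->right = NULL;
	*root = new;
}
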
3313
3314/*
3315 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3316 * current task.
3317 *
3318 * If the superblock doesn't refer to a multiuser mount, then just return
3319 * the master tcon for the mount.
3320 *
3321 * First, search the rbtree for an existing tcon for this fsuid. If one
 3322 * exists, check whether it's pending construction. If it is, wait
3323 * for construction to complete. Once it's no longer pending, check to see if
3324 * it failed and either return an error or retry construction, depending on
3325 * the timeout.
3326 *
3327 * If one doesn't exist then insert a new tcon_link struct into the tree and
3328 * try to construct a new one.
3329 */
3330struct tcon_link *
3331cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3332{
3333 int ret;
3334 uid_t fsuid = current_fsuid();
3335 struct tcon_link *tlink, *newtlink;
3336
3337 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3338 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3339
3340 spin_lock(&cifs_sb->tlink_tree_lock);
3341 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3342 if (tlink)
3343 cifs_get_tlink(tlink);
3344 spin_unlock(&cifs_sb->tlink_tree_lock);
3345
3346 if (tlink == NULL) {
3347 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3348 if (newtlink == NULL)
3349 return ERR_PTR(-ENOMEM);
3350 newtlink->tl_uid = fsuid;
3351 newtlink->tl_tcon = ERR_PTR(-EACCES);
3352 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3353 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3354 cifs_get_tlink(newtlink);
3355
3356 spin_lock(&cifs_sb->tlink_tree_lock);
3357 /* was one inserted after previous search? */
3358 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3359 if (tlink) {
3360 cifs_get_tlink(tlink);
3361 spin_unlock(&cifs_sb->tlink_tree_lock);
3362 kfree(newtlink);
3363 goto wait_for_construction;
3364 }
3365 tlink = newtlink;
3366 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
3367 spin_unlock(&cifs_sb->tlink_tree_lock);
3368 } else {
3369wait_for_construction:
3370 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
3371 cifs_sb_tcon_pending_wait,
3372 TASK_INTERRUPTIBLE);
3373 if (ret) {
3374 cifs_put_tlink(tlink);
3375 return ERR_PTR(ret);
3376 }
3377
3378 /* if it's good, return it */
3379 if (!IS_ERR(tlink->tl_tcon))
3380 return tlink;
3381
3382 /* return error if we tried this already recently */
3383 if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
3384 cifs_put_tlink(tlink);
3385 return ERR_PTR(-EACCES);
3386 }
3387
3388 if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
3389 goto wait_for_construction;
3390 }
3391
3392 tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
3393 clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
3394 wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
3395
3396 if (IS_ERR(tlink->tl_tcon)) {
3397 cifs_put_tlink(tlink);
3398 return ERR_PTR(-EACCES);
3399 }
3400
3401 return tlink;
3402}
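
The TCON_LINK_PENDING handling in cifs_sb_tlink() is a classic construct-once gate: the thread that wins test_and_set_bit builds the tcon while everyone else sleeps in wait_on_bit until the bit clears. A userspace analogue using a mutex and condition variable instead of the kernel's bit waitqueue (build() may fail and return NULL, mirroring the -EACCES path):

/* One thread builds the shared object; the rest wait for "pending"
 * to clear and then read the result. */
#include <pthread.h>
#include <stdbool.h>

struct slot {
	pthread_mutex_t lock;
	pthread_cond_t done;
	bool pending;
	void *obj;		/* NULL until constructed (or on failure) */
};

void *get_or_build(struct slot *s, void *(*build)(void))
{
	void *obj;

	pthread_mutex_lock(&s->lock);
	if (s->obj == NULL && !s->pending) {
		s->pending = true;	/* we won: construct it ourselves */
		pthread_mutex_unlock(&s->lock);
		obj = build();		/* may sleep, like a mount */
		pthread_mutex_lock(&s->lock);
		s->obj = obj;
		s->pending = false;
		pthread_cond_broadcast(&s->done);
	}
	while (s->pending)		/* losers wait for the builder */
		pthread_cond_wait(&s->done, &s->lock);
	obj = s->obj;			/* NULL here mirrors -EACCES */
	pthread_mutex_unlock(&s->lock);
	return obj;
}
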
3403
3404/*
 3405 * periodic workqueue job that scans a superblock's tlink_tree and closes
 3406 * out idle tcons.
3407 */
3408static void
3409cifs_prune_tlinks(struct work_struct *work)
3410{
3411 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3412 prune_tlinks.work);
3413 struct rb_root *root = &cifs_sb->tlink_tree;
3414 struct rb_node *node = rb_first(root);
3415 struct rb_node *tmp;
3416 struct tcon_link *tlink;
3417
3418 /*
 3419 * Because we drop the spinlock in the loop in order to put the tlink,
 3420 * the walk is not guarded against removal of links from the tree. The only
3421 * places that remove entries from the tree are this function and
3422 * umounts. Because this function is non-reentrant and is canceled
3423 * before umount can proceed, this is safe.
3424 */
3425 spin_lock(&cifs_sb->tlink_tree_lock);
3426 node = rb_first(root);
3427 while (node != NULL) {
3428 tmp = node;
3429 node = rb_next(tmp);
3430 tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
3431
3432 if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
3433 atomic_read(&tlink->tl_count) != 0 ||
3434 time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
3435 continue;
3436
3437 cifs_get_tlink(tlink);
3438 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3439 rb_erase(tmp, root);
3440
3441 spin_unlock(&cifs_sb->tlink_tree_lock);
3442 cifs_put_tlink(tlink);
3443 spin_lock(&cifs_sb->tlink_tree_lock);
3444 }
3445 spin_unlock(&cifs_sb->tlink_tree_lock);
3446
3447 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3448 TLINK_IDLE_EXPIRE);
3449}
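
cifs_prune_tlinks() requeues itself every TLINK_IDLE_EXPIRE, so each pass only has to decide which links are reclaimable: not the master link, no outstanding references, and idle past the expiry window. That predicate in isolation, with seconds standing in for jiffies (the 600 s window is an assumption for the sketch, not a value taken from this patch):

/* The prune predicate from the loop above, in isolation. */
#include <stdbool.h>
#include <time.h>

#define IDLE_EXPIRE 600		/* assumed window */

struct tl_state {
	bool is_master;		/* TCON_LINK_MASTER analogue */
	int refcount;		/* tl_count analogue */
	time_t idle_since;	/* tl_time analogue */
};

bool tl_reclaimable(const struct tl_state *t, time_t now)
{
	return !t->is_master &&
	       t->refcount == 0 &&
	       now - t->idle_since >= IDLE_EXPIRE;
}
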
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f9ed0751cc12..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
54 int dfsplen; 54 int dfsplen;
55 char *full_path; 55 char *full_path;
56 char dirsep; 56 char dirsep;
57 struct cifs_sb_info *cifs_sb; 57 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
58 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
58 59
59 if (direntry == NULL) 60 if (direntry == NULL)
60 return NULL; /* not much we can do if dentry is freed and 61 return NULL; /* not much we can do if dentry is freed and
61 we need to reopen the file after it was closed implicitly 62 we need to reopen the file after it was closed implicitly
62 when the server crashed */ 63 when the server crashed */
63 64
64 cifs_sb = CIFS_SB(direntry->d_sb);
65 dirsep = CIFS_DIR_SEP(cifs_sb); 65 dirsep = CIFS_DIR_SEP(cifs_sb);
66 pplen = cifs_sb->prepathlen; 66 pplen = cifs_sb->prepathlen;
67 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 67 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
68 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 68 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
69 else 69 else
70 dfsplen = 0; 70 dfsplen = 0;
71cifs_bp_rename_retry: 71cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
117 /* BB test paths to Windows with '/' in the midst of prepath */ 117 /* BB test paths to Windows with '/' in the midst of prepath */
118 118
119 if (dfsplen) { 119 if (dfsplen) {
120 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 120 strncpy(full_path, tcon->treeName, dfsplen);
121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { 121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
122 int i; 122 int i;
123 for (i = 0; i < dfsplen; i++) { 123 for (i = 0; i < dfsplen; i++) {
@@ -130,146 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags)
136{
137 int oplock = 0;
138 struct cifsFileInfo *pCifsFile;
139 struct cifsInodeInfo *pCifsInode;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
141
142 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
143 if (pCifsFile == NULL)
144 return pCifsFile;
145
146 if (oplockEnabled)
147 oplock = REQ_OPLOCK;
148
149 pCifsFile->netfid = fileHandle;
150 pCifsFile->pid = current->tgid;
151 pCifsFile->pInode = igrab(newinode);
152 pCifsFile->mnt = mnt;
153 pCifsFile->pfile = file;
154 pCifsFile->invalidHandle = false;
155 pCifsFile->closePend = false;
156 mutex_init(&pCifsFile->fh_mutex);
157 mutex_init(&pCifsFile->lock_mutex);
158 INIT_LIST_HEAD(&pCifsFile->llist);
159 atomic_set(&pCifsFile->count, 1);
160 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
161
162 write_lock(&GlobalSMBSeslock);
163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
164 pCifsInode = CIFS_I(newinode);
165 if (pCifsInode) {
166 /* if readable file instance put first in list*/
167 if (oflags & FMODE_READ)
168 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
169 else
170 list_add_tail(&pCifsFile->flist,
171 &pCifsInode->openFileList);
172
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true;
179 }
180 write_unlock(&GlobalSMBSeslock);
181
182 file->private_data = pCifsFile;
183
184 return pCifsFile;
185}
186
187int cifs_posix_open(char *full_path, struct inode **pinode,
188 struct super_block *sb, int mode, int oflags,
189 __u32 *poplock, __u16 *pnetfid, int xid)
190{
191 int rc;
192 FILE_UNIX_BASIC_INFO *presp_data;
193 __u32 posix_flags = 0;
194 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
195 struct cifs_fattr fattr;
196
197 cFYI(1, "posix open %s", full_path);
198
199 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
200 if (presp_data == NULL)
201 return -ENOMEM;
202
203/* So far cifs posix extensions can only map the following flags.
204 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
205 so far we do not seem to need them, and we can treat them as local only */
206 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
207 (FMODE_READ | FMODE_WRITE))
208 posix_flags = SMB_O_RDWR;
209 else if (oflags & FMODE_READ)
210 posix_flags = SMB_O_RDONLY;
211 else if (oflags & FMODE_WRITE)
212 posix_flags = SMB_O_WRONLY;
213 if (oflags & O_CREAT)
214 posix_flags |= SMB_O_CREAT;
215 if (oflags & O_EXCL)
216 posix_flags |= SMB_O_EXCL;
217 if (oflags & O_TRUNC)
218 posix_flags |= SMB_O_TRUNC;
219 /* be safe and imply O_SYNC for O_DSYNC */
220 if (oflags & O_DSYNC)
221 posix_flags |= SMB_O_SYNC;
222 if (oflags & O_DIRECTORY)
223 posix_flags |= SMB_O_DIRECTORY;
224 if (oflags & O_NOFOLLOW)
225 posix_flags |= SMB_O_NOFOLLOW;
226 if (oflags & O_DIRECT)
227 posix_flags |= SMB_O_DIRECT;
228
229 mode &= ~current_umask();
230 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
231 pnetfid, presp_data, poplock, full_path,
232 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
233 CIFS_MOUNT_MAP_SPECIAL_CHR);
234 if (rc)
235 goto posix_open_ret;
236
237 if (presp_data->Type == cpu_to_le32(-1))
238 goto posix_open_ret; /* open ok, caller does qpathinfo */
239
240 if (!pinode)
241 goto posix_open_ret; /* caller does not need info */
242
243 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
244
245 /* get new inode and set it up */
246 if (*pinode == NULL) {
247 cifs_fill_uniqueid(sb, &fattr);
248 *pinode = cifs_iget(sb, &fattr);
249 if (!*pinode) {
250 rc = -ENOMEM;
251 goto posix_open_ret;
252 }
253 } else {
254 cifs_fattr_to_inode(*pinode, &fattr);
255 }
256
257posix_open_ret:
258 kfree(presp_data);
259 return rc;
260}
261
262static void setup_cifs_dentry(struct cifsTconInfo *tcon,
263 struct dentry *direntry,
264 struct inode *newinode)
265{
266 if (tcon->nocase)
267 direntry->d_op = &cifs_ci_dentry_ops;
268 else
269 direntry->d_op = &cifs_dentry_ops;
270 d_instantiate(direntry, newinode);
271}
272
273/* Inode operations in similar order to how they appear in Linux file fs.h */ 133/* Inode operations in similar order to how they appear in Linux file fs.h */
274 134
275int 135int
@@ -291,6 +151,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
291 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 151 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
292 __u16 fileHandle; 152 __u16 fileHandle;
293 struct cifs_sb_info *cifs_sb; 153 struct cifs_sb_info *cifs_sb;
154 struct tcon_link *tlink;
294 struct cifsTconInfo *tcon; 155 struct cifsTconInfo *tcon;
295 char *full_path = NULL; 156 char *full_path = NULL;
296 FILE_ALL_INFO *buf = NULL; 157 FILE_ALL_INFO *buf = NULL;
@@ -300,21 +161,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
300 xid = GetXid(); 161 xid = GetXid();
301 162
302 cifs_sb = CIFS_SB(inode->i_sb); 163 cifs_sb = CIFS_SB(inode->i_sb);
303 tcon = cifs_sb->tcon; 164 tlink = cifs_sb_tlink(cifs_sb);
304 165 if (IS_ERR(tlink)) {
305 full_path = build_path_from_dentry(direntry); 166 FreeXid(xid);
306 if (full_path == NULL) { 167 return PTR_ERR(tlink);
307 rc = -ENOMEM;
308 goto cifs_create_out;
309 } 168 }
169 tcon = tlink_tcon(tlink);
310 170
311 if (oplockEnabled) 171 if (oplockEnabled)
312 oplock = REQ_OPLOCK; 172 oplock = REQ_OPLOCK;
313 173
314 if (nd && (nd->flags & LOOKUP_OPEN)) 174 if (nd && (nd->flags & LOOKUP_OPEN))
315 oflags = nd->intent.open.flags; 175 oflags = nd->intent.open.file->f_flags;
316 else 176 else
317 oflags = FMODE_READ | SMB_O_CREAT; 177 oflags = O_RDONLY | O_CREAT;
178
179 full_path = build_path_from_dentry(direntry);
180 if (full_path == NULL) {
181 rc = -ENOMEM;
182 goto cifs_create_out;
183 }
318 184
319 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 185 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
320 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 186 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -344,9 +210,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
344 /* if the file is going to stay open, then we 210 /* if the file is going to stay open, then we
345 need to set the desired access properly */ 211 need to set the desired access properly */
346 desiredAccess = 0; 212 desiredAccess = 0;
347 if (oflags & FMODE_READ) 213 if (OPEN_FMODE(oflags) & FMODE_READ)
348 desiredAccess |= GENERIC_READ; /* is this too little? */ 214 desiredAccess |= GENERIC_READ; /* is this too little? */
349 if (oflags & FMODE_WRITE) 215 if (OPEN_FMODE(oflags) & FMODE_WRITE)
350 desiredAccess |= GENERIC_WRITE; 216 desiredAccess |= GENERIC_WRITE;
351 217
352 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 218 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -375,7 +241,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
375 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 241 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
376 create_options |= CREATE_OPTION_READONLY; 242 create_options |= CREATE_OPTION_READONLY;
377 243
378 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 244 if (tcon->ses->capabilities & CAP_NT_SMBS)
379 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 245 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
380 desiredAccess, create_options, 246 desiredAccess, create_options,
381 &fileHandle, &oplock, buf, cifs_sb->local_nls, 247 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -416,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
416 args.uid = NO_CHANGE_64; 282 args.uid = NO_CHANGE_64;
417 args.gid = NO_CHANGE_64; 283 args.gid = NO_CHANGE_64;
418 } 284 }
419 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 285 CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
420 cifs_sb->local_nls, 286 current->tgid);
421 cifs_sb->mnt_cifs_flags &
422 CIFS_MOUNT_MAP_SPECIAL_CHR);
423 } else { 287 } else {
424 /* BB implement mode setting via Windows security 288 /* BB implement mode setting via Windows security
425 descriptors e.g. */ 289 descriptors e.g. */
@@ -452,7 +316,7 @@ cifs_create_get_file_info:
452 316
453cifs_create_set_dentry: 317cifs_create_set_dentry:
454 if (rc == 0) 318 if (rc == 0)
455 setup_cifs_dentry(tcon, direntry, newinode); 319 d_instantiate(direntry, newinode);
456 else 320 else
457 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); 321 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
458 322
@@ -467,8 +331,7 @@ cifs_create_set_dentry:
467 goto cifs_create_out; 331 goto cifs_create_out;
468 } 332 }
469 333
470 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, 334 pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
471 nd->path.mnt, oflags);
472 if (pfile_info == NULL) { 335 if (pfile_info == NULL) {
473 fput(filp); 336 fput(filp);
474 CIFSSMBClose(xid, tcon, fileHandle); 337 CIFSSMBClose(xid, tcon, fileHandle);
@@ -481,6 +344,7 @@ cifs_create_set_dentry:
481cifs_create_out: 344cifs_create_out:
482 kfree(buf); 345 kfree(buf);
483 kfree(full_path); 346 kfree(full_path);
347 cifs_put_tlink(tlink);
484 FreeXid(xid); 348 FreeXid(xid);
485 return rc; 349 return rc;
486} 350}
@@ -491,6 +355,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
491 int rc = -EPERM; 355 int rc = -EPERM;
492 int xid; 356 int xid;
493 struct cifs_sb_info *cifs_sb; 357 struct cifs_sb_info *cifs_sb;
358 struct tcon_link *tlink;
494 struct cifsTconInfo *pTcon; 359 struct cifsTconInfo *pTcon;
495 char *full_path = NULL; 360 char *full_path = NULL;
496 struct inode *newinode = NULL; 361 struct inode *newinode = NULL;
@@ -503,10 +368,14 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
503 if (!old_valid_dev(device_number)) 368 if (!old_valid_dev(device_number))
504 return -EINVAL; 369 return -EINVAL;
505 370
506 xid = GetXid();
507
508 cifs_sb = CIFS_SB(inode->i_sb); 371 cifs_sb = CIFS_SB(inode->i_sb);
509 pTcon = cifs_sb->tcon; 372 tlink = cifs_sb_tlink(cifs_sb);
373 if (IS_ERR(tlink))
374 return PTR_ERR(tlink);
375
376 pTcon = tlink_tcon(tlink);
377
378 xid = GetXid();
510 379
511 full_path = build_path_from_dentry(direntry); 380 full_path = build_path_from_dentry(direntry);
512 if (full_path == NULL) { 381 if (full_path == NULL) {
@@ -538,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
538 407
539 rc = cifs_get_inode_info_unix(&newinode, full_path, 408 rc = cifs_get_inode_info_unix(&newinode, full_path,
540 inode->i_sb, xid); 409 inode->i_sb, xid);
541 if (pTcon->nocase)
542 direntry->d_op = &cifs_ci_dentry_ops;
543 else
544 direntry->d_op = &cifs_dentry_ops;
545 410
546 if (rc == 0) 411 if (rc == 0)
547 d_instantiate(direntry, newinode); 412 d_instantiate(direntry, newinode);
@@ -606,6 +471,7 @@ mknod_out:
606 kfree(full_path); 471 kfree(full_path);
607 kfree(buf); 472 kfree(buf);
608 FreeXid(xid); 473 FreeXid(xid);
474 cifs_put_tlink(tlink);
609 return rc; 475 return rc;
610} 476}
611 477
@@ -619,6 +485,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
619 __u16 fileHandle = 0; 485 __u16 fileHandle = 0;
620 bool posix_open = false; 486 bool posix_open = false;
621 struct cifs_sb_info *cifs_sb; 487 struct cifs_sb_info *cifs_sb;
488 struct tcon_link *tlink;
622 struct cifsTconInfo *pTcon; 489 struct cifsTconInfo *pTcon;
623 struct cifsFileInfo *cfile; 490 struct cifsFileInfo *cfile;
624 struct inode *newInode = NULL; 491 struct inode *newInode = NULL;
@@ -633,7 +500,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
633 /* check whether path exists */ 500 /* check whether path exists */
634 501
635 cifs_sb = CIFS_SB(parent_dir_inode->i_sb); 502 cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
636 pTcon = cifs_sb->tcon; 503 tlink = cifs_sb_tlink(cifs_sb);
504 if (IS_ERR(tlink)) {
505 FreeXid(xid);
506 return (struct dentry *)tlink;
507 }
508 pTcon = tlink_tcon(tlink);
637 509
638 /* 510 /*
639 * Don't allow the separator character in a path component. 511 * Don't allow the separator character in a path component.
@@ -644,8 +516,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
644 for (i = 0; i < direntry->d_name.len; i++) 516 for (i = 0; i < direntry->d_name.len; i++)
645 if (direntry->d_name.name[i] == '\\') { 517 if (direntry->d_name.name[i] == '\\') {
646 cFYI(1, "Invalid file name"); 518 cFYI(1, "Invalid file name");
647 FreeXid(xid); 519 rc = -EINVAL;
648 return ERR_PTR(-EINVAL); 520 goto lookup_out;
649 } 521 }
650 } 522 }
651 523
@@ -655,7 +527,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
655 */ 527 */
656 if (nd && (nd->flags & LOOKUP_EXCL)) { 528 if (nd && (nd->flags & LOOKUP_EXCL)) {
657 d_instantiate(direntry, NULL); 529 d_instantiate(direntry, NULL);
658 return NULL; 530 rc = 0;
531 goto lookup_out;
659 } 532 }
660 533
661 /* can not grab the rename sem here since it would 534 /* can not grab the rename sem here since it would
@@ -663,8 +536,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
663 in which we already have the sb rename sem */ 536 in which we already have the sb rename sem */
664 full_path = build_path_from_dentry(direntry); 537 full_path = build_path_from_dentry(direntry);
665 if (full_path == NULL) { 538 if (full_path == NULL) {
666 FreeXid(xid); 539 rc = -ENOMEM;
667 return ERR_PTR(-ENOMEM); 540 goto lookup_out;
668 } 541 }
669 542
670 if (direntry->d_inode != NULL) { 543 if (direntry->d_inode != NULL) {
@@ -687,11 +560,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
687 if (pTcon->unix_ext) { 560 if (pTcon->unix_ext) {
688 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 561 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
689 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 562 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
690 (nd->intent.open.flags & O_CREAT)) { 563 (nd->intent.open.file->f_flags & O_CREAT)) {
691 rc = cifs_posix_open(full_path, &newInode, 564 rc = cifs_posix_open(full_path, &newInode,
692 parent_dir_inode->i_sb, 565 parent_dir_inode->i_sb,
693 nd->intent.open.create_mode, 566 nd->intent.open.create_mode,
694 nd->intent.open.flags, &oplock, 567 nd->intent.open.file->f_flags, &oplock,
695 &fileHandle, xid); 568 &fileHandle, xid);
696 /* 569 /*
697 * The check below works around a bug in POSIX 570 * The check below works around a bug in POSIX
@@ -713,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
713 parent_dir_inode->i_sb, xid, NULL); 586 parent_dir_inode->i_sb, xid, NULL);
714 587
715 if ((rc == 0) && (newInode != NULL)) { 588 if ((rc == 0) && (newInode != NULL)) {
716 if (pTcon->nocase)
717 direntry->d_op = &cifs_ci_dentry_ops;
718 else
719 direntry->d_op = &cifs_dentry_ops;
720 d_add(direntry, newInode); 589 d_add(direntry, newInode);
721 if (posix_open) { 590 if (posix_open) {
722 filp = lookup_instantiate_filp(nd, direntry, 591 filp = lookup_instantiate_filp(nd, direntry,
@@ -727,9 +596,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
727 goto lookup_out; 596 goto lookup_out;
728 } 597 }
729 598
730 cfile = cifs_new_fileinfo(newInode, fileHandle, filp, 599 cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
731 nd->path.mnt, 600 oplock);
732 nd->intent.open.flags);
733 if (cfile == NULL) { 601 if (cfile == NULL) {
734 fput(filp); 602 fput(filp);
735 CIFSSMBClose(xid, pTcon, fileHandle); 603 CIFSSMBClose(xid, pTcon, fileHandle);
@@ -744,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
744 } else if (rc == -ENOENT) { 612 } else if (rc == -ENOENT) {
745 rc = 0; 613 rc = 0;
746 direntry->d_time = jiffies; 614 direntry->d_time = jiffies;
747 if (pTcon->nocase)
748 direntry->d_op = &cifs_ci_dentry_ops;
749 else
750 direntry->d_op = &cifs_dentry_ops;
751 d_add(direntry, NULL); 615 d_add(direntry, NULL);
752 /* if it was once a directory (but how can we tell?) we could do 616 /* if it was once a directory (but how can we tell?) we could do
753 shrink_dcache_parent(direntry); */ 617 shrink_dcache_parent(direntry); */
@@ -759,6 +623,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
759 623
760lookup_out: 624lookup_out:
761 kfree(full_path); 625 kfree(full_path);
626 cifs_put_tlink(tlink);
762 FreeXid(xid); 627 FreeXid(xid);
763 return ERR_PTR(rc); 628 return ERR_PTR(rc);
764} 629}
@@ -766,22 +631,37 @@ lookup_out:
766static int 631static int
767cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) 632cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
768{ 633{
769 int isValid = 1; 634 if (nd->flags & LOOKUP_RCU)
635 return -ECHILD;
770 636
771 if (direntry->d_inode) { 637 if (direntry->d_inode) {
772 if (cifs_revalidate_dentry(direntry)) 638 if (cifs_revalidate_dentry(direntry))
773 return 0; 639 return 0;
774 } else { 640 else
775 cFYI(1, "neg dentry 0x%p name = %s", 641 return 1;
776 direntry, direntry->d_name.name);
777 if (time_after(jiffies, direntry->d_time + HZ) ||
778 !lookupCacheEnabled) {
779 d_drop(direntry);
780 isValid = 0;
781 }
782 } 642 }
783 643
784 return isValid; 644 /*
645 * This may be nfsd (or something), anyway, we can't see the
646 * intent of this. So, since this can be for creation, drop it.
647 */
648 if (!nd)
649 return 0;
650
651 /*
652 * Drop the negative dentry, in order to make sure to use the
653 * case sensitive name which is specified by user if this is
654 * for creation.
655 */
656 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
657 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
658 return 0;
659 }
660
661 if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
662 return 0;
663
664 return 1;
785} 665}
786 666
787/* static int cifs_d_delete(struct dentry *direntry) 667/* static int cifs_d_delete(struct dentry *direntry)
@@ -795,12 +675,14 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
795 675
796const struct dentry_operations cifs_dentry_ops = { 676const struct dentry_operations cifs_dentry_ops = {
797 .d_revalidate = cifs_d_revalidate, 677 .d_revalidate = cifs_d_revalidate,
678 .d_automount = cifs_dfs_d_automount,
798/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ 679/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
799}; 680};
800 681
801static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) 682static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
683 struct qstr *q)
802{ 684{
803 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 685 struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
804 unsigned long hash; 686 unsigned long hash;
805 int i; 687 int i;
806 688
@@ -813,21 +695,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
813 return 0; 695 return 0;
814} 696}
815 697
816static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, 698static int cifs_ci_compare(const struct dentry *parent,
817 struct qstr *b) 699 const struct inode *pinode,
700 const struct dentry *dentry, const struct inode *inode,
701 unsigned int len, const char *str, const struct qstr *name)
818{ 702{
819 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 703 struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
820 704
821 if ((a->len == b->len) && 705 if ((name->len == len) &&
822 (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) { 706 (nls_strnicmp(codepage, name->name, str, len) == 0))
823 /*
824 * To preserve case, don't let an existing negative dentry's
825 * case take precedence. If a is not a negative dentry, this
826 * should have no side effects
827 */
828 memcpy((void *)a->name, b->name, a->len);
829 return 0; 707 return 0;
830 }
831 return 1; 708 return 1;
832} 709}
833 710
@@ -835,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
835 .d_revalidate = cifs_d_revalidate, 712 .d_revalidate = cifs_d_revalidate,
836 .d_hash = cifs_ci_hash, 713 .d_hash = cifs_ci_hash,
837 .d_compare = cifs_ci_compare, 714 .d_compare = cifs_ci_compare,
715 .d_automount = cifs_dfs_d_automount,
838}; 716};
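
The new d_hash/d_compare prototypes take const dentries plus an explicit (len, str) pair because they can now run during RCU path walk, where the dentry must not be written; that is also why the old memcpy() case-preservation hack had to go. An ASCII-only standalone compare in the new no-side-effect style (the real cifs_ci_compare() goes through the mount's nls codepage instead):

/* Pure function of (len, str) and the stored name; no writes. */
#include <ctype.h>
#include <stddef.h>

static int ci_compare(size_t len, const char *str,
		      size_t name_len, const char *name)
{
	size_t i;

	if (len != name_len)
		return 1;		/* 1 means "no match" in dcache */
	for (i = 0; i < len; i++)
		if (tolower((unsigned char)str[i]) !=
		    tolower((unsigned char)name[i]))
			return 1;
	return 0;
}
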
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad3..548f06230a6d 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
66 /* Search for server name delimiter */ 66 /* Search for server name delimiter */
67 sep = memchr(hostname, '\\', len); 67 sep = memchr(hostname, '\\', len);
68 if (sep) 68 if (sep)
69 len = sep - unc; 69 len = sep - hostname;
70 else 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s", 71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc); 72 __func__, unc);
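
The one-line dns_resolve fix matters because hostname was advanced two bytes past unc to skip the leading backslashes; measuring the name from unc therefore overcounted by two and handed the resolver a mangled hostname. A quick runnable check of the arithmetic:

/* hostname == unc + 2, so "sep - unc" overstates the length by 2. */
#include <assert.h>
#include <string.h>

int main(void)
{
	const char *unc = "\\\\server\\share";
	const char *hostname = unc + 2;		/* past the leading "\\" */
	const char *sep = memchr(hostname, '\\', strlen(hostname));

	assert(sep - unc == 8);			/* old, wrong length */
	assert(sep - hostname == 6);		/* strlen("server") */
	return 0;
}
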
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de748c652d11..e964b1cd5dd0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -60,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
60 FILE_READ_DATA); 60 FILE_READ_DATA);
61} 61}
62 62
63static inline fmode_t cifs_posix_convert_flags(unsigned int flags) 63static u32 cifs_posix_convert_flags(unsigned int flags)
64{ 64{
65 fmode_t posix_flags = 0; 65 u32 posix_flags = 0;
66 66
67 if ((flags & O_ACCMODE) == O_RDONLY) 67 if ((flags & O_ACCMODE) == O_RDONLY)
68 posix_flags = FMODE_READ; 68 posix_flags = SMB_O_RDONLY;
69 else if ((flags & O_ACCMODE) == O_WRONLY) 69 else if ((flags & O_ACCMODE) == O_WRONLY)
70 posix_flags = FMODE_WRITE; 70 posix_flags = SMB_O_WRONLY;
71 else if ((flags & O_ACCMODE) == O_RDWR) { 71 else if ((flags & O_ACCMODE) == O_RDWR)
72 /* GENERIC_ALL is too much permission to request 72 posix_flags = SMB_O_RDWR;
73 can cause unnecessary access denied on create */ 73
74 /* return GENERIC_ALL; */ 74 if (flags & O_CREAT)
75 posix_flags = FMODE_READ | FMODE_WRITE; 75 posix_flags |= SMB_O_CREAT;
76 } 76 if (flags & O_EXCL)
77 /* can not map O_CREAT or O_EXCL or O_TRUNC flags when 77 posix_flags |= SMB_O_EXCL;
78 reopening a file. They had their effect on the original open */ 78 if (flags & O_TRUNC)
79 if (flags & O_APPEND) 79 posix_flags |= SMB_O_TRUNC;
80 posix_flags |= (fmode_t)O_APPEND; 80 /* be safe and imply O_SYNC for O_DSYNC */
81 if (flags & O_DSYNC) 81 if (flags & O_DSYNC)
82 posix_flags |= (fmode_t)O_DSYNC; 82 posix_flags |= SMB_O_SYNC;
83 if (flags & __O_SYNC)
84 posix_flags |= (fmode_t)__O_SYNC;
85 if (flags & O_DIRECTORY) 83 if (flags & O_DIRECTORY)
86 posix_flags |= (fmode_t)O_DIRECTORY; 84 posix_flags |= SMB_O_DIRECTORY;
87 if (flags & O_NOFOLLOW) 85 if (flags & O_NOFOLLOW)
88 posix_flags |= (fmode_t)O_NOFOLLOW; 86 posix_flags |= SMB_O_NOFOLLOW;
89 if (flags & O_DIRECT) 87 if (flags & O_DIRECT)
90 posix_flags |= (fmode_t)O_DIRECT; 88 posix_flags |= SMB_O_DIRECT;
91 89
92 return posix_flags; 90 return posix_flags;
93} 91}
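
The rewritten helper returns SMB_O_* wire bits in a u32 instead of overloading fmode_t, and it maps O_CREAT/O_EXCL/O_TRUNC again now that it serves initial opens too. A userspace sketch of the mapping; the SMB_O_* values below are placeholders, not the real constants from cifspdu.h:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>

#define SMB_O_RDONLY 0x1    /* placeholder values */
#define SMB_O_WRONLY 0x2
#define SMB_O_RDWR   0x4
#define SMB_O_CREAT  0x10
#define SMB_O_EXCL   0x20
#define SMB_O_TRUNC  0x40
#define SMB_O_SYNC   0x80

static uint32_t convert_flags(unsigned int flags)
{
        uint32_t out = 0;

        if ((flags & O_ACCMODE) == O_RDONLY)
                out = SMB_O_RDONLY;
        else if ((flags & O_ACCMODE) == O_WRONLY)
                out = SMB_O_WRONLY;
        else if ((flags & O_ACCMODE) == O_RDWR)
                out = SMB_O_RDWR;

        if (flags & O_CREAT)
                out |= SMB_O_CREAT;
        if (flags & O_EXCL)
                out |= SMB_O_EXCL;
        if (flags & O_TRUNC)
                out |= SMB_O_TRUNC;
        /* be safe and imply O_SYNC for O_DSYNC, as the patch does */
        if (flags & O_DSYNC)
                out |= SMB_O_SYNC;
        return out;
}

int main(void)
{
        printf("0x%x\n", (unsigned)convert_flags(O_WRONLY | O_CREAT | O_TRUNC));
        return 0;
}
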
@@ -106,117 +104,239 @@ static inline int cifs_get_disposition(unsigned int flags)
106 return FILE_OPEN; 104 return FILE_OPEN;
107} 105}
108 106
109/* all arguments to this function must be checked for validity in caller */ 107int cifs_posix_open(char *full_path, struct inode **pinode,
110static inline int 108 struct super_block *sb, int mode, unsigned int f_flags,
111cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 109 __u32 *poplock, __u16 *pnetfid, int xid)
112 struct cifsInodeInfo *pCifsInode, __u32 oplock,
113 u16 netfid)
114{ 110{
111 int rc;
112 FILE_UNIX_BASIC_INFO *presp_data;
113 __u32 posix_flags = 0;
114 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
115 struct cifs_fattr fattr;
116 struct tcon_link *tlink;
117 struct cifsTconInfo *tcon;
115 118
116 write_lock(&GlobalSMBSeslock); 119 cFYI(1, "posix open %s", full_path);
117 120
118 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 121 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
119 if (pCifsInode == NULL) { 122 if (presp_data == NULL)
120 write_unlock(&GlobalSMBSeslock); 123 return -ENOMEM;
121 return -EINVAL;
122 }
123 124
124 if (pCifsInode->clientCanCacheRead) { 125 tlink = cifs_sb_tlink(cifs_sb);
125 /* we have the inode open somewhere else 126 if (IS_ERR(tlink)) {
126 no need to discard cache data */ 127 rc = PTR_ERR(tlink);
127 goto psx_client_can_cache; 128 goto posix_open_ret;
128 } 129 }
129 130
130 /* BB FIXME need to fix this check to move it earlier into posix_open 131 tcon = tlink_tcon(tlink);
131 BB fIX following section BB FIXME */ 132 mode &= ~current_umask();
132 133
133 /* if not oplocked, invalidate inode pages if mtime or file 134 posix_flags = cifs_posix_convert_flags(f_flags);
134 size changed */ 135 rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
135/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); 136 poplock, full_path, cifs_sb->local_nls,
136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 137 cifs_sb->mnt_cifs_flags &
137 (file->f_path.dentry->d_inode->i_size == 138 CIFS_MOUNT_MAP_SPECIAL_CHR);
138 (loff_t)le64_to_cpu(buf->EndOfFile))) { 139 cifs_put_tlink(tlink);
139 cFYI(1, "inode unchanged on server"); 140
140 } else { 141 if (rc)
141 if (file->f_path.dentry->d_inode->i_mapping) { 142 goto posix_open_ret;
142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 143
143 if (rc != 0) 144 if (presp_data->Type == cpu_to_le32(-1))
144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 145 goto posix_open_ret; /* open ok, caller does qpathinfo */
146
147 if (!pinode)
148 goto posix_open_ret; /* caller does not need info */
149
150 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
151
152 /* get new inode and set it up */
153 if (*pinode == NULL) {
154 cifs_fill_uniqueid(sb, &fattr);
155 *pinode = cifs_iget(sb, &fattr);
156 if (!*pinode) {
157 rc = -ENOMEM;
158 goto posix_open_ret;
145 } 159 }
146 cFYI(1, "invalidating remote inode since open detected it " 160 } else {
147 "changed"); 161 cifs_fattr_to_inode(*pinode, &fattr);
148 invalidate_remote_inode(file->f_path.dentry->d_inode); 162 }
149 } */ 163
150 164posix_open_ret:
151psx_client_can_cache: 165 kfree(presp_data);
152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 166 return rc;
153 pCifsInode->clientCanCacheAll = true;
154 pCifsInode->clientCanCacheRead = true;
155 cFYI(1, "Exclusive Oplock granted on inode %p",
156 file->f_path.dentry->d_inode);
157 } else if ((oplock & 0xF) == OPLOCK_READ)
158 pCifsInode->clientCanCacheRead = true;
159
160 /* will have to change the unlock if we reenable the
161 filemap_fdatawrite (which does not seem necessary */
162 write_unlock(&GlobalSMBSeslock);
163 return 0;
164} 167}
165 168
166/* all arguments to this function must be checked for validity in caller */ 169static int
167static inline int cifs_open_inode_helper(struct inode *inode, 170cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
168 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 171 struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
169 char *full_path, int xid) 172 __u16 *pnetfid, int xid)
170{ 173{
171 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
172 struct timespec temp;
173 int rc; 174 int rc;
175 int desiredAccess;
176 int disposition;
177 FILE_ALL_INFO *buf;
174 178
175 if (pCifsInode->clientCanCacheRead) { 179 desiredAccess = cifs_convert_flags(f_flags);
176 /* we have the inode open somewhere else
177 no need to discard cache data */
178 goto client_can_cache;
179 }
180 180
181 /* BB need same check in cifs_create too? */ 181/*********************************************************************
182 /* if not oplocked, invalidate inode pages if mtime or file 182 * open flag mapping table:
183 size changed */ 183 *
184 temp = cifs_NTtimeToUnix(buf->LastWriteTime); 184 * POSIX Flag CIFS Disposition
185 if (timespec_equal(&inode->i_mtime, &temp) && 185 * ---------- ----------------
186 (inode->i_size == 186 * O_CREAT FILE_OPEN_IF
187 (loff_t)le64_to_cpu(buf->EndOfFile))) { 187 * O_CREAT | O_EXCL FILE_CREATE
188 cFYI(1, "inode unchanged on server"); 188 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
189 } else { 189 * O_TRUNC FILE_OVERWRITE
190 if (inode->i_mapping) { 190 * none of the above FILE_OPEN
191 /* BB no need to lock inode until after invalidate 191 *
192 since namei code should already have it locked? */ 192 * Note that there is not a direct match between disposition
193 rc = filemap_write_and_wait(inode->i_mapping); 193 * FILE_SUPERSEDE (ie create whether or not file exists although
194 if (rc != 0) 194 * O_CREAT | O_TRUNC is similar but truncates the existing
195 pCifsInode->write_behind_rc = rc; 195 * file rather than creating a new file as FILE_SUPERSEDE does
196 } 196 * (which uses the attributes / metadata passed in on open call)
197 cFYI(1, "invalidating remote inode since open detected it " 197 *?
198 "changed"); 198 *? O_SYNC is a reasonable match to CIFS writethrough flag
199 invalidate_remote_inode(inode); 199 *? and the read write flags match reasonably. O_LARGEFILE
200 } 200 *? is irrelevant because largefile support is always used
201 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
202 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
203 *********************************************************************/
204
205 disposition = cifs_get_disposition(f_flags);
201 206
202client_can_cache: 207 /* BB pass O_SYNC flag through on file attributes .. BB */
203 if (pTcon->unix_ext) 208
209 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
210 if (!buf)
211 return -ENOMEM;
212
213 if (tcon->ses->capabilities & CAP_NT_SMBS)
214 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
215 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
216 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
217 & CIFS_MOUNT_MAP_SPECIAL_CHR);
218 else
219 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
220 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
221 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
222 & CIFS_MOUNT_MAP_SPECIAL_CHR);
223
224 if (rc)
225 goto out;
226
227 if (tcon->unix_ext)
204 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, 228 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
205 xid); 229 xid);
206 else 230 else
207 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 231 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
208 xid, NULL); 232 xid, pnetfid);
209
210 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
211 pCifsInode->clientCanCacheAll = true;
212 pCifsInode->clientCanCacheRead = true;
213 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
214 } else if ((*oplock & 0xF) == OPLOCK_READ)
215 pCifsInode->clientCanCacheRead = true;
216 233
234out:
235 kfree(buf);
217 return rc; 236 return rc;
218} 237}
219 238
239struct cifsFileInfo *
240cifs_new_fileinfo(__u16 fileHandle, struct file *file,
241 struct tcon_link *tlink, __u32 oplock)
242{
243 struct dentry *dentry = file->f_path.dentry;
244 struct inode *inode = dentry->d_inode;
245 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
246 struct cifsFileInfo *pCifsFile;
247
248 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
249 if (pCifsFile == NULL)
250 return pCifsFile;
251
252 pCifsFile->count = 1;
253 pCifsFile->netfid = fileHandle;
254 pCifsFile->pid = current->tgid;
255 pCifsFile->uid = current_fsuid();
256 pCifsFile->dentry = dget(dentry);
257 pCifsFile->f_flags = file->f_flags;
258 pCifsFile->invalidHandle = false;
259 pCifsFile->tlink = cifs_get_tlink(tlink);
260 mutex_init(&pCifsFile->fh_mutex);
261 mutex_init(&pCifsFile->lock_mutex);
262 INIT_LIST_HEAD(&pCifsFile->llist);
263 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
264
265 spin_lock(&cifs_file_list_lock);
266 list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
 267 /* if readable file instance, put it first in the list */
268 if (file->f_mode & FMODE_READ)
269 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
270 else
271 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
272 spin_unlock(&cifs_file_list_lock);
273
274 cifs_set_oplock_level(pCifsInode, oplock);
275
276 file->private_data = pCifsFile;
277 return pCifsFile;
278}
279
280/*
281 * Release a reference on the file private data. This may involve closing
282 * the filehandle out on the server. Must be called without holding
283 * cifs_file_list_lock.
284 */
285void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
286{
287 struct inode *inode = cifs_file->dentry->d_inode;
288 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
289 struct cifsInodeInfo *cifsi = CIFS_I(inode);
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
291 struct cifsLockInfo *li, *tmp;
292
293 spin_lock(&cifs_file_list_lock);
294 if (--cifs_file->count > 0) {
295 spin_unlock(&cifs_file_list_lock);
296 return;
297 }
298
299 /* remove it from the lists */
300 list_del(&cifs_file->flist);
301 list_del(&cifs_file->tlist);
302
303 if (list_empty(&cifsi->openFileList)) {
304 cFYI(1, "closing last open instance for inode %p",
305 cifs_file->dentry->d_inode);
306
 307 /* in strict cache mode we need to invalidate the mapping on the
 308 last close because it may cause an error when we open this file
 309 again and get at least a level II oplock */
310 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
311 CIFS_I(inode)->invalid_mapping = true;
312
313 cifs_set_oplock_level(cifsi, 0);
314 }
315 spin_unlock(&cifs_file_list_lock);
316
317 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
318 int xid, rc;
319
320 xid = GetXid();
321 rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
322 FreeXid(xid);
323 }
324
325 /* Delete any outstanding lock records. We'll lose them when the file
326 * is closed anyway.
327 */
328 mutex_lock(&cifs_file->lock_mutex);
329 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
330 list_del(&li->llist);
331 kfree(li);
332 }
333 mutex_unlock(&cifs_file->lock_mutex);
334
335 cifs_put_tlink(cifs_file->tlink);
336 dput(cifs_file->dentry);
337 kfree(cifs_file);
338}
339
220int cifs_open(struct inode *inode, struct file *file) 340int cifs_open(struct inode *inode, struct file *file)
221{ 341{
222 int rc = -EACCES; 342 int rc = -EACCES;
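
cifs_new_fileinfo and cifsFileInfo_put above centralize the handle lifetime: the creator holds the first reference, gets and puts are serialized by cifs_file_list_lock, and the final put closes the server handle and frees outstanding lock records. The same pattern modeled in plain pthreads (a hypothetical analogue of the kernel code, not a copy of it; build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fileinfo {
        int count;
        pthread_mutex_t *list_lock;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct fileinfo *fileinfo_new(void)
{
        struct fileinfo *f = calloc(1, sizeof(*f));
        if (!f)
                return NULL;
        f->count = 1;               /* creator holds the first reference */
        f->list_lock = &list_lock;
        return f;
}

static void fileinfo_get(struct fileinfo *f)
{
        pthread_mutex_lock(f->list_lock);
        f->count++;
        pthread_mutex_unlock(f->list_lock);
}

static void fileinfo_put(struct fileinfo *f)
{
        pthread_mutex_lock(f->list_lock);
        if (--f->count > 0) {
                pthread_mutex_unlock(f->list_lock);
                return;
        }
        pthread_mutex_unlock(f->list_lock);
        /* last reference: close the server handle, free lock records */
        printf("tearing down handle\n");
        free(f);
}

int main(void)
{
        struct fileinfo *f = fileinfo_new();
        fileinfo_get(f);   /* second user */
        fileinfo_put(f);   /* drops to 1, no teardown */
        fileinfo_put(f);   /* drops to 0, teardown */
        return 0;
}
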
@@ -224,20 +344,21 @@ int cifs_open(struct inode *inode, struct file *file)
224 __u32 oplock; 344 __u32 oplock;
225 struct cifs_sb_info *cifs_sb; 345 struct cifs_sb_info *cifs_sb;
226 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
347 struct tcon_link *tlink;
227 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
228 struct cifsInodeInfo *pCifsInode;
229 char *full_path = NULL; 349 char *full_path = NULL;
230 int desiredAccess; 350 bool posix_open_ok = false;
231 int disposition;
232 __u16 netfid; 351 __u16 netfid;
233 FILE_ALL_INFO *buf = NULL;
234 352
235 xid = GetXid(); 353 xid = GetXid();
236 354
237 cifs_sb = CIFS_SB(inode->i_sb); 355 cifs_sb = CIFS_SB(inode->i_sb);
238 tcon = cifs_sb->tcon; 356 tlink = cifs_sb_tlink(cifs_sb);
239 357 if (IS_ERR(tlink)) {
240 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 358 FreeXid(xid);
359 return PTR_ERR(tlink);
360 }
361 tcon = tlink_tcon(tlink);
241 362
242 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
243 if (full_path == NULL) { 364 if (full_path == NULL) {
@@ -257,35 +378,13 @@ int cifs_open(struct inode *inode, struct file *file)
257 (tcon->ses->capabilities & CAP_UNIX) && 378 (tcon->ses->capabilities & CAP_UNIX) &&
258 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 379 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
259 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 380 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
262 /* can not refresh inode info since size could be stale */ 381 /* can not refresh inode info since size could be stale */
263 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 382 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
264 cifs_sb->mnt_file_mode /* ignored */, 383 cifs_sb->mnt_file_mode /* ignored */,
265 oflags, &oplock, &netfid, xid); 384 file->f_flags, &oplock, &netfid, xid);
266 if (rc == 0) { 385 if (rc == 0) {
267 cFYI(1, "posix open succeeded"); 386 cFYI(1, "posix open succeeded");
268 /* no need for special case handling of setting mode 387 posix_open_ok = true;
269 on read only files needed here */
270
271 rc = cifs_posix_open_inode_helper(inode, file,
272 pCifsInode, oplock, netfid);
273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file,
279 file->f_path.mnt,
280 oflags);
281 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM;
284 }
285
286 cifs_fscache_set_inode_cookie(inode, file);
287
288 goto out;
289 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 388 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
290 if (tcon->ses->serverNOS) 389 if (tcon->ses->serverNOS)
291 cERROR(1, "server %s of type %s returned" 390 cERROR(1, "server %s of type %s returned"
@@ -302,106 +401,42 @@ int cifs_open(struct inode *inode, struct file *file)
302 or DFS errors */ 401 or DFS errors */
303 } 402 }
304 403
305 desiredAccess = cifs_convert_flags(file->f_flags); 404 if (!posix_open_ok) {
306 405 rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
307/********************************************************************* 406 file->f_flags, &oplock, &netfid, xid);
308 * open flag mapping table: 407 if (rc)
309 * 408 goto out;
310 * POSIX Flag CIFS Disposition
311 * ---------- ----------------
312 * O_CREAT FILE_OPEN_IF
313 * O_CREAT | O_EXCL FILE_CREATE
314 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
315 * O_TRUNC FILE_OVERWRITE
316 * none of the above FILE_OPEN
317 *
318 * Note that there is not a direct match between disposition
319 * FILE_SUPERSEDE (ie create whether or not file exists although
320 * O_CREAT | O_TRUNC is similar but truncates the existing
321 * file rather than creating a new file as FILE_SUPERSEDE does
322 * (which uses the attributes / metadata passed in on open call)
323 *?
324 *? O_SYNC is a reasonable match to CIFS writethrough flag
325 *? and the read write flags match reasonably. O_LARGEFILE
326 *? is irrelevant because largefile support is always used
327 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
328 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
329 *********************************************************************/
330
331 disposition = cifs_get_disposition(file->f_flags);
332
333 /* BB pass O_SYNC flag through on file attributes .. BB */
334
335 /* Also refresh inode by passing in file_info buf returned by SMBOpen
336 and calling get_inode_info with returned buf (at least helps
337 non-Unix server case) */
338
339 /* BB we can not do this if this is the second open of a file
340 and the first handle has writebehind data, we might be
341 able to simply do a filemap_fdatawrite/filemap_fdatawait first */
342 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
343 if (!buf) {
344 rc = -ENOMEM;
345 goto out;
346 }
347
348 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
349 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
350 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
351 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
352 & CIFS_MOUNT_MAP_SPECIAL_CHR);
353 else
354 rc = -EIO; /* no NT SMB support fall into legacy open below */
355
356 if (rc == -EIO) {
357 /* Old server, try legacy style OpenX */
358 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
359 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
360 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
361 & CIFS_MOUNT_MAP_SPECIAL_CHR);
362 }
363 if (rc) {
364 cFYI(1, "cifs_open returned 0x%x", rc);
365 goto out;
366 } 409 }
367 410
368 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); 411 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
369 if (rc != 0)
370 goto out;
371
372 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
373 file->f_flags);
374 if (pCifsFile == NULL) { 412 if (pCifsFile == NULL) {
413 CIFSSMBClose(xid, tcon, netfid);
375 rc = -ENOMEM; 414 rc = -ENOMEM;
376 goto out; 415 goto out;
377 } 416 }
378 417
379 cifs_fscache_set_inode_cookie(inode, file); 418 cifs_fscache_set_inode_cookie(inode, file);
380 419
381 if (oplock & CIFS_CREATE_ACTION) { 420 if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
382 /* time to set mode which we can not set earlier due to 421 /* time to set mode which we can not set earlier due to
383 problems creating new read-only files */ 422 problems creating new read-only files */
384 if (tcon->unix_ext) { 423 struct cifs_unix_set_info_args args = {
385 struct cifs_unix_set_info_args args = { 424 .mode = inode->i_mode,
386 .mode = inode->i_mode, 425 .uid = NO_CHANGE_64,
387 .uid = NO_CHANGE_64, 426 .gid = NO_CHANGE_64,
388 .gid = NO_CHANGE_64, 427 .ctime = NO_CHANGE_64,
389 .ctime = NO_CHANGE_64, 428 .atime = NO_CHANGE_64,
390 .atime = NO_CHANGE_64, 429 .mtime = NO_CHANGE_64,
391 .mtime = NO_CHANGE_64, 430 .device = 0,
392 .device = 0, 431 };
393 }; 432 CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
394 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 433 pCifsFile->pid);
395 cifs_sb->local_nls,
396 cifs_sb->mnt_cifs_flags &
397 CIFS_MOUNT_MAP_SPECIAL_CHR);
398 }
399 } 434 }
400 435
401out: 436out:
402 kfree(buf);
403 kfree(full_path); 437 kfree(full_path);
404 FreeXid(xid); 438 FreeXid(xid);
439 cifs_put_tlink(tlink);
405 return rc; 440 return rc;
406} 441}
407 442
@@ -416,14 +451,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
416 return rc; 451 return rc;
417} 452}
418 453
419static int cifs_reopen_file(struct file *file, bool can_flush) 454static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
420{ 455{
421 int rc = -EACCES; 456 int rc = -EACCES;
422 int xid; 457 int xid;
423 __u32 oplock; 458 __u32 oplock;
424 struct cifs_sb_info *cifs_sb; 459 struct cifs_sb_info *cifs_sb;
425 struct cifsTconInfo *tcon; 460 struct cifsTconInfo *tcon;
426 struct cifsFileInfo *pCifsFile;
427 struct cifsInodeInfo *pCifsInode; 461 struct cifsInodeInfo *pCifsInode;
428 struct inode *inode; 462 struct inode *inode;
429 char *full_path = NULL; 463 char *full_path = NULL;
@@ -431,11 +465,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
431 int disposition = FILE_OPEN; 465 int disposition = FILE_OPEN;
432 __u16 netfid; 466 __u16 netfid;
433 467
434 if (file->private_data)
435 pCifsFile = file->private_data;
436 else
437 return -EBADF;
438
439 xid = GetXid(); 468 xid = GetXid();
440 mutex_lock(&pCifsFile->fh_mutex); 469 mutex_lock(&pCifsFile->fh_mutex);
441 if (!pCifsFile->invalidHandle) { 470 if (!pCifsFile->invalidHandle) {
@@ -445,39 +474,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
445 return rc; 474 return rc;
446 } 475 }
447 476
448 if (file->f_path.dentry == NULL) { 477 inode = pCifsFile->dentry->d_inode;
449 cERROR(1, "no valid name if dentry freed");
450 dump_stack();
451 rc = -EBADF;
452 goto reopen_error_exit;
453 }
454
455 inode = file->f_path.dentry->d_inode;
456 if (inode == NULL) {
457 cERROR(1, "inode not valid");
458 dump_stack();
459 rc = -EBADF;
460 goto reopen_error_exit;
461 }
462
463 cifs_sb = CIFS_SB(inode->i_sb); 478 cifs_sb = CIFS_SB(inode->i_sb);
464 tcon = cifs_sb->tcon; 479 tcon = tlink_tcon(pCifsFile->tlink);
465 480
466/* can not grab rename sem here because various ops, including 481/* can not grab rename sem here because various ops, including
467 those that already have the rename sem can end up causing writepage 482 those that already have the rename sem can end up causing writepage
468 to get called and if the server was down that means we end up here, 483 to get called and if the server was down that means we end up here,
469 and we can never tell if the caller already has the rename_sem */ 484 and we can never tell if the caller already has the rename_sem */
470 full_path = build_path_from_dentry(file->f_path.dentry); 485 full_path = build_path_from_dentry(pCifsFile->dentry);
471 if (full_path == NULL) { 486 if (full_path == NULL) {
472 rc = -ENOMEM; 487 rc = -ENOMEM;
473reopen_error_exit:
474 mutex_unlock(&pCifsFile->fh_mutex); 488 mutex_unlock(&pCifsFile->fh_mutex);
475 FreeXid(xid); 489 FreeXid(xid);
476 return rc; 490 return rc;
477 } 491 }
478 492
479 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 493 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
480 inode, file->f_flags, full_path); 494 inode, pCifsFile->f_flags, full_path);
481 495
482 if (oplockEnabled) 496 if (oplockEnabled)
483 oplock = REQ_OPLOCK; 497 oplock = REQ_OPLOCK;
@@ -487,8 +501,14 @@ reopen_error_exit:
487 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 501 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
488 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 502 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
489 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 503 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
490 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 504
491 /* can not refresh inode info since size could be stale */ 505 /*
506 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
507 * original open. Must mask them off for a reopen.
508 */
509 unsigned int oflags = pCifsFile->f_flags &
510 ~(O_CREAT | O_EXCL | O_TRUNC);
511
492 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 512 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
493 cifs_sb->mnt_file_mode /* ignored */, 513 cifs_sb->mnt_file_mode /* ignored */,
494 oflags, &oplock, &netfid, xid); 514 oflags, &oplock, &netfid, xid);
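
The masking added above matters because a reopen re-sends the flags to the server: O_TRUNC still set would empty the file a second time, and O_CREAT|O_EXCL could fail outright. A one-screen demonstration:

#include <stdio.h>
#include <fcntl.h>

int main(void)
{
        unsigned int f_flags = O_RDWR | O_CREAT | O_TRUNC;
        unsigned int oflags = f_flags & ~(O_CREAT | O_EXCL | O_TRUNC);

        /* creation-time flags already had their effect on the original
         * open, so only the access mode and similar bits survive */
        printf("orig=0x%x reopen=0x%x\n", f_flags, oflags);
        return 0;
}
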
@@ -500,7 +520,7 @@ reopen_error_exit:
500 in the reconnect path it is important to retry hard */ 520 in the reconnect path it is important to retry hard */
501 } 521 }
502 522
503 desiredAccess = cifs_convert_flags(file->f_flags); 523 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
504 524
505 /* Can not refresh inode by passing in file_info buf to be returned 525 /* Can not refresh inode by passing in file_info buf to be returned
506 by SMBOpen and then calling get_inode_info with returned buf 526 by SMBOpen and then calling get_inode_info with returned buf
@@ -516,49 +536,38 @@ reopen_error_exit:
516 mutex_unlock(&pCifsFile->fh_mutex); 536 mutex_unlock(&pCifsFile->fh_mutex);
517 cFYI(1, "cifs_open returned 0x%x", rc); 537 cFYI(1, "cifs_open returned 0x%x", rc);
518 cFYI(1, "oplock: %d", oplock); 538 cFYI(1, "oplock: %d", oplock);
519 } else { 539 goto reopen_error_exit;
520reopen_success:
521 pCifsFile->netfid = netfid;
522 pCifsFile->invalidHandle = false;
523 mutex_unlock(&pCifsFile->fh_mutex);
524 pCifsInode = CIFS_I(inode);
525 if (pCifsInode) {
526 if (can_flush) {
527 rc = filemap_write_and_wait(inode->i_mapping);
528 if (rc != 0)
529 CIFS_I(inode)->write_behind_rc = rc;
530 /* temporarily disable caching while we
531 go to server to get inode info */
532 pCifsInode->clientCanCacheAll = false;
533 pCifsInode->clientCanCacheRead = false;
534 if (tcon->unix_ext)
535 rc = cifs_get_inode_info_unix(&inode,
536 full_path, inode->i_sb, xid);
537 else
538 rc = cifs_get_inode_info(&inode,
539 full_path, NULL, inode->i_sb,
540 xid, NULL);
541 } /* else we are writing out data to server already
542 and could deadlock if we tried to flush data, and
543 since we do not know if we have data that would
544 invalidate the current end of file on the server
545 we can not go to the server to get the new inod
546 info */
547 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
548 pCifsInode->clientCanCacheAll = true;
549 pCifsInode->clientCanCacheRead = true;
550 cFYI(1, "Exclusive Oplock granted on inode %p",
551 file->f_path.dentry->d_inode);
552 } else if ((oplock & 0xF) == OPLOCK_READ) {
553 pCifsInode->clientCanCacheRead = true;
554 pCifsInode->clientCanCacheAll = false;
555 } else {
556 pCifsInode->clientCanCacheRead = false;
557 pCifsInode->clientCanCacheAll = false;
558 }
559 cifs_relock_file(pCifsFile);
560 }
561 } 540 }
541
542reopen_success:
543 pCifsFile->netfid = netfid;
544 pCifsFile->invalidHandle = false;
545 mutex_unlock(&pCifsFile->fh_mutex);
546 pCifsInode = CIFS_I(inode);
547
548 if (can_flush) {
549 rc = filemap_write_and_wait(inode->i_mapping);
550 mapping_set_error(inode->i_mapping, rc);
551
552 if (tcon->unix_ext)
553 rc = cifs_get_inode_info_unix(&inode,
554 full_path, inode->i_sb, xid);
555 else
556 rc = cifs_get_inode_info(&inode,
557 full_path, NULL, inode->i_sb,
558 xid, NULL);
559 } /* else we are writing out data to server already
560 and could deadlock if we tried to flush data, and
561 since we do not know if we have data that would
562 invalidate the current end of file on the server
 563 we can not go to the server to get the new inode
 564 info */
565
566 cifs_set_oplock_level(pCifsInode, oplock);
567
568 cifs_relock_file(pCifsFile);
569
570reopen_error_exit:
562 kfree(full_path); 571 kfree(full_path);
563 FreeXid(xid); 572 FreeXid(xid);
564 return rc; 573 return rc;
@@ -566,79 +575,11 @@ reopen_success:
566 575
567int cifs_close(struct inode *inode, struct file *file) 576int cifs_close(struct inode *inode, struct file *file)
568{ 577{
569 int rc = 0; 578 cifsFileInfo_put(file->private_data);
570 int xid, timeout; 579 file->private_data = NULL;
571 struct cifs_sb_info *cifs_sb;
572 struct cifsTconInfo *pTcon;
573 struct cifsFileInfo *pSMBFile = file->private_data;
574 580
575 xid = GetXid(); 581 /* return code from the ->release op is always ignored */
576 582 return 0;
577 cifs_sb = CIFS_SB(inode->i_sb);
578 pTcon = cifs_sb->tcon;
579 if (pSMBFile) {
580 struct cifsLockInfo *li, *tmp;
581 write_lock(&GlobalSMBSeslock);
582 pSMBFile->closePend = true;
583 if (pTcon) {
584 /* no sense reconnecting to close a file that is
585 already closed */
586 if (!pTcon->need_reconnect) {
587 write_unlock(&GlobalSMBSeslock);
588 timeout = 2;
589 while ((atomic_read(&pSMBFile->count) != 1)
590 && (timeout <= 2048)) {
591 /* Give write a better chance to get to
592 server ahead of the close. We do not
593 want to add a wait_q here as it would
594 increase the memory utilization as
595 the struct would be in each open file,
596 but this should give enough time to
597 clear the socket */
598 cFYI(DBG2, "close delay, write pending");
599 msleep(timeout);
600 timeout *= 4;
601 }
602 if (!pTcon->need_reconnect &&
603 !pSMBFile->invalidHandle)
604 rc = CIFSSMBClose(xid, pTcon,
605 pSMBFile->netfid);
606 } else
607 write_unlock(&GlobalSMBSeslock);
608 } else
609 write_unlock(&GlobalSMBSeslock);
610
611 /* Delete any outstanding lock records.
612 We'll lose them when the file is closed anyway. */
613 mutex_lock(&pSMBFile->lock_mutex);
614 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
615 list_del(&li->llist);
616 kfree(li);
617 }
618 mutex_unlock(&pSMBFile->lock_mutex);
619
620 write_lock(&GlobalSMBSeslock);
621 list_del(&pSMBFile->flist);
622 list_del(&pSMBFile->tlist);
623 write_unlock(&GlobalSMBSeslock);
624 cifsFileInfo_put(file->private_data);
625 file->private_data = NULL;
626 } else
627 rc = -EBADF;
628
629 read_lock(&GlobalSMBSeslock);
630 if (list_empty(&(CIFS_I(inode)->openFileList))) {
631 cFYI(1, "closing last open instance for inode %p", inode);
632 /* if the file is not open we do not know if we can cache info
633 on this inode, much less write behind and read ahead */
634 CIFS_I(inode)->clientCanCacheRead = false;
635 CIFS_I(inode)->clientCanCacheAll = false;
636 }
637 read_unlock(&GlobalSMBSeslock);
638 if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
639 rc = CIFS_I(inode)->write_behind_rc;
640 FreeXid(xid);
641 return rc;
642} 583}
643 584
644int cifs_closedir(struct inode *inode, struct file *file) 585int cifs_closedir(struct inode *inode, struct file *file)
@@ -653,25 +594,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
653 xid = GetXid(); 594 xid = GetXid();
654 595
655 if (pCFileStruct) { 596 if (pCFileStruct) {
656 struct cifsTconInfo *pTcon; 597 struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
657 struct cifs_sb_info *cifs_sb =
658 CIFS_SB(file->f_path.dentry->d_sb);
659
660 pTcon = cifs_sb->tcon;
661 598
662 cFYI(1, "Freeing private data in close dir"); 599 cFYI(1, "Freeing private data in close dir");
663 write_lock(&GlobalSMBSeslock); 600 spin_lock(&cifs_file_list_lock);
664 if (!pCFileStruct->srch_inf.endOfSearch && 601 if (!pCFileStruct->srch_inf.endOfSearch &&
665 !pCFileStruct->invalidHandle) { 602 !pCFileStruct->invalidHandle) {
666 pCFileStruct->invalidHandle = true; 603 pCFileStruct->invalidHandle = true;
667 write_unlock(&GlobalSMBSeslock); 604 spin_unlock(&cifs_file_list_lock);
668 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 605 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
669 cFYI(1, "Closing uncompleted readdir with rc %d", 606 cFYI(1, "Closing uncompleted readdir with rc %d",
670 rc); 607 rc);
671 /* not much we can do if it fails anyway, ignore rc */ 608 /* not much we can do if it fails anyway, ignore rc */
672 rc = 0; 609 rc = 0;
673 } else 610 } else
674 write_unlock(&GlobalSMBSeslock); 611 spin_unlock(&cifs_file_list_lock);
675 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 612 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
676 if (ptmp) { 613 if (ptmp) {
677 cFYI(1, "closedir free smb buf in srch struct"); 614 cFYI(1, "closedir free smb buf in srch struct");
@@ -681,6 +618,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
681 else 618 else
682 cifs_buf_release(ptmp); 619 cifs_buf_release(ptmp);
683 } 620 }
621 cifs_put_tlink(pCFileStruct->tlink);
684 kfree(file->private_data); 622 kfree(file->private_data);
685 file->private_data = NULL; 623 file->private_data = NULL;
686 } 624 }
@@ -767,13 +705,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
767 cFYI(1, "Unknown type of lock"); 705 cFYI(1, "Unknown type of lock");
768 706
769 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 707 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
770 tcon = cifs_sb->tcon; 708 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
771
772 if (file->private_data == NULL) {
773 rc = -EBADF;
774 FreeXid(xid);
775 return rc;
776 }
777 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 709 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
778 710
779 if ((tcon->ses->capabilities & CAP_UNIX) && 711 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -799,12 +731,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
799 731
800 /* BB we could chain these into one lock request BB */ 732 /* BB we could chain these into one lock request BB */
801 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 733 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
802 0, 1, lockType, 0 /* wait flag */ ); 734 0, 1, lockType, 0 /* wait flag */, 0);
803 if (rc == 0) { 735 if (rc == 0) {
804 rc = CIFSSMBLock(xid, tcon, netfid, length, 736 rc = CIFSSMBLock(xid, tcon, netfid, length,
805 pfLock->fl_start, 1 /* numUnlock */ , 737 pfLock->fl_start, 1 /* numUnlock */ ,
806 0 /* numLock */ , lockType, 738 0 /* numLock */ , lockType,
807 0 /* wait flag */ ); 739 0 /* wait flag */, 0);
808 pfLock->fl_type = F_UNLCK; 740 pfLock->fl_type = F_UNLCK;
809 if (rc != 0) 741 if (rc != 0)
810 cERROR(1, "Error unlocking previously locked " 742 cERROR(1, "Error unlocking previously locked "
@@ -821,13 +753,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
821 rc = CIFSSMBLock(xid, tcon, netfid, length, 753 rc = CIFSSMBLock(xid, tcon, netfid, length,
822 pfLock->fl_start, 0, 1, 754 pfLock->fl_start, 0, 1,
823 lockType | LOCKING_ANDX_SHARED_LOCK, 755 lockType | LOCKING_ANDX_SHARED_LOCK,
824 0 /* wait flag */); 756 0 /* wait flag */, 0);
825 if (rc == 0) { 757 if (rc == 0) {
826 rc = CIFSSMBLock(xid, tcon, netfid, 758 rc = CIFSSMBLock(xid, tcon, netfid,
827 length, pfLock->fl_start, 1, 0, 759 length, pfLock->fl_start, 1, 0,
828 lockType | 760 lockType |
829 LOCKING_ANDX_SHARED_LOCK, 761 LOCKING_ANDX_SHARED_LOCK,
830 0 /* wait flag */); 762 0 /* wait flag */, 0);
831 pfLock->fl_type = F_RDLCK; 763 pfLock->fl_type = F_RDLCK;
832 if (rc != 0) 764 if (rc != 0)
833 cERROR(1, "Error unlocking " 765 cERROR(1, "Error unlocking "
@@ -870,8 +802,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
870 802
871 if (numLock) { 803 if (numLock) {
872 rc = CIFSSMBLock(xid, tcon, netfid, length, 804 rc = CIFSSMBLock(xid, tcon, netfid, length,
873 pfLock->fl_start, 805 pfLock->fl_start, 0, numLock, lockType,
874 0, numLock, lockType, wait_flag); 806 wait_flag, 0);
875 807
876 if (rc == 0) { 808 if (rc == 0) {
877 /* For Windows locks we must store them. */ 809 /* For Windows locks we must store them. */
@@ -891,9 +823,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
891 (pfLock->fl_start + length) >= 823 (pfLock->fl_start + length) >=
892 (li->offset + li->length)) { 824 (li->offset + li->length)) {
893 stored_rc = CIFSSMBLock(xid, tcon, 825 stored_rc = CIFSSMBLock(xid, tcon,
894 netfid, 826 netfid, li->length,
895 li->length, li->offset, 827 li->offset, 1, 0,
896 1, 0, li->type, false); 828 li->type, false, 0);
897 if (stored_rc) 829 if (stored_rc)
898 rc = stored_rc; 830 rc = stored_rc;
899 else { 831 else {
@@ -912,31 +844,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
912 return rc; 844 return rc;
913} 845}
914 846
915/*
916 * Set the timeout on write requests past EOF. For some servers (Windows)
917 * these calls can be very long.
918 *
919 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
920 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
921 * The 10M cutoff is totally arbitrary. A better scheme for this would be
922 * welcome if someone wants to suggest one.
923 *
924 * We may be able to do a better job with this if there were some way to
925 * declare that a file should be sparse.
926 */
927static int
928cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
929{
930 if (offset <= cifsi->server_eof)
931 return CIFS_STD_OP;
932 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
933 return CIFS_VLONG_OP;
934 else
935 return CIFS_LONG_OP;
936}
937
938/* update the file size (if needed) after a write */ 847/* update the file size (if needed) after a write */
939static void 848void
940cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 849cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
941 unsigned int bytes_written) 850 unsigned int bytes_written)
942{ 851{
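
The deleted cifs_write_timeout is fully described by its comment, so it is easy to reconstruct standalone; the 15s/45s/180s tiers come from that comment, and the CIFS_*_OP values below are illustrative stand-ins for the real constants:

#include <stdio.h>

#define CIFS_STD_OP   15   /* seconds, illustrative */
#define CIFS_LONG_OP  45
#define CIFS_VLONG_OP 180

static int write_timeout(long long server_eof, long long offset)
{
        if (offset <= server_eof)
                return CIFS_STD_OP;
        else if (offset > server_eof + 10 * 1024 * 1024)
                return CIFS_VLONG_OP;   /* >10M past EOF */
        else
                return CIFS_LONG_OP;
}

int main(void)
{
        printf("%d\n", write_timeout(0, 1));                 /* 45  */
        printf("%d\n", write_timeout(0, 20 * 1024 * 1024));  /* 180 */
        return 0;
}
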
@@ -949,25 +858,26 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
949ssize_t cifs_user_write(struct file *file, const char __user *write_data, 858ssize_t cifs_user_write(struct file *file, const char __user *write_data,
950 size_t write_size, loff_t *poffset) 859 size_t write_size, loff_t *poffset)
951{ 860{
861 struct inode *inode = file->f_path.dentry->d_inode;
952 int rc = 0; 862 int rc = 0;
953 unsigned int bytes_written = 0; 863 unsigned int bytes_written = 0;
954 unsigned int total_written; 864 unsigned int total_written;
955 struct cifs_sb_info *cifs_sb; 865 struct cifs_sb_info *cifs_sb;
956 struct cifsTconInfo *pTcon; 866 struct cifsTconInfo *pTcon;
957 int xid, long_op; 867 int xid;
958 struct cifsFileInfo *open_file; 868 struct cifsFileInfo *open_file;
959 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 869 struct cifsInodeInfo *cifsi = CIFS_I(inode);
960 870
961 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 871 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
962 872
963 pTcon = cifs_sb->tcon;
964
965 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, 873 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
966 *poffset, file->f_path.dentry->d_name.name); */ 874 *poffset, file->f_path.dentry->d_name.name); */
967 875
968 if (file->private_data == NULL) 876 if (file->private_data == NULL)
969 return -EBADF; 877 return -EBADF;
878
970 open_file = file->private_data; 879 open_file = file->private_data;
880 pTcon = tlink_tcon(open_file->tlink);
971 881
972 rc = generic_write_checks(file, poffset, &write_size, 0); 882 rc = generic_write_checks(file, poffset, &write_size, 0);
973 if (rc) 883 if (rc)
@@ -975,7 +885,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
975 885
976 xid = GetXid(); 886 xid = GetXid();
977 887
978 long_op = cifs_write_timeout(cifsi, *poffset);
979 for (total_written = 0; write_size > total_written; 888 for (total_written = 0; write_size > total_written;
980 total_written += bytes_written) { 889 total_written += bytes_written) {
981 rc = -EAGAIN; 890 rc = -EAGAIN;
@@ -988,19 +897,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
988 we blocked so return what we managed to write */ 897 we blocked so return what we managed to write */
989 return total_written; 898 return total_written;
990 } 899 }
991 if (open_file->closePend) {
992 FreeXid(xid);
993 if (total_written)
994 return total_written;
995 else
996 return -EBADF;
997 }
998 if (open_file->invalidHandle) { 900 if (open_file->invalidHandle) {
999 /* we could deadlock if we called 901 /* we could deadlock if we called
1000 filemap_fdatawait from here so tell 902 filemap_fdatawait from here so tell
1001 reopen_file not to flush data to server 903 reopen_file not to flush data to server
1002 now */ 904 now */
1003 rc = cifs_reopen_file(file, false); 905 rc = cifs_reopen_file(open_file, false);
1004 if (rc != 0) 906 if (rc != 0)
1005 break; 907 break;
1006 } 908 }
@@ -1010,7 +912,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1010 min_t(const int, cifs_sb->wsize, 912 min_t(const int, cifs_sb->wsize,
1011 write_size - total_written), 913 write_size - total_written),
1012 *poffset, &bytes_written, 914 *poffset, &bytes_written,
1013 NULL, write_data + total_written, long_op); 915 NULL, write_data + total_written, 0);
1014 } 916 }
1015 if (rc || (bytes_written == 0)) { 917 if (rc || (bytes_written == 0)) {
1016 if (total_written) 918 if (total_written)
@@ -1023,83 +925,57 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1023 cifs_update_eof(cifsi, *poffset, bytes_written); 925 cifs_update_eof(cifsi, *poffset, bytes_written);
1024 *poffset += bytes_written; 926 *poffset += bytes_written;
1025 } 927 }
1026 long_op = CIFS_STD_OP; /* subsequent writes fast -
1027 15 seconds is plenty */
1028 } 928 }
1029 929
1030 cifs_stats_bytes_written(pTcon, total_written); 930 cifs_stats_bytes_written(pTcon, total_written);
1031 931
1032 /* since the write may have blocked check these pointers again */
1033 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
1034 struct inode *inode = file->f_path.dentry->d_inode;
1035/* Do not update local mtime - server will set its actual value on write 932/* Do not update local mtime - server will set its actual value on write
1036 * inode->i_ctime = inode->i_mtime = 933 * inode->i_ctime = inode->i_mtime =
1037 * current_fs_time(inode->i_sb);*/ 934 * current_fs_time(inode->i_sb);*/
1038 if (total_written > 0) { 935 if (total_written > 0) {
1039 spin_lock(&inode->i_lock); 936 spin_lock(&inode->i_lock);
1040 if (*poffset > file->f_path.dentry->d_inode->i_size) 937 if (*poffset > inode->i_size)
1041 i_size_write(file->f_path.dentry->d_inode, 938 i_size_write(inode, *poffset);
1042 *poffset); 939 spin_unlock(&inode->i_lock);
1043 spin_unlock(&inode->i_lock);
1044 }
1045 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1046 } 940 }
941 mark_inode_dirty_sync(inode);
942
1047 FreeXid(xid); 943 FreeXid(xid);
1048 return total_written; 944 return total_written;
1049} 945}
1050 946
1051static ssize_t cifs_write(struct file *file, const char *write_data, 947static ssize_t cifs_write(struct cifsFileInfo *open_file,
1052 size_t write_size, loff_t *poffset) 948 const char *write_data, size_t write_size,
949 loff_t *poffset)
1053{ 950{
1054 int rc = 0; 951 int rc = 0;
1055 unsigned int bytes_written = 0; 952 unsigned int bytes_written = 0;
1056 unsigned int total_written; 953 unsigned int total_written;
1057 struct cifs_sb_info *cifs_sb; 954 struct cifs_sb_info *cifs_sb;
1058 struct cifsTconInfo *pTcon; 955 struct cifsTconInfo *pTcon;
1059 int xid, long_op; 956 int xid;
1060 struct cifsFileInfo *open_file; 957 struct dentry *dentry = open_file->dentry;
1061 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 958 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
1062
1063 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1064 959
1065 pTcon = cifs_sb->tcon; 960 cifs_sb = CIFS_SB(dentry->d_sb);
1066 961
1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size, 962 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1068 *poffset, file->f_path.dentry->d_name.name); 963 *poffset, dentry->d_name.name);
1069 964
1070 if (file->private_data == NULL) 965 pTcon = tlink_tcon(open_file->tlink);
1071 return -EBADF;
1072 open_file = file->private_data;
1073 966
1074 xid = GetXid(); 967 xid = GetXid();
1075 968
1076 long_op = cifs_write_timeout(cifsi, *poffset);
1077 for (total_written = 0; write_size > total_written; 969 for (total_written = 0; write_size > total_written;
1078 total_written += bytes_written) { 970 total_written += bytes_written) {
1079 rc = -EAGAIN; 971 rc = -EAGAIN;
1080 while (rc == -EAGAIN) { 972 while (rc == -EAGAIN) {
1081 if (file->private_data == NULL) {
1082 /* file has been closed on us */
1083 FreeXid(xid);
1084 /* if we have gotten here we have written some data
1085 and blocked, and the file has been freed on us
1086 while we blocked so return what we managed to
1087 write */
1088 return total_written;
1089 }
1090 if (open_file->closePend) {
1091 FreeXid(xid);
1092 if (total_written)
1093 return total_written;
1094 else
1095 return -EBADF;
1096 }
1097 if (open_file->invalidHandle) { 973 if (open_file->invalidHandle) {
1098 /* we could deadlock if we called 974 /* we could deadlock if we called
1099 filemap_fdatawait from here so tell 975 filemap_fdatawait from here so tell
1100 reopen_file not to flush data to 976 reopen_file not to flush data to
1101 server now */ 977 server now */
1102 rc = cifs_reopen_file(file, false); 978 rc = cifs_reopen_file(open_file, false);
1103 if (rc != 0) 979 if (rc != 0)
1104 break; 980 break;
1105 } 981 }
@@ -1119,7 +995,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1119 rc = CIFSSMBWrite2(xid, pTcon, 995 rc = CIFSSMBWrite2(xid, pTcon,
1120 open_file->netfid, len, 996 open_file->netfid, len,
1121 *poffset, &bytes_written, 997 *poffset, &bytes_written,
1122 iov, 1, long_op); 998 iov, 1, 0);
1123 } else 999 } else
1124 rc = CIFSSMBWrite(xid, pTcon, 1000 rc = CIFSSMBWrite(xid, pTcon,
1125 open_file->netfid, 1001 open_file->netfid,
@@ -1127,7 +1003,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1127 write_size - total_written), 1003 write_size - total_written),
1128 *poffset, &bytes_written, 1004 *poffset, &bytes_written,
1129 write_data + total_written, 1005 write_data + total_written,
1130 NULL, long_op); 1006 NULL, 0);
1131 } 1007 }
1132 if (rc || (bytes_written == 0)) { 1008 if (rc || (bytes_written == 0)) {
1133 if (total_written) 1009 if (total_written)
@@ -1140,49 +1016,44 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1140 cifs_update_eof(cifsi, *poffset, bytes_written); 1016 cifs_update_eof(cifsi, *poffset, bytes_written);
1141 *poffset += bytes_written; 1017 *poffset += bytes_written;
1142 } 1018 }
1143 long_op = CIFS_STD_OP; /* subsequent writes fast -
1144 15 seconds is plenty */
1145 } 1019 }
1146 1020
1147 cifs_stats_bytes_written(pTcon, total_written); 1021 cifs_stats_bytes_written(pTcon, total_written);
1148 1022
1149 /* since the write may have blocked check these pointers again */ 1023 if (total_written > 0) {
1150 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { 1024 spin_lock(&dentry->d_inode->i_lock);
1151/*BB We could make this contingent on superblock ATIME flag too */ 1025 if (*poffset > dentry->d_inode->i_size)
1152/* file->f_path.dentry->d_inode->i_ctime = 1026 i_size_write(dentry->d_inode, *poffset);
1153 file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/ 1027 spin_unlock(&dentry->d_inode->i_lock);
1154 if (total_written > 0) {
1155 spin_lock(&file->f_path.dentry->d_inode->i_lock);
1156 if (*poffset > file->f_path.dentry->d_inode->i_size)
1157 i_size_write(file->f_path.dentry->d_inode,
1158 *poffset);
1159 spin_unlock(&file->f_path.dentry->d_inode->i_lock);
1160 }
1161 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1162 } 1028 }
1029 mark_inode_dirty_sync(dentry->d_inode);
1163 FreeXid(xid); 1030 FreeXid(xid);
1164 return total_written; 1031 return total_written;
1165} 1032}
1166 1033
1167#ifdef CONFIG_CIFS_EXPERIMENTAL 1034struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1168struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) 1035 bool fsuid_only)
1169{ 1036{
1170 struct cifsFileInfo *open_file = NULL; 1037 struct cifsFileInfo *open_file = NULL;
1038 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1039
1040 /* only filter by fsuid on multiuser mounts */
1041 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1042 fsuid_only = false;
1171 1043
1172 read_lock(&GlobalSMBSeslock); 1044 spin_lock(&cifs_file_list_lock);
1173 /* we could simply get the first_list_entry since write-only entries 1045 /* we could simply get the first_list_entry since write-only entries
1174 are always at the end of the list but since the first entry might 1046 are always at the end of the list but since the first entry might
1175 have a close pending, we go through the whole list */ 1047 have a close pending, we go through the whole list */
1176 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1048 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1177 if (open_file->closePend) 1049 if (fsuid_only && open_file->uid != current_fsuid())
1178 continue; 1050 continue;
1179 if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || 1051 if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
1180 (open_file->pfile->f_flags & O_RDONLY))) {
1181 if (!open_file->invalidHandle) { 1052 if (!open_file->invalidHandle) {
1182 /* found a good file */ 1053 /* found a good file */
1183 /* lock it so it will not be closed on us */ 1054 /* lock it so it will not be closed on us */
1184 cifsFileInfo_get(open_file); 1055 cifsFileInfo_get(open_file);
1185 read_unlock(&GlobalSMBSeslock); 1056 spin_unlock(&cifs_file_list_lock);
1186 return open_file; 1057 return open_file;
1187 } /* else might as well continue, and look for 1058 } /* else might as well continue, and look for
1188 another, or simply have the caller reopen it 1059 another, or simply have the caller reopen it
@@ -1190,14 +1061,15 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1190 } else /* write only file */ 1061 } else /* write only file */
1191 break; /* write only files are last so must be done */ 1062 break; /* write only files are last so must be done */
1192 } 1063 }
1193 read_unlock(&GlobalSMBSeslock); 1064 spin_unlock(&cifs_file_list_lock);
1194 return NULL; 1065 return NULL;
1195} 1066}
1196#endif
1197 1067
1198struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1068struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1069 bool fsuid_only)
1199{ 1070{
1200 struct cifsFileInfo *open_file; 1071 struct cifsFileInfo *open_file;
1072 struct cifs_sb_info *cifs_sb;
1201 bool any_available = false; 1073 bool any_available = false;
1202 int rc; 1074 int rc;
1203 1075
@@ -1211,53 +1083,41 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1211 return NULL; 1083 return NULL;
1212 } 1084 }
1213 1085
1214 read_lock(&GlobalSMBSeslock); 1086 cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1087
1088 /* only filter by fsuid on multiuser mounts */
1089 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1090 fsuid_only = false;
1091
1092 spin_lock(&cifs_file_list_lock);
1215refind_writable: 1093refind_writable:
1216 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1094 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1217 if (open_file->closePend || 1095 if (!any_available && open_file->pid != current->tgid)
1218 (!any_available && open_file->pid != current->tgid))
1219 continue; 1096 continue;
1220 1097 if (fsuid_only && open_file->uid != current_fsuid())
1221 if (open_file->pfile && 1098 continue;
1222 ((open_file->pfile->f_flags & O_RDWR) || 1099 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
1223 (open_file->pfile->f_flags & O_WRONLY))) {
1224 cifsFileInfo_get(open_file); 1100 cifsFileInfo_get(open_file);
1225 1101
1226 if (!open_file->invalidHandle) { 1102 if (!open_file->invalidHandle) {
1227 /* found a good writable file */ 1103 /* found a good writable file */
1228 read_unlock(&GlobalSMBSeslock); 1104 spin_unlock(&cifs_file_list_lock);
1229 return open_file; 1105 return open_file;
1230 } 1106 }
1231 1107
1232 read_unlock(&GlobalSMBSeslock); 1108 spin_unlock(&cifs_file_list_lock);
1109
1233 /* Had to unlock since following call can block */ 1110 /* Had to unlock since following call can block */
1234 rc = cifs_reopen_file(open_file->pfile, false); 1111 rc = cifs_reopen_file(open_file, false);
1235 if (!rc) { 1112 if (!rc)
1236 if (!open_file->closePend) 1113 return open_file;
1237 return open_file;
1238 else { /* start over in case this was deleted */
1239 /* since the list could be modified */
1240 read_lock(&GlobalSMBSeslock);
1241 cifsFileInfo_put(open_file);
1242 goto refind_writable;
1243 }
1244 }
1245 1114
1246 /* if it fails, try another handle if possible - 1115 /* if it fails, try another handle if possible */
1247 (we can not do this if closePending since
1248 loop could be modified - in which case we
1249 have to start at the beginning of the list
1250 again. Note that it would be bad
1251 to hold up writepages here (rather than
1252 in caller) with continuous retries */
1253 cFYI(1, "wp failed on reopen file"); 1116 cFYI(1, "wp failed on reopen file");
1254 read_lock(&GlobalSMBSeslock);
1255 /* can not use this handle, no write
1256 pending on this one after all */
1257 cifsFileInfo_put(open_file); 1117 cifsFileInfo_put(open_file);
1258 1118
1259 if (open_file->closePend) /* list could have changed */ 1119 spin_lock(&cifs_file_list_lock);
1260 goto refind_writable; 1120
1261 /* else we simply continue to the next entry. Thus 1121 /* else we simply continue to the next entry. Thus
1262 we do not loop on reopen errors. If we 1122 we do not loop on reopen errors. If we
1263 can not reopen the file, for example if we 1123 can not reopen the file, for example if we
@@ -1272,7 +1132,7 @@ refind_writable:
1272 any_available = true; 1132 any_available = true;
1273 goto refind_writable; 1133 goto refind_writable;
1274 } 1134 }
1275 read_unlock(&GlobalSMBSeslock); 1135 spin_unlock(&cifs_file_list_lock);
1276 return NULL; 1136 return NULL;
1277} 1137}
1278 1138
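
Both finders now take an fsuid_only argument that is honored only on CIFS_MOUNT_MULTIUSER mounts, since handles opened under another user's credentials must not be borrowed there. A toy model of the selection loop (uids and the flag value are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define MNT_MULTIUSER 0x1   /* illustrative flag bit */

struct handle { unsigned uid; bool writable; };

static const struct handle *pick(const struct handle *h, int n,
                                 unsigned mnt_flags, bool fsuid_only,
                                 unsigned fsuid)
{
        int i;

        if (!(mnt_flags & MNT_MULTIUSER))
                fsuid_only = false;   /* shared credentials: no filter */

        for (i = 0; i < n; i++) {
                if (fsuid_only && h[i].uid != fsuid)
                        continue;
                if (h[i].writable)
                        return &h[i];
        }
        return NULL;
}

int main(void)
{
        struct handle hs[] = { { 1000, true }, { 1001, true } };
        const struct handle *h = pick(hs, 2, MNT_MULTIUSER, true, 1001);

        printf("picked uid=%u\n", h ? h->uid : 0);
        return 0;
}
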
@@ -1283,8 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1283 char *write_data; 1143 char *write_data;
1284 int rc = -EFAULT; 1144 int rc = -EFAULT;
1285 int bytes_written = 0; 1145 int bytes_written = 0;
1286 struct cifs_sb_info *cifs_sb;
1287 struct cifsTconInfo *pTcon;
1288 struct inode *inode; 1146 struct inode *inode;
1289 struct cifsFileInfo *open_file; 1147 struct cifsFileInfo *open_file;
1290 1148
@@ -1292,8 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1292 return -EFAULT; 1150 return -EFAULT;
1293 1151
1294 inode = page->mapping->host; 1152 inode = page->mapping->host;
1295 cifs_sb = CIFS_SB(inode->i_sb);
1296 pTcon = cifs_sb->tcon;
1297 1153
1298 offset += (loff_t)from; 1154 offset += (loff_t)from;
1299 write_data = kmap(page); 1155 write_data = kmap(page);
@@ -1314,10 +1170,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1314 if (mapping->host->i_size - offset < (loff_t)to) 1170 if (mapping->host->i_size - offset < (loff_t)to)
1315 to = (unsigned)(mapping->host->i_size - offset); 1171 to = (unsigned)(mapping->host->i_size - offset);
1316 1172
1317 open_file = find_writable_file(CIFS_I(mapping->host)); 1173 open_file = find_writable_file(CIFS_I(mapping->host), false);
1318 if (open_file) { 1174 if (open_file) {
1319 bytes_written = cifs_write(open_file->pfile, write_data, 1175 bytes_written = cifs_write(open_file, write_data,
1320 to-from, &offset); 1176 to - from, &offset);
1321 cifsFileInfo_put(open_file); 1177 cifsFileInfo_put(open_file);
1322 /* Does mm or vfs already set times? */ 1178 /* Does mm or vfs already set times? */
1323 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1179 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1337,7 +1193,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1337static int cifs_writepages(struct address_space *mapping, 1193static int cifs_writepages(struct address_space *mapping,
1338 struct writeback_control *wbc) 1194 struct writeback_control *wbc)
1339{ 1195{
1340 struct backing_dev_info *bdi = mapping->backing_dev_info;
1341 unsigned int bytes_to_write; 1196 unsigned int bytes_to_write;
1342 unsigned int bytes_written; 1197 unsigned int bytes_written;
1343 struct cifs_sb_info *cifs_sb; 1198 struct cifs_sb_info *cifs_sb;
@@ -1352,12 +1207,13 @@ static int cifs_writepages(struct address_space *mapping,
1352 int nr_pages; 1207 int nr_pages;
1353 __u64 offset = 0; 1208 __u64 offset = 0;
1354 struct cifsFileInfo *open_file; 1209 struct cifsFileInfo *open_file;
1210 struct cifsTconInfo *tcon;
1355 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); 1211 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1356 struct page *page; 1212 struct page *page;
1357 struct pagevec pvec; 1213 struct pagevec pvec;
1358 int rc = 0; 1214 int rc = 0;
1359 int scanned = 0; 1215 int scanned = 0;
1360 int xid, long_op; 1216 int xid;
1361 1217
1362 cifs_sb = CIFS_SB(mapping->host->i_sb); 1218 cifs_sb = CIFS_SB(mapping->host->i_sb);
1363 1219
@@ -1368,27 +1224,30 @@ static int cifs_writepages(struct address_space *mapping,
1368 if (cifs_sb->wsize < PAGE_CACHE_SIZE) 1224 if (cifs_sb->wsize < PAGE_CACHE_SIZE)
1369 return generic_writepages(mapping, wbc); 1225 return generic_writepages(mapping, wbc);
1370 1226
1371 if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
1372 if (cifs_sb->tcon->ses->server->secMode &
1373 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
1374 if (!experimEnabled)
1375 return generic_writepages(mapping, wbc);
1376
1377 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); 1227 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
1378 if (iov == NULL) 1228 if (iov == NULL)
1379 return generic_writepages(mapping, wbc); 1229 return generic_writepages(mapping, wbc);
1380 1230
1381
1382 /* 1231 /*
1383 * BB: Is this meaningful for a non-block-device file system? 1232 * if there's no open file, then this is likely to fail too,
1384 * If it is, we should test it again after we do I/O 1233 * but it'll at least handle the return. Maybe it should be
1234 * a BUG() instead?
1385 */ 1235 */
1386 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1236 open_file = find_writable_file(CIFS_I(mapping->host), false);
1387 wbc->encountered_congestion = 1; 1237 if (!open_file) {
1388 kfree(iov); 1238 kfree(iov);
1389 return 0; 1239 return generic_writepages(mapping, wbc);
1390 } 1240 }
1391 1241
1242 tcon = tlink_tcon(open_file->tlink);
1243 if (!experimEnabled && tcon->ses->server->secMode &
1244 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1245 cifsFileInfo_put(open_file);
1246 kfree(iov);
1247 return generic_writepages(mapping, wbc);
1248 }
1249 cifsFileInfo_put(open_file);
1250
1392 xid = GetXid(); 1251 xid = GetXid();
1393 1252
1394 pagevec_init(&pvec, 0); 1253 pagevec_init(&pvec, 0);
@@ -1492,52 +1351,67 @@ retry:
1492 break; 1351 break;
1493 } 1352 }
1494 if (n_iov) { 1353 if (n_iov) {
1495 /* Search for a writable handle every time we call 1354retry_write:
1496 * CIFSSMBWrite2. We can't rely on the last handle 1355 open_file = find_writable_file(CIFS_I(mapping->host),
1497 * we used to still be valid 1356 false);
1498 */
1499 open_file = find_writable_file(CIFS_I(mapping->host));
1500 if (!open_file) { 1357 if (!open_file) {
1501 cERROR(1, "No writable handles for inode"); 1358 cERROR(1, "No writable handles for inode");
1502 rc = -EBADF; 1359 rc = -EBADF;
1503 } else { 1360 } else {
1504 long_op = cifs_write_timeout(cifsi, offset); 1361 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1505 rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
1506 open_file->netfid,
1507 bytes_to_write, offset, 1362 bytes_to_write, offset,
1508 &bytes_written, iov, n_iov, 1363 &bytes_written, iov, n_iov,
1509 long_op); 1364 0);
1510 cifsFileInfo_put(open_file); 1365 cifsFileInfo_put(open_file);
1511 cifs_update_eof(cifsi, offset, bytes_written); 1366 }
1512 1367
1513 if (rc || bytes_written < bytes_to_write) { 1368 cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
1514 cERROR(1, "Write2 ret %d, wrote %d", 1369
1515 rc, bytes_written); 1370 /*
1516 /* BB what if continued retry is 1371 * For now, treat a short write as if nothing got
1517 requested via mount flags? */ 1372 * written. A zero length write however indicates
1518 if (rc == -ENOSPC) 1373 * ENOSPC or EFBIG. We have no way to know which
1519 set_bit(AS_ENOSPC, &mapping->flags); 1374 * though, so call it ENOSPC for now. EFBIG would
1520 else 1375 * get translated to AS_EIO anyway.
1521 set_bit(AS_EIO, &mapping->flags); 1376 *
1522 } else { 1377 * FIXME: make it take into account the data that did
1523 cifs_stats_bytes_written(cifs_sb->tcon, 1378 * get written
1524 bytes_written); 1379 */
1525 } 1380 if (rc == 0) {
1381 if (bytes_written == 0)
1382 rc = -ENOSPC;
1383 else if (bytes_written < bytes_to_write)
1384 rc = -EAGAIN;
1526 } 1385 }
1386
1387 /* retry on data-integrity flush */
1388 if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
1389 goto retry_write;
1390
1391 /* fix the stats and EOF */
1392 if (bytes_written > 0) {
1393 cifs_stats_bytes_written(tcon, bytes_written);
1394 cifs_update_eof(cifsi, offset, bytes_written);
1395 }
1396
1527 for (i = 0; i < n_iov; i++) { 1397 for (i = 0; i < n_iov; i++) {
1528 page = pvec.pages[first + i]; 1398 page = pvec.pages[first + i];
1529 /* Should we also set page error on 1399 /* on retryable write error, redirty page */
1530 success rc but too little data written? */ 1400 if (rc == -EAGAIN)
1531 /* BB investigate retry logic on temporary 1401 redirty_page_for_writepage(wbc, page);
1532 server crash cases and how recovery works 1402 else if (rc != 0)
1533 when page marked as error */
1534 if (rc)
1535 SetPageError(page); 1403 SetPageError(page);
1536 kunmap(page); 1404 kunmap(page);
1537 unlock_page(page); 1405 unlock_page(page);
1538 end_page_writeback(page); 1406 end_page_writeback(page);
1539 page_cache_release(page); 1407 page_cache_release(page);
1540 } 1408 }
1409
1410 if (rc != -EAGAIN)
1411 mapping_set_error(mapping, rc);
1412 else
1413 rc = 0;
1414
1541 if ((wbc->nr_to_write -= n_iov) <= 0) 1415 if ((wbc->nr_to_write -= n_iov) <= 0)
1542 done = 1; 1416 done = 1;
1543 index = next; 1417 index = next;
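The hunk above replaces the old AS_EIO/AS_ENOSPC flag twiddling with a single rc mapping plus a retry_write label. A minimal userspace sketch of that mapping, under the assumption stated in the new comment that a zero-length "successful" write can only mean ENOSPC or EFBIG; the function name and the test values below are illustrative, not kernel code:

#include <errno.h>
#include <stdio.h>

/* Hypothetical model of the rc mapping in cifs_writepages above: a server
 * write that "succeeds" but stores nothing is reported as -ENOSPC, and a
 * short write becomes -EAGAIN so a data-integrity flush can retry it. */
static int map_write_result(int rc, unsigned int written, unsigned int requested)
{
	if (rc != 0)
		return rc;		/* transport/server error wins */
	if (written == 0)
		return -ENOSPC;		/* could also be EFBIG; not distinguishable */
	if (written < requested)
		return -EAGAIN;		/* short write: retry or redirty pages */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       map_write_result(0, 0, 4096),	/* -ENOSPC */
	       map_write_result(0, 2048, 4096),	/* -EAGAIN */
	       map_write_result(0, 4096, 4096));	/* 0 */
	return 0;
}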
@@ -1624,7 +1498,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 		/* BB check if anything else missing out of ppw
 		   such as updating last write time */
 		page_data = kmap(page);
-		rc = cifs_write(file, page_data + offset, copied, &pos);
+		rc = cifs_write(file->private_data, page_data + offset,
+				copied, &pos);
 		/* if (rc < 0) should we set writebehind rc? */
 		kunmap(page);
 
@@ -1648,28 +1523,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, int datasync)
 {
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	xid = GetXid();
 
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
 		file->f_path.dentry->d_name.name, datasync);
 
-	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc == 0) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-		tcon = CIFS_SB(inode->i_sb)->tcon;
-		if (!rc && tcon && smbfile &&
-		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
-	}
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+
+	FreeXid(xid);
+	return rc;
+}
+
+int cifs_fsync(struct file *file, int datasync)
+{
+	int xid;
+	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+
+	xid = GetXid();
+
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		file->f_path.dentry->d_name.name, datasync);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
 	return rc;
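cifs_fsync is split in two here: the strict variant drops cached pages before flushing whenever the client holds no read oplock, so the flush cannot be satisfied from stale pagecache. A tiny hedged model of that ordering; all names below are hypothetical:

#include <stdbool.h>
#include <stdio.h>

struct state { bool can_cache_read; bool dirty_cache; };

/* Models cifs_strict_fsync(): invalidate first if no read oplock, then flush. */
static int strict_fsync(struct state *s)
{
	if (!s->can_cache_read)
		s->dirty_cache = false;	/* models cifs_invalidate_mapping() */
	return 0;			/* models CIFSSMBFlush() succeeding */
}

int main(void)
{
	struct state s = { .can_cache_read = false, .dirty_cache = true };
	strict_fsync(&s);
	printf("cache kept: %d\n", s.dirty_cache);	/* prints 0 */
	return 0;
}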
@@ -1712,92 +1606,278 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 
-	/* Rather than do the steps manually:
-	   lock the inode for writing
-	   loop through pages looking for write behind data (dirty pages)
-	   coalesce into contiguous 16K (or smaller) chunks to write to server
-	   send to server (prefer in parallel)
-	   deal with writebehind errors
-	   unlock inode for writing
-	   filemapfdatawrite appears easier for the time being */
-
-	rc = filemap_fdatawrite(inode->i_mapping);
-	/* reset wb rc if we were able to write out dirty pages */
-	if (!rc) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-	}
+	if (file->f_mode & FMODE_WRITE)
+		rc = filemap_write_and_wait(inode->i_mapping);
 
 	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
 	return rc;
 }
 
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
-		       size_t read_size, loff_t *poffset)
+static int
+cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
 {
-	int rc = -EACCES;
+	int rc = 0;
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = alloc_page(__GFP_HIGHMEM);
+		if (!pages[i]) {
+			/*
+			 * save number of pages we have already allocated and
+			 * return with ENOMEM error
+			 */
+			num_pages = i;
+			rc = -ENOMEM;
+			goto error;
+		}
+	}
+
+	return rc;
+
+error:
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	return rc;
+}
+
+static inline
+size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
+{
+	size_t num_pages;
+	size_t clen;
+
+	clen = min_t(const size_t, len, wsize);
+	num_pages = clen / PAGE_CACHE_SIZE;
+	if (clen % PAGE_CACHE_SIZE)
+		num_pages++;
+
+	if (cur_len)
+		*cur_len = clen;
+
+	return num_pages;
+}
+
+static ssize_t
+cifs_iovec_write(struct file *file, const struct iovec *iov,
+		 unsigned long nr_segs, loff_t *poffset)
+{
+	unsigned int written;
+	unsigned long num_pages, npages, i;
+	size_t copied, len, cur_len;
+	ssize_t total_written = 0;
+	struct kvec *to_send;
+	struct page **pages;
+	struct iov_iter it;
+	struct inode *inode;
+	struct cifsFileInfo *open_file;
+	struct cifsTconInfo *pTcon;
+	struct cifs_sb_info *cifs_sb;
+	int xid, rc;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
+
+	rc = generic_write_checks(file, poffset, &len, 0);
+	if (rc)
+		return rc;
+
+	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+
+	pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
+	if (!to_send) {
+		kfree(pages);
+		return -ENOMEM;
+	}
+
+	rc = cifs_write_allocate_pages(pages, num_pages);
+	if (rc) {
+		kfree(pages);
+		kfree(to_send);
+		return rc;
+	}
+
+	xid = GetXid();
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
+	inode = file->f_path.dentry->d_inode;
+
+	iov_iter_init(&it, iov, nr_segs, len, 0);
+	npages = num_pages;
+
+	do {
+		size_t save_len = cur_len;
+		for (i = 0; i < npages; i++) {
+			copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
+			copied = iov_iter_copy_from_user(pages[i], &it, 0,
+							 copied);
+			cur_len -= copied;
+			iov_iter_advance(&it, copied);
+			to_send[i+1].iov_base = kmap(pages[i]);
+			to_send[i+1].iov_len = copied;
+		}
+
+		cur_len = save_len - cur_len;
+
+		do {
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, false);
+				if (rc != 0)
+					break;
+			}
+			rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
+					   cur_len, *poffset, &written,
+					   to_send, npages, 0);
+		} while (rc == -EAGAIN);
+
+		for (i = 0; i < npages; i++)
+			kunmap(pages[i]);
+
+		if (written) {
+			len -= written;
+			total_written += written;
+			cifs_update_eof(CIFS_I(inode), *poffset, written);
+			*poffset += written;
+		} else if (rc < 0) {
+			if (!total_written)
+				total_written = rc;
+			break;
+		}
+
+		/* get length and number of kvecs of the next write */
+		npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+	} while (len > 0);
+
+	if (total_written > 0) {
+		spin_lock(&inode->i_lock);
+		if (*poffset > inode->i_size)
+			i_size_write(inode, *poffset);
+		spin_unlock(&inode->i_lock);
+	}
+
+	cifs_stats_bytes_written(pTcon, total_written);
+	mark_inode_dirty_sync(inode);
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(to_send);
+	kfree(pages);
+	FreeXid(xid);
+	return total_written;
+}
+
+static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	ssize_t written;
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	/*
+	 * BB - optimize the way when signing is disabled. We can drop this
+	 * extra memory-to-memory copying and use iovec buffers for constructing
+	 * write request.
+	 */
+
+	written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+	if (written > 0) {
+		CIFS_I(inode)->invalid_mapping = true;
+		iocb->ki_pos = pos;
+	}
+
+	return written;
+}
+
+ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+			   unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheAll)
+		return generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to write the data to the server exactly
+	 * from the pos to pos+len-1 rather than flush all affected pages
+	 * because it may cause a error with mandatory locks on these pages but
+	 * not on the region from pos to ppos+len-1.
+	 */
+
+	return cifs_user_writev(iocb, iov, nr_segs, pos);
+}
+
+static ssize_t
+cifs_iovec_read(struct file *file, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *poffset)
+{
+	int rc;
+	int xid;
+	ssize_t total_read;
 	unsigned int bytes_read = 0;
-	unsigned int total_read = 0;
-	unsigned int current_read_size;
+	size_t len, cur_len;
+	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid;
 	struct cifsFileInfo *open_file;
-	char *smb_read_data;
-	char __user *current_offset;
 	struct smb_com_read_rsp *pSMBr;
+	char *read_data;
+
+	if (!nr_segs)
+		return 0;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
-	if (file->private_data == NULL) {
-		rc = -EBADF;
-		FreeXid(xid);
-		return rc;
-	}
 	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
 
-	for (total_read = 0, current_offset = read_data;
-	     read_size > total_read;
-	     total_read += bytes_read, current_offset += bytes_read) {
-		current_read_size = min_t(const int, read_size - total_read,
-					  cifs_sb->rsize);
+	for (total_read = 0; total_read < len; total_read += bytes_read) {
+		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
 		rc = -EAGAIN;
-		smb_read_data = NULL;
+		read_data = NULL;
+
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
-			rc = CIFSSMBRead(xid, pTcon,
-					 open_file->netfid,
-					 current_read_size, *poffset,
-					 &bytes_read, &smb_read_data,
-					 &buf_type);
-			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-			if (smb_read_data) {
-				if (copy_to_user(current_offset,
-						 smb_read_data +
-						 4 /* RFC1001 length field */ +
-						 le16_to_cpu(pSMBr->DataOffset),
-						 bytes_read))
+			rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
+					 cur_len, *poffset, &bytes_read,
+					 &read_data, &buf_type);
+			pSMBr = (struct smb_com_read_rsp *)read_data;
+			if (read_data) {
+				char *data_offset = read_data + 4 +
+						le16_to_cpu(pSMBr->DataOffset);
+				if (memcpy_toiovecend(iov, data_offset,
+						      iov_offset, bytes_read))
 					rc = -EFAULT;
-
 				if (buf_type == CIFS_SMALL_BUFFER)
-					cifs_small_buf_release(smb_read_data);
+					cifs_small_buf_release(read_data);
 				else if (buf_type == CIFS_LARGE_BUFFER)
-					cifs_buf_release(smb_read_data);
-				smb_read_data = NULL;
+					cifs_buf_release(read_data);
+				read_data = NULL;
+				iov_offset += bytes_read;
 			}
 		}
+
 		if (rc || (bytes_read == 0)) {
 			if (total_read) {
 				break;
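The new uncached write path above chunks each request by the negotiated wsize and rounds up to whole pages via the added get_numpages() helper. A standalone C sketch of the same arithmetic; PAGE_SIZE is fixed at 4096 here purely for illustration:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Mirrors get_numpages(): clamp the request to the negotiated wsize, then
 * round the resulting chunk up to whole pages. */
static unsigned long numpages(unsigned long wsize, unsigned long len,
			      unsigned long *cur_len)
{
	unsigned long clen = len < wsize ? len : wsize;
	unsigned long n = clen / PAGE_SIZE + (clen % PAGE_SIZE ? 1 : 0);

	if (cur_len)
		*cur_len = clen;
	return n;
}

int main(void)
{
	unsigned long cur;
	/* 57344-byte wsize, 200000-byte write: 14 pages, 57344 bytes/chunk */
	printf("%lu pages, %lu bytes\n", numpages(57344, 200000, &cur), cur);
	return 0;
}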
@@ -1810,13 +1890,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			*poffset += bytes_read;
 		}
 	}
+
 	FreeXid(xid);
 	return total_read;
 }
 
+ssize_t cifs_user_read(struct file *file, char __user *read_data,
+		       size_t read_size, loff_t *poffset)
+{
+	struct iovec iov;
+	iov.iov_base = read_data;
+	iov.iov_len = read_size;
+
+	return cifs_iovec_read(file, &iov, 1, poffset);
+}
+
+static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	ssize_t read;
+
+	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
+	if (read > 0)
+		iocb->ki_pos = pos;
+
+	return read;
+}
+
+ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheRead)
+		return generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to read from the server all the time
+	 * if we don't have level II oplock because the server can delay mtime
+	 * change - so we can't make a decision about inode invalidating.
+	 * And we can also fail with pagereading if there are mandatory locks
+	 * on pages affected by this read but not on the region from pos to
+	 * pos+len-1.
+	 */
+
+	return cifs_user_readv(iocb, iov, nr_segs, pos);
+}
 
 static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 			 loff_t *poffset)
 {
 	int rc = -EACCES;
 	unsigned int bytes_read = 0;
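cifs_strict_readv above only trusts the pagecache while a level II (read) oplock is held; otherwise every read goes to the server, since the server may delay mtime changes and cache validation is unsafe. A small sketch of that dispatch decision; the names are illustrative, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

enum path { CACHED_READ, UNCACHED_READ };

/* Models the branch in cifs_strict_readv(): cached read only with a read
 * oplock, otherwise fall through to the uncached iovec read. */
static enum path choose_read_path(bool client_can_cache_read)
{
	return client_can_cache_read ? CACHED_READ : UNCACHED_READ;
}

int main(void)
{
	printf("%s\n", choose_read_path(false) == UNCACHED_READ
		       ? "read goes to the server" : "read from pagecache");
	return 0;
}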
@@ -1831,7 +1955,6 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -1839,6 +1962,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		return rc;
 	}
 	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1857,9 +1981,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		}
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -1885,6 +2008,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	return total_read;
 }
 
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	xid = GetXid();
+
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	rc = generic_file_mmap(file, vma);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int rc, xid;
@@ -1974,7 +2112,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	}
 	open_file = file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2022,9 +2160,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -2173,18 +2310,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
-			read_unlock(&GlobalSMBSeslock);
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
+			spin_unlock(&cifs_file_list_lock);
 			return 1;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return 0;
 }
 
@@ -2310,10 +2443,9 @@ void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
 						  oplock_break);
-	struct inode *inode = cfile->pInode;
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
-	int rc, waitrc = 0;
+	int rc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
 		if (cinode->clientCanCacheRead)
@@ -2322,13 +2454,10 @@ void cifs_oplock_break(struct work_struct *work)
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
-			waitrc = filemap_fdatawait(inode->i_mapping);
+			rc = filemap_fdatawait(inode->i_mapping);
+			mapping_set_error(inode->i_mapping, rc);
 			invalidate_remote_inode(inode);
 		}
-		if (!rc)
-			rc = waitrc;
-		if (rc)
-			cinode->write_behind_rc = rc;
 		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
@@ -2338,33 +2467,37 @@ void cifs_oplock_break(struct work_struct *work)
 	 * not bother sending an oplock release if session to server still is
 	 * disconnected since oplock already released by the server
	 */
-	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
-		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
-				 LOCKING_ANDX_OPLOCK_RELEASE, false);
+	if (!cfile->oplock_break_cancelled) {
+		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+				 cinode->clientCanCacheRead ? 1 : 0);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
 
 	/*
 	 * We might have kicked in before is_valid_oplock_break()
 	 * finished grabbing reference for us.  Make sure it's done by
-	 * waiting for GlobalSMSSeslock.
+	 * waiting for cifs_file_list_lock.
	 */
-	write_lock(&GlobalSMBSeslock);
-	write_unlock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
+	spin_unlock(&cifs_file_list_lock);
 
 	cifs_oplock_break_put(cfile);
 }
 
+/* must be called while holding cifs_file_list_lock */
 void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-	mntget(cfile->mnt);
+	cifs_sb_active(cfile->dentry->d_sb);
 	cifsFileInfo_get(cfile);
 }
 
 void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-	mntput(cfile->mnt);
+	struct super_block *sb = cfile->dentry->d_sb;
+
 	cifsFileInfo_put(cfile);
+	cifs_sb_deactive(sb);
 }
 
 const struct address_space_operations cifs_addr_ops = {
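The oplock-break get/put pair above now pins the superblock instead of the vfsmount. A userspace model of the balanced acquire/release discipline the two helpers enforce; all names are illustrative:

#include <stdio.h>

struct obj { int refs; };

static void get(struct obj *o) { o->refs++; }
static void put(struct obj *o) { o->refs--; }

/* Models cifs_oplock_break_get(): pin sb, then the open file. */
static void oplock_break_get(struct obj *sb, struct obj *file)
{
	get(sb);	/* models cifs_sb_active() */
	get(file);	/* models cifsFileInfo_get() */
}

/* Models cifs_oplock_break_put(): drop the file ref, then the sb ref. */
static void oplock_break_put(struct obj *sb, struct obj *file)
{
	put(file);	/* models cifsFileInfo_put() */
	put(sb);	/* models cifs_sb_deactive() */
}

int main(void)
{
	struct obj sb = { 1 }, file = { 1 };
	oplock_break_get(&sb, &file);
	oplock_break_put(&sb, &file);
	printf("%d %d\n", sb.refs, file.refs);	/* both back to 1 */
	return 0;
}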
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 9f3f5c4be161..297a43d0ff7f 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
  *   fs/cifs/fscache.c - CIFS filesystem cache interface
  *
  *   Copyright (c) 2010 Novell, Inc.
- *   Author(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
  *
  *   This library is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU Lesser General Public License as published
@@ -62,15 +62,17 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (cifsi->fscache)
 		return;
 
-	cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
-				&cifs_fscache_inode_object_def,
-				cifsi);
-	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
-			cifs_sb->tcon->fscache, cifsi->fscache);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
+		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+				&cifs_fscache_inode_object_def, cifsi);
+		cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+				cifsi->fscache);
+	}
 }
 
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
 {
 	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
 		cifs_fscache_disable_inode_cookie(inode);
-	else {
+	else
 		cifs_fscache_enable_inode_cookie(inode);
-		cFYI(1, "CIFS: fscache inode cookie set");
-	}
 }
 
 void cifs_fscache_reset_inode_cookie(struct inode *inode)
@@ -117,7 +117,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 	/* retire the current fscache cache and get a new one */
 	fscache_relinquish_cookie(cifsi->fscache, 1);
 
-	cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+	cifsi->fscache = fscache_acquire_cookie(
+				cifs_sb_master_tcon(cifs_sb)->fscache,
 				&cifs_fscache_inode_object_def,
 				cifsi);
 	cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
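The fscache hunk gates inode cookie acquisition on the mount flag rather than acquiring one unconditionally. A trivial model of the new condition; the names below are hypothetical:

#include <stdbool.h>
#include <stdio.h>

/* Models cifs_fscache_enable_inode_cookie(): no-op if a cookie exists,
 * acquire one only when the mount asked for fscache. */
static bool acquire_cookie(bool mounted_with_fscache, bool already_has_cookie)
{
	if (already_has_cookie)
		return true;
	return mounted_with_fscache;	/* models the CIFS_MOUNT_FSCACHE test */
}

int main(void)
{
	printf("cookie acquired: %d\n", acquire_cookie(false, false)); /* 0 */
	return 0;
}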
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 53cce8cc2224..8852470b4fbb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
 #include "fscache.h"
 
 
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -44,15 +44,19 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 			inode->i_fop = &cifs_file_direct_nobrl_ops;
 		else
 			inode->i_fop = &cifs_file_direct_ops;
+	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			inode->i_fop = &cifs_file_strict_nobrl_ops;
+		else
+			inode->i_fop = &cifs_file_strict_ops;
 	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 		inode->i_fop = &cifs_file_nobrl_ops;
 	else { /* not direct, send byte range locks */
 		inode->i_fop = &cifs_file_ops;
 	}
 
-
 	/* check if server can support readpages */
-	if (cifs_sb->tcon->ses->server->maxBuf <
+	if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 		inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 	else
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 		break;
 	case S_IFDIR:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-		if (is_dfs_referral) {
+		if (IS_AUTOMOUNT(inode)) {
 			inode->i_op = &cifs_dfs_referral_inode_operations;
 		} else {
 #else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	}
 	spin_unlock(&inode->i_lock);
 
-	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+		inode->i_flags |= S_AUTOMOUNT;
+	cifs_set_ops(inode);
 }
 
 void
@@ -288,8 +294,8 @@ int cifs_get_file_info_unix(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -313,15 +319,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	FILE_UNIX_BASIC_INFO find_data;
 	struct cifs_fattr fattr;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	tcon = cifs_sb->tcon;
 	cFYI(1, "Getting info on %s", full_path);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
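From here on, the inode.c hunks all follow one pattern: look up a tcon link per operation with cifs_sb_tlink(), check it IS_ERR-style, and release it with cifs_put_tlink() on every exit path. A self-contained userspace sketch of that lifecycle, using an ERR_PTR-like encoding; everything below is illustrative, not the kernel API:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct tlink { int users; };

/* Models cifs_sb_tlink(): on failure return an encoded errno as a pointer. */
static struct tlink *sb_tlink(int fail)
{
	static struct tlink link;
	if (fail)
		return (struct tlink *)(intptr_t)-EACCES; /* models ERR_PTR() */
	link.users++;
	return &link;
}

static int is_err(const struct tlink *t)
{
	return (uintptr_t)t >= (uintptr_t)-4095;	/* models IS_ERR() */
}

static void put_tlink(struct tlink *t) { t->users--; }

/* Models the per-operation pattern in cifs_get_inode_info_unix() and
 * friends: get link, bail early on error, always put on the way out. */
static int do_path_op(int fail_lookup)
{
	struct tlink *tlink = sb_tlink(fail_lookup);
	int rc = 0;

	if (is_err(tlink))
		return (int)(intptr_t)tlink;	/* models PTR_ERR() */
	/* ... issue the SMB call against the link's tcon here ... */
	put_tlink(tlink);
	return rc;
}

int main(void)
{
	printf("%d %d\n", do_path_op(0), do_path_op(1));	/* 0 -13 */
	return 0;
}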
@@ -332,6 +344,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		return rc;
 	}
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (*pinode == NULL) {
 		/* get new inode */
 		cifs_fill_uniqueid(sb, &fattr);
@@ -353,7 +372,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	int rc;
 	int oplock = 0;
 	__u16 netfid;
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
@@ -372,7 +392,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
			 cifs_sb->local_nls,
			 cifs_sb->mnt_cifs_flags &
@@ -380,7 +405,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 		/* Read header */
-		rc = CIFSSMBRead(xid, pTcon, netfid,
+		rc = CIFSSMBRead(xid, tcon, netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
@@ -422,8 +447,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, tcon, netfid);
 	}
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -441,11 +467,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	ssize_t rc;
 	char ea_value[4];
 	__u32 mode;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 
-	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
			    cifs_sb->mnt_cifs_flags &
				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
@@ -468,6 +502,8 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
 	if (info->DeletePending)
@@ -482,12 +518,13 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
 
 	if (adjust_tz) {
-		fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-		fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+		fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
 	}
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -515,8 +552,8 @@ int cifs_get_file_info(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -554,26 +591,33 @@ int cifs_get_inode_info(struct inode **pinode,
 {
 	int rc = 0, tmprc;
 	struct cifsTconInfo *pTcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	char *buf = NULL;
 	bool adjustTZ = false;
 	struct cifs_fattr fattr;
 
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
 			cFYI(1, "No need to revalidate cached inode sizes");
-			return rc;
+			goto cgii_exit;
 		}
 	}
 
 	/* if file info not passed in then get it from server */
 	if (pfindData == NULL) {
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
+		if (buf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 		pfindData = (FILE_ALL_INFO *)buf;
 
 		/* could do find first instead but this returns more info */
@@ -649,18 +693,30 @@ int cifs_get_inode_info(struct inode **pinode,
 			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		cFYI(1, "Getting mode bits from ACL");
-		cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
+		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
+						pfid);
+		if (rc) {
+			cFYI(1, "%s: Getting ACL failed with error: %d",
+				__func__, rc);
+			goto cgii_exit;
+		}
 	}
-#endif
+#endif /* CONFIG_CIFS_ACL */
 
 	/* fill in remaining high mode bits e.g. SUID, VTX */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 		cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (!*pinode) {
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
@@ -671,6 +727,7 @@ int cifs_get_inode_info(struct inode **pinode,
 
 cgii_exit:
 	kfree(buf);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -678,7 +735,8 @@ static const struct inode_operations cifs_ipc_inode_ops = {
 	.lookup = cifs_lookup,
 };
 
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+			      struct cifsTconInfo *tcon)
 {
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
@@ -692,8 +750,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 	}
 
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 
@@ -702,7 +760,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		/* switch slash direction in prepath depending on whether
		 * windows or posix style path names
		 */
@@ -728,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/* use createtime like an i_generation field */
+	if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+		return 0;
+
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
 		return 0;
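With cf_createtime recorded, inode matching above doubles the creation time as a generation number, as the new comment says: a deleted-and-recreated file whose unique id was recycled gets a fresh inode. A minimal sketch of the extended match predicate; the struct and values are hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct attrs { uint64_t uniqueid; uint64_t createtime; };

/* Models cifs_find_inode(): reuse a cached inode only when both the server
 * object's unique id and its creation time match. */
static bool inode_matches(const struct attrs *cached, const struct attrs *found)
{
	if (cached->uniqueid != found->uniqueid)
		return false;
	/* use createtime like an i_generation field */
	return cached->createtime == found->createtime;
}

int main(void)
{
	struct attrs old = { 42, 1000 }, recreated = { 42, 2000 };
	printf("match: %d\n", inode_matches(&old, &recreated));	/* prints 0 */
	return 0;
}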
@@ -745,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
 
 	CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+	CIFS_I(inode)->createtime = fattr->cf_createtime;
 	return 0;
 }
 
@@ -758,14 +821,14 @@ inode_has_hashed_dentries(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
-			spin_unlock(&dcache_lock);
+			spin_unlock(&inode->i_lock);
 			return true;
 		}
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&inode->i_lock);
 	return false;
 }
 
@@ -818,32 +881,34 @@ retry_iget5_locked:
 struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct inode *inode = NULL;
 	long rc;
 	char *full_path;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	cifs_sb = CIFS_SB(sb);
-	full_path = cifs_build_path_to_root(cifs_sb);
+	full_path = cifs_build_path_to_root(cifs_sb, tcon);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	xid = GetXid();
-	if (cifs_sb->tcon->unix_ext)
+	if (tcon->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
						xid, NULL);
 
-	if (!inode)
-		return ERR_PTR(rc);
+	if (!inode) {
+		inode = ERR_PTR(rc);
+		goto out;
+	}
 
 #ifdef CONFIG_CIFS_FSCACHE
 	/* populate tcon->resource_id */
-	cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
+	tcon->resource_id = CIFS_I(inode)->uniqueid;
 #endif
 
-	if (rc && cifs_sb->tcon->ipc) {
+	if (rc && tcon->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
@@ -852,13 +917,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		inode->i_uid = cifs_sb->mnt_uid;
 		inode->i_gid = cifs_sb->mnt_gid;
 	} else if (rc) {
-		kfree(full_path);
-		_FreeXid(xid);
 		iget_failed(inode);
-		return ERR_PTR(rc);
+		inode = ERR_PTR(rc);
 	}
 
-
+out:
 	kfree(full_path);
 	/* can not call macro FreeXid here since in a void func
	 * TODO: This is no longer true
@@ -879,7 +942,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *pTcon;
 	FILE_BASIC_INFO	info_buf;
 
 	if (attrs == NULL)
@@ -918,13 +982,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	/*
	 * If the file is already open for write, just use that fileid
	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
 		goto set_via_filehandle;
 	}
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	pTcon = tlink_tcon(tlink);
+
 	/*
	 * NT4 apparently returns success on this call, but it doesn't
	 * really work.
@@ -968,6 +1041,8 @@ set_via_filehandle:
 	else
 		cifsFileInfo_put(open_file);
 out:
+	if (tlink != NULL)
+		cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -985,10 +1060,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	__u32 dosattr, origattr;
 	FILE_BASIC_INFO *info_buf = NULL;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1057,6 +1138,7 @@ out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
 	kfree(info_buf);
+	cifs_put_tlink(tlink);
 	return rc;
 
 	/*
@@ -1096,12 +1178,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct cifsInodeInfo *cifs_inode;
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
 	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	/* Unlink can be called from rename so we can not take the
@@ -1109,8 +1197,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto unlink_out;
 	}
 
 	if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1176,10 +1263,11 @@ out_reval:
 		dir->i_ctime = dir->i_mtime = current_fs_time(sb);
 	cifs_inode = CIFS_I(dir);
 	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
-
+unlink_out:
 	kfree(full_path);
 	kfree(attrs);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1188,6 +1276,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
@@ -1195,16 +1284,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 
 	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto mkdir_out;
 	}
 
 	if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1239,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
	to set uid/gid */
 			inc_nlink(inode);
-			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
-			else
-				direntry->d_op = &cifs_dentry_ops;
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
 			cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1283,10 +1370,6 @@ mkdir_get_info:
 		rc = cifs_get_inode_info(&newinode, full_path, NULL,
					 inode->i_sb, xid, NULL);
 
-		if (pTcon->nocase)
-			direntry->d_op = &cifs_ci_dentry_ops;
-		else
-			direntry->d_op = &cifs_dentry_ops;
 		d_instantiate(direntry, newinode);
 		/* setting nlink not necessary except in cases where we
		 * failed to get it from the server or was set bogus */
@@ -1362,6 +1445,7 @@ mkdir_get_info:
 mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1370,6 +1454,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	int rc = 0;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
@@ -1378,18 +1463,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto rmdir_exit;
 	}
 
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto rmdir_exit;
+	}
+	pTcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		drop_nlink(inode);
@@ -1410,6 +1500,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
		current_fs_time(inode->i_sb);
 
+rmdir_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1420,10 +1511,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
	       struct dentry *to_dentry, const char *toPath)
1421{ 1512{
1422 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); 1513 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1423 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1514 struct tcon_link *tlink;
1515 struct cifsTconInfo *pTcon;
1424 __u16 srcfid; 1516 __u16 srcfid;
1425 int oplock, rc; 1517 int oplock, rc;
1426 1518
1519 tlink = cifs_sb_tlink(cifs_sb);
1520 if (IS_ERR(tlink))
1521 return PTR_ERR(tlink);
1522 pTcon = tlink_tcon(tlink);
1523
1427 /* try path-based rename first */ 1524 /* try path-based rename first */
1428 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, 1525 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
1429 cifs_sb->mnt_cifs_flags & 1526 cifs_sb->mnt_cifs_flags &
@@ -1435,11 +1532,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1435 * rename by filehandle to various Windows servers. 1532 * rename by filehandle to various Windows servers.
1436 */ 1533 */
1437 if (rc == 0 || rc != -ETXTBSY) 1534 if (rc == 0 || rc != -ETXTBSY)
1438 return rc; 1535 goto do_rename_exit;
1439 1536
1440 /* open-file renames don't work across directories */ 1537 /* open-file renames don't work across directories */
1441 if (to_dentry->d_parent != from_dentry->d_parent) 1538 if (to_dentry->d_parent != from_dentry->d_parent)
1442 return rc; 1539 goto do_rename_exit;
1443 1540
1444 /* open the file to be renamed -- we need DELETE perms */ 1541 /* open the file to be renamed -- we need DELETE perms */
1445 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1542 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1455,7 +1552,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1455 1552
1456 CIFSSMBClose(xid, pTcon, srcfid); 1553 CIFSSMBClose(xid, pTcon, srcfid);
1457 } 1554 }
1458 1555do_rename_exit:
1556 cifs_put_tlink(tlink);
1459 return rc; 1557 return rc;
1460} 1558}
1461 1559
@@ -1465,13 +1563,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1465 char *fromName = NULL; 1563 char *fromName = NULL;
1466 char *toName = NULL; 1564 char *toName = NULL;
1467 struct cifs_sb_info *cifs_sb; 1565 struct cifs_sb_info *cifs_sb;
1566 struct tcon_link *tlink;
1468 struct cifsTconInfo *tcon; 1567 struct cifsTconInfo *tcon;
1469 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1568 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1470 FILE_UNIX_BASIC_INFO *info_buf_target; 1569 FILE_UNIX_BASIC_INFO *info_buf_target;
1471 int xid, rc, tmprc; 1570 int xid, rc, tmprc;
1472 1571
1473 cifs_sb = CIFS_SB(source_dir->i_sb); 1572 cifs_sb = CIFS_SB(source_dir->i_sb);
1474 tcon = cifs_sb->tcon; 1573 tlink = cifs_sb_tlink(cifs_sb);
1574 if (IS_ERR(tlink))
1575 return PTR_ERR(tlink);
1576 tcon = tlink_tcon(tlink);
1475 1577
1476 xid = GetXid(); 1578 xid = GetXid();
1477 1579
@@ -1547,6 +1649,7 @@ cifs_rename_exit:
1547 kfree(fromName); 1649 kfree(fromName);
1548 kfree(toName); 1650 kfree(toName);
1549 FreeXid(xid); 1651 FreeXid(xid);
1652 cifs_put_tlink(tlink);
1550 return rc; 1653 return rc;
1551} 1654}
1552 1655
@@ -1554,6 +1657,7 @@ static bool
1554cifs_inode_needs_reval(struct inode *inode) 1657cifs_inode_needs_reval(struct inode *inode)
1555{ 1658{
1556 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 1659 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1660 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1557 1661
1558 if (cifs_i->clientCanCacheRead) 1662 if (cifs_i->clientCanCacheRead)
1559 return false; 1663 return false;
@@ -1564,20 +1668,22 @@ cifs_inode_needs_reval(struct inode *inode)
1564 if (cifs_i->time == 0) 1668 if (cifs_i->time == 0)
1565 return true; 1669 return true;
1566 1670
1567 /* FIXME: the actimeo should be tunable */ 1671 if (!time_in_range(jiffies, cifs_i->time,
1568 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1672 cifs_i->time + cifs_sb->actimeo))
1569 return true; 1673 return true;
1570 1674
1571 /* hardlinked files w/ noserverino get "special" treatment */ 1675 /* hardlinked files w/ noserverino get "special" treatment */
1572 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && 1676 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1573 S_ISREG(inode->i_mode) && inode->i_nlink != 1) 1677 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1574 return true; 1678 return true;
1575 1679
1576 return false; 1680 return false;
1577} 1681}
1578 1682
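
Above, the hard-coded one-second attribute timeout (HZ) becomes the per-superblock actimeo tunable, and the comparison switches from time_after_eq() to time_in_range(), which stays correct across jiffies wraparound. Condensed, the freshness check is now (sketch using only names from this hunk):

	/* cached attributes are trusted only while jiffies lies inside
	 * [cifs_i->time, cifs_i->time + cifs_sb->actimeo]; outside that
	 * window (including after a jiffies wrap) we revalidate */
	if (!time_in_range(jiffies, cifs_i->time,
			   cifs_i->time + cifs_sb->actimeo))
		return true;
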
1579/* check invalid_mapping flag and zap the cache if it's set */ 1683/*
1580static void 1684 * Zap the cache. Called when invalid_mapping flag is set.
1685 */
1686void
1581cifs_invalidate_mapping(struct inode *inode) 1687cifs_invalidate_mapping(struct inode *inode)
1582{ 1688{
1583 int rc; 1689 int rc;
@@ -1588,8 +1694,7 @@ cifs_invalidate_mapping(struct inode *inode)
1588 /* write back any cached data */ 1694 /* write back any cached data */
1589 if (inode->i_mapping && inode->i_mapping->nrpages != 0) { 1695 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1590 rc = filemap_write_and_wait(inode->i_mapping); 1696 rc = filemap_write_and_wait(inode->i_mapping);
1591 if (rc) 1697 mapping_set_error(inode->i_mapping, rc);
1592 cifs_i->write_behind_rc = rc;
1593 } 1698 }
1594 invalidate_remote_inode(inode); 1699 invalidate_remote_inode(inode);
1595 cifs_fscache_reset_inode_cookie(inode); 1700 cifs_fscache_reset_inode_cookie(inode);
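
The write_behind_rc bookkeeping removed here was cifs-private; mapping_set_error() instead latches a writeback failure as AS_EIO or AS_ENOSPC on the address_space itself, so the next fsync()/msync() on the file reports it through the generic VFS path. The idiom, as used in this and the two setattr hunks below:

	rc = filemap_write_and_wait(inode->i_mapping);
	mapping_set_error(inode->i_mapping, rc);	/* no-op when rc == 0 */
	rc = 0;		/* the current operation proceeds regardless */
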
@@ -1599,11 +1704,12 @@ int cifs_revalidate_file(struct file *filp)
1599{ 1704{
1600 int rc = 0; 1705 int rc = 0;
1601 struct inode *inode = filp->f_path.dentry->d_inode; 1706 struct inode *inode = filp->f_path.dentry->d_inode;
1707 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
1602 1708
1603 if (!cifs_inode_needs_reval(inode)) 1709 if (!cifs_inode_needs_reval(inode))
1604 goto check_inval; 1710 goto check_inval;
1605 1711
1606 if (CIFS_SB(inode->i_sb)->tcon->unix_ext) 1712 if (tlink_tcon(cfile->tlink)->unix_ext)
1607 rc = cifs_get_file_info_unix(filp); 1713 rc = cifs_get_file_info_unix(filp);
1608 else 1714 else
1609 rc = cifs_get_file_info(filp); 1715 rc = cifs_get_file_info(filp);
@@ -1644,7 +1750,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1644 "jiffies %ld", full_path, inode, inode->i_count.counter, 1750 "jiffies %ld", full_path, inode, inode->i_count.counter,
1645 dentry, dentry->d_time, jiffies); 1751 dentry, dentry->d_time, jiffies);
1646 1752
1647 if (CIFS_SB(sb)->tcon->unix_ext) 1753 if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
1648 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1754 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1649 else 1755 else
1650 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 1756 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1660,13 +1766,29 @@ check_inval:
1660} 1766}
1661 1767
1662int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1768int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1663 struct kstat *stat) 1769 struct kstat *stat)
1664{ 1770{
1771 struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
1772 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
1665 int err = cifs_revalidate_dentry(dentry); 1773 int err = cifs_revalidate_dentry(dentry);
1774
1666 if (!err) { 1775 if (!err) {
1667 generic_fillattr(dentry->d_inode, stat); 1776 generic_fillattr(dentry->d_inode, stat);
1668 stat->blksize = CIFS_MAX_MSGSIZE; 1777 stat->blksize = CIFS_MAX_MSGSIZE;
1669 stat->ino = CIFS_I(dentry->d_inode)->uniqueid; 1778 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1779
1780 /*
1781 * If on a multiuser mount without unix extensions, and the
1782 * admin hasn't overridden them, set the ownership to the
1783 * fsuid/fsgid of the current process.
1784 */
1785 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1786 !tcon->unix_ext) {
1787 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1788 stat->uid = current_fsuid();
1789 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
1790 stat->gid = current_fsgid();
1791 }
1670 } 1792 }
1671 return err; 1793 return err;
1672} 1794}
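
On a CIFS_MOUNT_MULTIUSER mount without UNIX extensions the server has no per-user ownership to report, so getattr now substitutes the calling task's credentials unless they were pinned at mount time. Summarized (assuming the OVERR_UID/OVERR_GID flags correspond to the usual uid=/gid= mount options):

	/*
	 * st_uid/st_gid reported by stat(2) in that configuration:
	 *
	 *   mount options   st_uid            st_gid
	 *   (none)          current_fsuid()   current_fsgid()
	 *   uid=N           N                 current_fsgid()
	 *   uid=N,gid=M     N                 M
	 */
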
@@ -1708,7 +1830,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1708 struct cifsFileInfo *open_file; 1830 struct cifsFileInfo *open_file;
1709 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1831 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1710 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1832 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1711 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1833 struct tcon_link *tlink = NULL;
1834 struct cifsTconInfo *pTcon = NULL;
1712 1835
1713 /* 1836 /*
1714 * To avoid spurious oplock breaks from server, in the case of 1837 * To avoid spurious oplock breaks from server, in the case of
@@ -1719,10 +1842,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1719 * writebehind data than the SMB timeout for the SetPathInfo 1842 * writebehind data than the SMB timeout for the SetPathInfo
1720 * request would allow 1843 * request would allow
1721 */ 1844 */
1722 open_file = find_writable_file(cifsInode); 1845 open_file = find_writable_file(cifsInode, true);
1723 if (open_file) { 1846 if (open_file) {
1724 __u16 nfid = open_file->netfid; 1847 __u16 nfid = open_file->netfid;
1725 __u32 npid = open_file->pid; 1848 __u32 npid = open_file->pid;
1849 pTcon = tlink_tcon(open_file->tlink);
1726 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1850 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1727 npid, false); 1851 npid, false);
1728 cifsFileInfo_put(open_file); 1852 cifsFileInfo_put(open_file);
@@ -1737,6 +1861,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1737 rc = -EINVAL; 1861 rc = -EINVAL;
1738 1862
1739 if (rc != 0) { 1863 if (rc != 0) {
1864 if (pTcon == NULL) {
1865 tlink = cifs_sb_tlink(cifs_sb);
1866 if (IS_ERR(tlink))
1867 return PTR_ERR(tlink);
1868 pTcon = tlink_tcon(tlink);
1869 }
1870
1740 /* Set file size by pathname rather than by handle 1871 /* Set file size by pathname rather than by handle
1741 either because no valid, writeable file handle for 1872 either because no valid, writeable file handle for
1742 it was found or because there was an error setting 1873 it was found or because there was an error setting
@@ -1766,6 +1897,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1766 CIFSSMBClose(xid, pTcon, netfid); 1897 CIFSSMBClose(xid, pTcon, netfid);
1767 } 1898 }
1768 } 1899 }
1900 if (tlink)
1901 cifs_put_tlink(tlink);
1769 } 1902 }
1770 1903
1771 if (rc == 0) { 1904 if (rc == 0) {
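
cifs_set_file_size() gets the lazy variant of the tlink pattern: an open, writable filehandle already carries a tlink, so its tcon is borrowed for free, and a fresh cifs_sb_tlink() reference is taken only on the path-based fallback. The extra boolean passed to find_writable_file() is added elsewhere in this series (assumed here to restrict which handles match). Skeleton of the flow, error handling trimmed:

	struct tcon_link *tlink = NULL;
	struct cifsTconInfo *pTcon = NULL;

	open_file = find_writable_file(cifsInode, true);
	if (open_file)
		pTcon = tlink_tcon(open_file->tlink);	/* borrowed, no put */

	if (pTcon == NULL) {
		tlink = cifs_sb_tlink(cifs_sb);		/* takes a reference */
		if (IS_ERR(tlink))
			return PTR_ERR(tlink);
		pTcon = tlink_tcon(tlink);
		/* ... path-based SetPathInfo fallback using pTcon ... */
	}

	if (tlink)
		cifs_put_tlink(tlink);			/* only if we took one */
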
@@ -1786,7 +1919,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1786 struct inode *inode = direntry->d_inode; 1919 struct inode *inode = direntry->d_inode;
1787 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1920 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1788 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1921 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1789 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1922 struct tcon_link *tlink;
1923 struct cifsTconInfo *pTcon;
1790 struct cifs_unix_set_info_args *args = NULL; 1924 struct cifs_unix_set_info_args *args = NULL;
1791 struct cifsFileInfo *open_file; 1925 struct cifsFileInfo *open_file;
1792 1926
@@ -1820,10 +1954,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1820 * the flush returns error? 1954 * the flush returns error?
1821 */ 1955 */
1822 rc = filemap_write_and_wait(inode->i_mapping); 1956 rc = filemap_write_and_wait(inode->i_mapping);
1823 if (rc != 0) { 1957 mapping_set_error(inode->i_mapping, rc);
1824 cifsInode->write_behind_rc = rc; 1958 rc = 0;
1825 rc = 0;
1826 }
1827 1959
1828 if (attrs->ia_valid & ATTR_SIZE) { 1960 if (attrs->ia_valid & ATTR_SIZE) {
1829 rc = cifs_set_file_size(inode, attrs, xid, full_path); 1961 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1873,17 +2005,25 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1873 args->ctime = NO_CHANGE_64; 2005 args->ctime = NO_CHANGE_64;
1874 2006
1875 args->device = 0; 2007 args->device = 0;
1876 open_file = find_writable_file(cifsInode); 2008 open_file = find_writable_file(cifsInode, true);
1877 if (open_file) { 2009 if (open_file) {
1878 u16 nfid = open_file->netfid; 2010 u16 nfid = open_file->netfid;
1879 u32 npid = open_file->pid; 2011 u32 npid = open_file->pid;
2012 pTcon = tlink_tcon(open_file->tlink);
1880 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 2013 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1881 cifsFileInfo_put(open_file); 2014 cifsFileInfo_put(open_file);
1882 } else { 2015 } else {
2016 tlink = cifs_sb_tlink(cifs_sb);
2017 if (IS_ERR(tlink)) {
2018 rc = PTR_ERR(tlink);
2019 goto out;
2020 }
2021 pTcon = tlink_tcon(tlink);
1883 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 2022 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1884 cifs_sb->local_nls, 2023 cifs_sb->local_nls,
1885 cifs_sb->mnt_cifs_flags & 2024 cifs_sb->mnt_cifs_flags &
1886 CIFS_MOUNT_MAP_SPECIAL_CHR); 2025 CIFS_MOUNT_MAP_SPECIAL_CHR);
2026 cifs_put_tlink(tlink);
1887 } 2027 }
1888 2028
1889 if (rc) 2029 if (rc)
@@ -1956,10 +2096,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1956 * the flush returns error? 2096 * the flush returns error?
1957 */ 2097 */
1958 rc = filemap_write_and_wait(inode->i_mapping); 2098 rc = filemap_write_and_wait(inode->i_mapping);
1959 if (rc != 0) { 2099 mapping_set_error(inode->i_mapping, rc);
1960 cifsInode->write_behind_rc = rc; 2100 rc = 0;
1961 rc = 0;
1962 }
1963 2101
1964 if (attrs->ia_valid & ATTR_SIZE) { 2102 if (attrs->ia_valid & ATTR_SIZE) {
1965 rc = cifs_set_file_size(inode, attrs, xid, full_path); 2103 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1988,11 +2126,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1988 2126
1989 if (attrs->ia_valid & ATTR_MODE) { 2127 if (attrs->ia_valid & ATTR_MODE) {
1990 rc = 0; 2128 rc = 0;
1991#ifdef CONFIG_CIFS_EXPERIMENTAL 2129#ifdef CONFIG_CIFS_ACL
1992 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 2130 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
1993 rc = mode_to_acl(inode, full_path, mode); 2131 rc = mode_to_cifs_acl(inode, full_path, mode);
1994 else 2132 if (rc) {
1995#endif 2133 cFYI(1, "%s: Setting ACL failed with error: %d",
2134 __func__, rc);
2135 goto cifs_setattr_exit;
2136 }
2137 } else
2138#endif /* CONFIG_CIFS_ACL */
1996 if (((mode & S_IWUGO) == 0) && 2139 if (((mode & S_IWUGO) == 0) &&
1997 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { 2140 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
1998 2141
@@ -2051,7 +2194,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2051 2194
2052 setattr_copy(inode, attrs); 2195 setattr_copy(inode, attrs);
2053 mark_inode_dirty(inode); 2196 mark_inode_dirty(inode);
2054 return 0;
2055 2197
2056cifs_setattr_exit: 2198cifs_setattr_exit:
2057 kfree(full_path); 2199 kfree(full_path);
@@ -2064,7 +2206,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2064{ 2206{
2065 struct inode *inode = direntry->d_inode; 2207 struct inode *inode = direntry->d_inode;
2066 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2208 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2067 struct cifsTconInfo *pTcon = cifs_sb->tcon; 2209 struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
2068 2210
2069 if (pTcon->unix_ext) 2211 if (pTcon->unix_ext)
2070 return cifs_setattr_unix(direntry, attrs); 2212 return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9d38a71c8e14..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,11 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
37 int xid; 37 int xid;
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon;
40 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
41 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
42 __u64 caps; 44 __u64 caps;
43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile = filep->private_data;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -50,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
50 50
51 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
52 52
53#ifdef CONFIG_CIFS_POSIX
54 tcon = cifs_sb->tcon;
55 if (tcon)
56 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
57 else {
58 rc = -EIO;
59 FreeXid(xid);
60 return -EIO;
61 }
62#endif /* CONFIG_CIFS_POSIX */
63
64 switch (command) { 53 switch (command) {
65 case CIFS_IOC_CHECKUMOUNT: 54 case CIFS_IOC_CHECKUMOUNT:
66 cFYI(1, "User unmount attempted"); 55 cFYI(1, "User unmount attempted");
@@ -73,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
73 break; 62 break;
74#ifdef CONFIG_CIFS_POSIX 63#ifdef CONFIG_CIFS_POSIX
75 case FS_IOC_GETFLAGS: 64 case FS_IOC_GETFLAGS:
65 if (pSMBFile == NULL)
66 break;
67 tcon = tlink_tcon(pSMBFile->tlink);
68 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
76 if (CIFS_UNIX_EXTATTR_CAP & caps) { 69 if (CIFS_UNIX_EXTATTR_CAP & caps) {
77 if (pSMBFile == NULL)
78 break;
79 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 70 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
80 &ExtAttrBits, &ExtAttrMask); 71 &ExtAttrBits, &ExtAttrMask);
81 if (rc == 0) 72 if (rc == 0)
@@ -86,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
86 break; 77 break;
87 78
88 case FS_IOC_SETFLAGS: 79 case FS_IOC_SETFLAGS:
80 if (pSMBFile == NULL)
81 break;
82 tcon = tlink_tcon(pSMBFile->tlink);
83 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
89 if (CIFS_UNIX_EXTATTR_CAP & caps) { 84 if (CIFS_UNIX_EXTATTR_CAP & caps) {
90 if (get_user(ExtAttrBits, (int __user *)arg)) { 85 if (get_user(ExtAttrBits, (int __user *)arg)) {
91 rc = -EFAULT; 86 rc = -EFAULT;
92 break; 87 break;
93 } 88 }
94 if (pSMBFile == NULL)
95 break;
96 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 89 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
97 extAttrBits, &ExtAttrMask);*/ 90 extAttrBits, &ExtAttrMask);*/
98 } 91 }
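
The deleted ioctl prologue dereferenced cifs_sb->tcon for every command, including ones that never touch it, and NULL-checked only the tcon, never the file. Both flags ioctls now begin with the same guard sequence before any tcon access (restated from the hunks above):

	if (pSMBFile == NULL)
		break;			/* no open file: nothing to query */
	tcon = tlink_tcon(pSMBFile->tlink);
	caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
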
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca8033656..e8804d373404 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,6 +29,337 @@
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31 31
32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
34#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
35#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
36#define CIFS_MF_SYMLINK_FILE_SIZE \
37 (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
38
39#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
40#define CIFS_MF_SYMLINK_MD5_FORMAT \
41 "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
42#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
43 md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \
44 md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \
45 md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\
46 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
47
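
These offsets pin down the Minshall+French symlink container: a regular file of exactly 43 + 1024 = 1067 bytes (CIFS_MF_SYMLINK_FILE_SIZE), where "XSym\n" plus the "%04u\n" length line occupy the first 10 bytes and the 32-hex-digit MD5 line the next 33. For a target of "/foo/bar" (8 bytes) the file body would look like:

	offset  0:  "XSym\n"                            magic, 5 bytes
	offset  5:  "0008\n"                            target length as %04u
	offset 10:  32 hex chars of MD5(target), "\n"   integrity check
	offset 43:  "/foo/bar\n"                        the target itself
	            ... spaces pad the file out to 1067 bytes
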
48static int
49symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
50{
51 int rc;
52 unsigned int size;
53 struct crypto_shash *md5;
54 struct sdesc *sdescmd5;
55
56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
60 return rc;
61 }
62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
63 sdescmd5 = kmalloc(size, GFP_KERNEL);
64 if (!sdescmd5) {
65 rc = -ENOMEM;
66 cERROR(1, "%s: Memory allocation failure\n", __func__);
67 goto symlink_hash_err;
68 }
69 sdescmd5->shash.tfm = md5;
70 sdescmd5->shash.flags = 0x0;
71
72 rc = crypto_shash_init(&sdescmd5->shash);
73 if (rc) {
74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err;
76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
79
80symlink_hash_err:
81 crypto_free_shash(md5);
82 kfree(sdescmd5);
83
84 return rc;
85}
86
87static int
88CIFSParseMFSymlink(const u8 *buf,
89 unsigned int buf_len,
90 unsigned int *_link_len,
91 char **_link_str)
92{
93 int rc;
94 unsigned int link_len;
95 const char *md5_str1;
96 const char *link_str;
97 u8 md5_hash[16];
98 char md5_str2[34];
99
100 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
101 return -EINVAL;
102
103 md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
104 link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
105
106 rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
107 if (rc != 1)
108 return -EINVAL;
109
110 rc = symlink_hash(link_len, link_str, md5_hash);
111 if (rc) {
112 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
113 return rc;
114 }
115
116 snprintf(md5_str2, sizeof(md5_str2),
117 CIFS_MF_SYMLINK_MD5_FORMAT,
118 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
119
120 if (strncmp(md5_str1, md5_str2, 17) != 0)
121 return -EINVAL;
122
123 if (_link_str) {
124 *_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
125 if (!*_link_str)
126 return -ENOMEM;
127 }
128
129 *_link_len = link_len;
130 return 0;
131}
132
133static int
134CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
135{
136 int rc;
137 unsigned int link_len;
138 unsigned int ofs;
139 u8 md5_hash[16];
140
141 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
142 return -EINVAL;
143
144 link_len = strlen(link_str);
145
146 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
147 return -ENAMETOOLONG;
148
149 rc = symlink_hash(link_len, link_str, md5_hash);
150 if (rc) {
151 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
152 return rc;
153 }
154
155 snprintf(buf, buf_len,
156 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
157 link_len,
158 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
159
160 ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
161 memcpy(buf + ofs, link_str, link_len);
162
163 ofs += link_len;
164 if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
165 buf[ofs] = '\n';
166 ofs++;
167 }
168
169 while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
170 buf[ofs] = ' ';
171 ofs++;
172 }
173
174 return 0;
175}
176
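
CIFSFormatMFSymlink() and CIFSParseMFSymlink() are inverses over that container; a round-trip makes this concrete (illustrative sketch only: a 1067-byte on-stack buffer is fine in a test harness, not in real kernel paths, and error handling is elided):

	u8 buf[CIFS_MF_SYMLINK_FILE_SIZE];
	char *target = NULL;
	unsigned int len = 0;

	if (CIFSFormatMFSymlink(buf, sizeof(buf), "/foo/bar") == 0 &&
	    CIFSParseMFSymlink(buf, sizeof(buf), &len, &target) == 0) {
		/* len == 8, target is a kstrndup()'d "/foo/bar" */
		kfree(target);
	}
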
177static int
178CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
179 const char *fromName, const char *toName,
180 const struct nls_table *nls_codepage, int remap)
181{
182 int rc;
183 int oplock = 0;
184 __u16 netfid = 0;
185 u8 *buf;
186 unsigned int bytes_written = 0;
187
188 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
189 if (!buf)
190 return -ENOMEM;
191
192 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
193 if (rc != 0) {
194 kfree(buf);
195 return rc;
196 }
197
198 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
199 CREATE_NOT_DIR, &netfid, &oplock, NULL,
200 nls_codepage, remap);
201 if (rc != 0) {
202 kfree(buf);
203 return rc;
204 }
205
206 rc = CIFSSMBWrite(xid, tcon, netfid,
207 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
208 0 /* offset */,
209 &bytes_written, buf, NULL, 0);
210 CIFSSMBClose(xid, tcon, netfid);
211 kfree(buf);
212 if (rc != 0)
213 return rc;
214
215 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
216 return -EIO;
217
218 return 0;
219}
220
221static int
222CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
223 const unsigned char *searchName, char **symlinkinfo,
224 const struct nls_table *nls_codepage, int remap)
225{
226 int rc;
227 int oplock = 0;
228 __u16 netfid = 0;
229 u8 *buf;
230 char *pbuf;
231 unsigned int bytes_read = 0;
232 int buf_type = CIFS_NO_BUFFER;
233 unsigned int link_len = 0;
234 FILE_ALL_INFO file_info;
235
236 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
237 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
238 nls_codepage, remap);
239 if (rc != 0)
240 return rc;
241
242 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
243 CIFSSMBClose(xid, tcon, netfid);
244 /* it's not a symlink */
245 return -EINVAL;
246 }
247
248 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
249 if (!buf)
250 return -ENOMEM;
251 pbuf = buf;
252
253 rc = CIFSSMBRead(xid, tcon, netfid,
254 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
255 0 /* offset */,
256 &bytes_read, &pbuf, &buf_type);
257 CIFSSMBClose(xid, tcon, netfid);
258 if (rc != 0) {
259 kfree(buf);
260 return rc;
261 }
262
263 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
264 kfree(buf);
265 if (rc != 0)
266 return rc;
267
268 return 0;
269}
270
271bool
272CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
273{
274 if (!(fattr->cf_mode & S_IFREG))
275 /* it's not a symlink */
276 return false;
277
278 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
279 /* it's not a symlink */
280 return false;
281
282 return true;
283}
284
285int
286CIFSCheckMFSymlink(struct cifs_fattr *fattr,
287 const unsigned char *path,
288 struct cifs_sb_info *cifs_sb, int xid)
289{
290 int rc;
291 int oplock = 0;
292 __u16 netfid = 0;
293 struct tcon_link *tlink;
294 struct cifsTconInfo *pTcon;
295 u8 *buf;
296 char *pbuf;
297 unsigned int bytes_read = 0;
298 int buf_type = CIFS_NO_BUFFER;
299 unsigned int link_len = 0;
300 FILE_ALL_INFO file_info;
301
302 if (!CIFSCouldBeMFSymlink(fattr))
303 /* it's not a symlink */
304 return 0;
305
306 tlink = cifs_sb_tlink(cifs_sb);
307 if (IS_ERR(tlink))
308 return PTR_ERR(tlink);
309 pTcon = tlink_tcon(tlink);
310
311 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
312 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
313 cifs_sb->local_nls,
314 cifs_sb->mnt_cifs_flags &
315 CIFS_MOUNT_MAP_SPECIAL_CHR);
316 if (rc != 0)
317 goto out;
318
319 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
320 CIFSSMBClose(xid, pTcon, netfid);
321 /* it's not a symlink */
322 goto out;
323 }
324
325 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
326 if (!buf) {
327 rc = -ENOMEM;
328 goto out;
329 }
330 pbuf = buf;
331
332 rc = CIFSSMBRead(xid, pTcon, netfid,
333 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
334 0 /* offset */,
335 &bytes_read, &pbuf, &buf_type);
336 CIFSSMBClose(xid, pTcon, netfid);
337 if (rc != 0) {
338 kfree(buf);
339 goto out;
340 }
341
342 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
343 kfree(buf);
344 if (rc == -EINVAL) {
345 /* it's not a symlink */
346 rc = 0;
347 goto out;
348 }
349
350 if (rc != 0)
351 goto out;
352
353 /* it is a symlink */
354 fattr->cf_eof = link_len;
355 fattr->cf_mode &= ~S_IFMT;
356 fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
357 fattr->cf_dtype = DT_LNK;
358out:
359 cifs_put_tlink(tlink);
360 return rc;
361}
362
32int 363int
33cifs_hardlink(struct dentry *old_file, struct inode *inode, 364cifs_hardlink(struct dentry *old_file, struct inode *inode,
34 struct dentry *direntry) 365 struct dentry *direntry)
@@ -37,17 +368,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
37 int xid; 368 int xid;
38 char *fromName = NULL; 369 char *fromName = NULL;
39 char *toName = NULL; 370 char *toName = NULL;
40 struct cifs_sb_info *cifs_sb_target; 371 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
372 struct tcon_link *tlink;
41 struct cifsTconInfo *pTcon; 373 struct cifsTconInfo *pTcon;
42 struct cifsInodeInfo *cifsInode; 374 struct cifsInodeInfo *cifsInode;
43 375
44 xid = GetXid(); 376 tlink = cifs_sb_tlink(cifs_sb);
45 377 if (IS_ERR(tlink))
46 cifs_sb_target = CIFS_SB(inode->i_sb); 378 return PTR_ERR(tlink);
47 pTcon = cifs_sb_target->tcon; 379 pTcon = tlink_tcon(tlink);
48 380
49/* No need to check for cross device links since server will do that 381 xid = GetXid();
50 BB note DFS case in future though (when we may have to check) */
51 382
52 fromName = build_path_from_dentry(old_file); 383 fromName = build_path_from_dentry(old_file);
53 toName = build_path_from_dentry(direntry); 384 toName = build_path_from_dentry(direntry);
@@ -56,16 +387,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
56 goto cifs_hl_exit; 387 goto cifs_hl_exit;
57 } 388 }
58 389
59/* if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
60 if (pTcon->unix_ext) 390 if (pTcon->unix_ext)
61 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, 391 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
62 cifs_sb_target->local_nls, 392 cifs_sb->local_nls,
63 cifs_sb_target->mnt_cifs_flags & 393 cifs_sb->mnt_cifs_flags &
64 CIFS_MOUNT_MAP_SPECIAL_CHR); 394 CIFS_MOUNT_MAP_SPECIAL_CHR);
65 else { 395 else {
66 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, 396 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
67 cifs_sb_target->local_nls, 397 cifs_sb->local_nls,
68 cifs_sb_target->mnt_cifs_flags & 398 cifs_sb->mnt_cifs_flags &
69 CIFS_MOUNT_MAP_SPECIAL_CHR); 399 CIFS_MOUNT_MAP_SPECIAL_CHR);
70 if ((rc == -EIO) || (rc == -EINVAL)) 400 if ((rc == -EIO) || (rc == -EINVAL))
71 rc = -EOPNOTSUPP; 401 rc = -EOPNOTSUPP;
@@ -101,6 +431,7 @@ cifs_hl_exit:
101 kfree(fromName); 431 kfree(fromName);
102 kfree(toName); 432 kfree(toName);
103 FreeXid(xid); 433 FreeXid(xid);
434 cifs_put_tlink(tlink);
104 return rc; 435 return rc;
105} 436}
106 437
@@ -113,10 +444,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
113 char *full_path = NULL; 444 char *full_path = NULL;
114 char *target_path = NULL; 445 char *target_path = NULL;
115 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 446 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
116 struct cifsTconInfo *tcon = cifs_sb->tcon; 447 struct tcon_link *tlink = NULL;
448 struct cifsTconInfo *tcon;
117 449
118 xid = GetXid(); 450 xid = GetXid();
119 451
452 tlink = cifs_sb_tlink(cifs_sb);
453 if (IS_ERR(tlink)) {
454 rc = PTR_ERR(tlink);
455 tlink = NULL;
456 goto out;
457 }
458 tcon = tlink_tcon(tlink);
459
120 /* 460 /*
121 * For now, we just handle symlinks with unix extensions enabled. 461 * For now, we just handle symlinks with unix extensions enabled.
122 * Eventually we should handle NTFS reparse points, and MacOS 462 * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +470,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
130 * but there doesn't seem to be any harm in allowing the client to 470 * but there doesn't seem to be any harm in allowing the client to
131 * read them. 471 * read them.
132 */ 472 */
133 if (!(tcon->ses->capabilities & CAP_UNIX)) { 473 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
474 && !(tcon->ses->capabilities & CAP_UNIX)) {
134 rc = -EACCES; 475 rc = -EACCES;
135 goto out; 476 goto out;
136 } 477 }
@@ -141,8 +482,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
141 482
142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); 483 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 484
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 485 rc = -EACCES;
145 cifs_sb->local_nls); 486 /*
487 * First try Minshall+French Symlinks, if configured
488 * and fallback to UNIX Extensions Symlinks.
489 */
490 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
491 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
492 cifs_sb->local_nls,
493 cifs_sb->mnt_cifs_flags &
494 CIFS_MOUNT_MAP_SPECIAL_CHR);
495
496 if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
497 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
498 cifs_sb->local_nls);
499
146 kfree(full_path); 500 kfree(full_path);
147out: 501out:
148 if (rc != 0) { 502 if (rc != 0) {
@@ -151,6 +505,8 @@ out:
151 } 505 }
152 506
153 FreeXid(xid); 507 FreeXid(xid);
508 if (tlink)
509 cifs_put_tlink(tlink);
154 nd_set_link(nd, target_path); 510 nd_set_link(nd, target_path);
155 return NULL; 511 return NULL;
156} 512}
@@ -160,29 +516,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
160{ 516{
161 int rc = -EOPNOTSUPP; 517 int rc = -EOPNOTSUPP;
162 int xid; 518 int xid;
163 struct cifs_sb_info *cifs_sb; 519 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
520 struct tcon_link *tlink;
164 struct cifsTconInfo *pTcon; 521 struct cifsTconInfo *pTcon;
165 char *full_path = NULL; 522 char *full_path = NULL;
166 struct inode *newinode = NULL; 523 struct inode *newinode = NULL;
167 524
168 xid = GetXid(); 525 xid = GetXid();
169 526
170 cifs_sb = CIFS_SB(inode->i_sb); 527 tlink = cifs_sb_tlink(cifs_sb);
171 pTcon = cifs_sb->tcon; 528 if (IS_ERR(tlink)) {
529 rc = PTR_ERR(tlink);
530 goto symlink_exit;
531 }
532 pTcon = tlink_tcon(tlink);
172 533
173 full_path = build_path_from_dentry(direntry); 534 full_path = build_path_from_dentry(direntry);
174
175 if (full_path == NULL) { 535 if (full_path == NULL) {
176 rc = -ENOMEM; 536 rc = -ENOMEM;
177 FreeXid(xid); 537 goto symlink_exit;
178 return rc;
179 } 538 }
180 539
181 cFYI(1, "Full path: %s", full_path); 540 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, "symname is %s", symname); 541 cFYI(1, "symname is %s", symname);
183 542
184 /* BB what if DFS and this volume is on different share? BB */ 543 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 544 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
545 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
546 cifs_sb->local_nls,
547 cifs_sb->mnt_cifs_flags &
548 CIFS_MOUNT_MAP_SPECIAL_CHR);
549 else if (pTcon->unix_ext)
186 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 550 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
187 cifs_sb->local_nls); 551 cifs_sb->local_nls);
188 /* else 552 /* else
@@ -201,15 +565,12 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", 565 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
202 rc); 566 rc);
203 } else { 567 } else {
204 if (pTcon->nocase)
205 direntry->d_op = &cifs_ci_dentry_ops;
206 else
207 direntry->d_op = &cifs_dentry_ops;
208 d_instantiate(direntry, newinode); 568 d_instantiate(direntry, newinode);
209 } 569 }
210 } 570 }
211 571symlink_exit:
212 kfree(full_path); 572 kfree(full_path);
573 cifs_put_tlink(tlink);
213 FreeXid(xid); 574 FreeXid(xid);
214 return rc; 575 return rc;
215} 576}
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d67..000000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 a implementation of MD4 designed for use in the SMB authentication protocol
5 Copyright (C) Andrew Tridgell 1997-1998.
6 Modified by Steve French (sfrench@us.ibm.com) 2002-2003
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*/
22#include <linux/module.h>
23#include <linux/fs.h>
24#include "cifsencrypt.h"
25
26/* NOTE: This code makes no attempt to be fast! */
27
28static __u32
29F(__u32 X, __u32 Y, __u32 Z)
30{
31 return (X & Y) | ((~X) & Z);
32}
33
34static __u32
35G(__u32 X, __u32 Y, __u32 Z)
36{
37 return (X & Y) | (X & Z) | (Y & Z);
38}
39
40static __u32
41H(__u32 X, __u32 Y, __u32 Z)
42{
43 return X ^ Y ^ Z;
44}
45
46static __u32
47lshift(__u32 x, int s)
48{
49 x &= 0xFFFFFFFF;
50 return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
51}
52
53#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
54#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
55#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
56
57/* this applies md4 to 64 byte chunks */
58static void
59mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
60{
61 int j;
62 __u32 AA, BB, CC, DD;
63 __u32 X[16];
64
65
66 for (j = 0; j < 16; j++)
67 X[j] = M[j];
68
69 AA = *A;
70 BB = *B;
71 CC = *C;
72 DD = *D;
73
74 ROUND1(A, B, C, D, 0, 3);
75 ROUND1(D, A, B, C, 1, 7);
76 ROUND1(C, D, A, B, 2, 11);
77 ROUND1(B, C, D, A, 3, 19);
78 ROUND1(A, B, C, D, 4, 3);
79 ROUND1(D, A, B, C, 5, 7);
80 ROUND1(C, D, A, B, 6, 11);
81 ROUND1(B, C, D, A, 7, 19);
82 ROUND1(A, B, C, D, 8, 3);
83 ROUND1(D, A, B, C, 9, 7);
84 ROUND1(C, D, A, B, 10, 11);
85 ROUND1(B, C, D, A, 11, 19);
86 ROUND1(A, B, C, D, 12, 3);
87 ROUND1(D, A, B, C, 13, 7);
88 ROUND1(C, D, A, B, 14, 11);
89 ROUND1(B, C, D, A, 15, 19);
90
91 ROUND2(A, B, C, D, 0, 3);
92 ROUND2(D, A, B, C, 4, 5);
93 ROUND2(C, D, A, B, 8, 9);
94 ROUND2(B, C, D, A, 12, 13);
95 ROUND2(A, B, C, D, 1, 3);
96 ROUND2(D, A, B, C, 5, 5);
97 ROUND2(C, D, A, B, 9, 9);
98 ROUND2(B, C, D, A, 13, 13);
99 ROUND2(A, B, C, D, 2, 3);
100 ROUND2(D, A, B, C, 6, 5);
101 ROUND2(C, D, A, B, 10, 9);
102 ROUND2(B, C, D, A, 14, 13);
103 ROUND2(A, B, C, D, 3, 3);
104 ROUND2(D, A, B, C, 7, 5);
105 ROUND2(C, D, A, B, 11, 9);
106 ROUND2(B, C, D, A, 15, 13);
107
108 ROUND3(A, B, C, D, 0, 3);
109 ROUND3(D, A, B, C, 8, 9);
110 ROUND3(C, D, A, B, 4, 11);
111 ROUND3(B, C, D, A, 12, 15);
112 ROUND3(A, B, C, D, 2, 3);
113 ROUND3(D, A, B, C, 10, 9);
114 ROUND3(C, D, A, B, 6, 11);
115 ROUND3(B, C, D, A, 14, 15);
116 ROUND3(A, B, C, D, 1, 3);
117 ROUND3(D, A, B, C, 9, 9);
118 ROUND3(C, D, A, B, 5, 11);
119 ROUND3(B, C, D, A, 13, 15);
120 ROUND3(A, B, C, D, 3, 3);
121 ROUND3(D, A, B, C, 11, 9);
122 ROUND3(C, D, A, B, 7, 11);
123 ROUND3(B, C, D, A, 15, 15);
124
125 *A += AA;
126 *B += BB;
127 *C += CC;
128 *D += DD;
129
130 *A &= 0xFFFFFFFF;
131 *B &= 0xFFFFFFFF;
132 *C &= 0xFFFFFFFF;
133 *D &= 0xFFFFFFFF;
134
135 for (j = 0; j < 16; j++)
136 X[j] = 0;
137}
138
139static void
140copy64(__u32 *M, unsigned char *in)
141{
142 int i;
143
144 for (i = 0; i < 16; i++)
145 M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
146 (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
147}
148
149static void
150copy4(unsigned char *out, __u32 x)
151{
152 out[0] = x & 0xFF;
153 out[1] = (x >> 8) & 0xFF;
154 out[2] = (x >> 16) & 0xFF;
155 out[3] = (x >> 24) & 0xFF;
156}
157
158/* produce a md4 message digest from data of length n bytes */
159void
160mdfour(unsigned char *out, unsigned char *in, int n)
161{
162 unsigned char buf[128];
163 __u32 M[16];
164 __u32 b = n * 8;
165 int i;
166 __u32 A = 0x67452301;
167 __u32 B = 0xefcdab89;
168 __u32 C = 0x98badcfe;
169 __u32 D = 0x10325476;
170
171 while (n > 64) {
172 copy64(M, in);
173 mdfour64(M, &A, &B, &C, &D);
174 in += 64;
175 n -= 64;
176 }
177
178 for (i = 0; i < 128; i++)
179 buf[i] = 0;
180 memcpy(buf, in, n);
181 buf[n] = 0x80;
182
183 if (n <= 55) {
184 copy4(buf + 56, b);
185 copy64(M, buf);
186 mdfour64(M, &A, &B, &C, &D);
187 } else {
188 copy4(buf + 120, b);
189 copy64(M, buf);
190 mdfour64(M, &A, &B, &C, &D);
191 copy64(M, buf + 64);
192 mdfour64(M, &A, &B, &C, &D);
193 }
194
195 for (i = 0; i < 128; i++)
196 buf[i] = 0;
197 copy64(M, buf);
198
199 copy4(out, A);
200 copy4(out + 4, B);
201 copy4(out + 8, C);
202 copy4(out + 12, D);
203
204 A = B = C = D = 0;
205}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c319..000000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
1/*
2 * This code implements the MD5 message-digest algorithm.
3 * The algorithm is due to Ron Rivest. This code was
4 * written by Colin Plumb in 1993, no copyright is claimed.
5 * This code is in the public domain; do with it what you wish.
6 *
7 * Equivalent code is available from RSA Data Security, Inc.
8 * This code has been tested against that, and is equivalent,
9 * except that you don't need to include two pages of legalese
10 * with every copy.
11 *
12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest.
16 */
17
18/* This code slightly modified to fit into Samba by
19 abartlet@samba.org Jun 2001
20 and to fit the cifs vfs by
21 Steve French sfrench@us.ibm.com */
22
23#include <linux/string.h>
24#include "md5.h"
25
26static void MD5Transform(__u32 buf[4], __u32 const in[16]);
27
28/*
29 * Note: this code is harmless on little-endian machines.
30 */
31static void
32byteReverse(unsigned char *buf, unsigned longs)
33{
34 __u32 t;
35 do {
36 t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
37 ((unsigned) buf[1] << 8 | buf[0]);
38 *(__u32 *) buf = t;
39 buf += 4;
40 } while (--longs);
41}
42
43/*
44 * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
45 * initialization constants.
46 */
47void
48cifs_MD5_init(struct MD5Context *ctx)
49{
50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89;
52 ctx->buf[2] = 0x98badcfe;
53 ctx->buf[3] = 0x10325476;
54
55 ctx->bits[0] = 0;
56 ctx->bits[1] = 0;
57}
58
59/*
60 * Update context to reflect the concatenation of another buffer full
61 * of bytes.
62 */
63void
64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{
66 register __u32 t;
67
68 /* Update bitcount */
69
70 t = ctx->bits[0];
71 if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
72 ctx->bits[1]++; /* Carry from low to high */
73 ctx->bits[1] += len >> 29;
74
75 t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
76
77 /* Handle any leading odd-sized chunks */
78
79 if (t) {
80 unsigned char *p = (unsigned char *) ctx->in + t;
81
82 t = 64 - t;
83 if (len < t) {
84 memmove(p, buf, len);
85 return;
86 }
87 memmove(p, buf, t);
88 byteReverse(ctx->in, 16);
89 MD5Transform(ctx->buf, (__u32 *) ctx->in);
90 buf += t;
91 len -= t;
92 }
93 /* Process data in 64-byte chunks */
94
95 while (len >= 64) {
96 memmove(ctx->in, buf, 64);
97 byteReverse(ctx->in, 16);
98 MD5Transform(ctx->buf, (__u32 *) ctx->in);
99 buf += 64;
100 len -= 64;
101 }
102
103 /* Handle any remaining bytes of data. */
104
105 memmove(ctx->in, buf, len);
106}
107
108/*
109 * Final wrapup - pad to 64-byte boundary with the bit pattern
110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */
112void
113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{
115 unsigned int count;
116 unsigned char *p;
117
118 /* Compute number of bytes mod 64 */
119 count = (ctx->bits[0] >> 3) & 0x3F;
120
121 /* Set the first char of padding to 0x80. This is safe since there is
122 always at least one byte free */
123 p = ctx->in + count;
124 *p++ = 0x80;
125
126 /* Bytes of padding needed to make 64 bytes */
127 count = 64 - 1 - count;
128
129 /* Pad out to 56 mod 64 */
130 if (count < 8) {
131 /* Two lots of padding: Pad the first block to 64 bytes */
132 memset(p, 0, count);
133 byteReverse(ctx->in, 16);
134 MD5Transform(ctx->buf, (__u32 *) ctx->in);
135
136 /* Now fill the next block with 56 bytes */
137 memset(ctx->in, 0, 56);
138 } else {
139 /* Pad block to 56 bytes */
140 memset(p, 0, count - 8);
141 }
142 byteReverse(ctx->in, 14);
143
144 /* Append length in bits and transform */
145 ((__u32 *) ctx->in)[14] = ctx->bits[0];
146 ((__u32 *) ctx->in)[15] = ctx->bits[1];
147
148 MD5Transform(ctx->buf, (__u32 *) ctx->in);
149 byteReverse((unsigned char *) ctx->buf, 4);
150 memmove(digest, ctx->buf, 16);
151 memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
152}
153
154/* The four core functions - F1 is optimized somewhat */
155
156/* #define F1(x, y, z) (x & y | ~x & z) */
157#define F1(x, y, z) (z ^ (x & (y ^ z)))
158#define F2(x, y, z) F1(z, x, y)
159#define F3(x, y, z) (x ^ y ^ z)
160#define F4(x, y, z) (y ^ (x | ~z))
161
162/* This is the central step in the MD5 algorithm. */
163#define MD5STEP(f, w, x, y, z, data, s) \
164 (w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x)
165
166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine.
170 */
171static void
172MD5Transform(__u32 buf[4], __u32 const in[16])
173{
174 register __u32 a, b, c, d;
175
176 a = buf[0];
177 b = buf[1];
178 c = buf[2];
179 d = buf[3];
180
181 MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
182 MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
183 MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
184 MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
185 MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
186 MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
187 MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
188 MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
189 MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
190 MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
191 MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
192 MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
193 MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
194 MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
195 MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
196 MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
197
198 MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
199 MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
200 MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
201 MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
202 MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
203 MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
204 MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
205 MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
206 MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
207 MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
208 MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
209 MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
210 MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
211 MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
212 MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
213 MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
214
215 MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
216 MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
217 MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
218 MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
219 MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
220 MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
221 MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
222 MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
223 MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
224 MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
225 MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
226 MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
227 MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
228 MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
229 MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
230 MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
231
232 MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
233 MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
234 MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
235 MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
236 MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
237 MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
238 MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
239 MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
240 MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
241 MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
242 MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
243 MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
244 MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
245 MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
246 MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
247 MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
248
249 buf[0] += a;
250 buf[1] += b;
251 buf[2] += c;
252 buf[3] += d;
253}
254
255#if 0 /* currently unused */
256/***********************************************************************
257 the rfc 2104 version of hmac_md5 initialisation.
258***********************************************************************/
259static void
260hmac_md5_init_rfc2104(unsigned char *key, int key_len,
261 struct HMACMD5Context *ctx)
262{
263 int i;
264
265 /* if key is longer than 64 bytes reset it to key=MD5(key) */
266 if (key_len > 64) {
267 unsigned char tk[16];
268 struct MD5Context tctx;
269
270 cifs_MD5_init(&tctx);
271 cifs_MD5_update(&tctx, key, key_len);
272 cifs_MD5_final(tk, &tctx);
273
274 key = tk;
275 key_len = 16;
276 }
277
278 /* start out by storing key in pads */
279 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
280 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
281 memcpy(ctx->k_ipad, key, key_len);
282 memcpy(ctx->k_opad, key, key_len);
283
284 /* XOR key with ipad and opad values */
285 for (i = 0; i < 64; i++) {
286 ctx->k_ipad[i] ^= 0x36;
287 ctx->k_opad[i] ^= 0x5c;
288 }
289
290 cifs_MD5_init(&ctx->ctx);
291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292}
293#endif
294
295/***********************************************************************
296 the microsoft version of hmac_md5 initialisation.
297***********************************************************************/
298void
299hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
300 struct HMACMD5Context *ctx)
301{
302 int i;
303
304 /* if key is longer than 64 bytes truncate it */
305 if (key_len > 64)
306 key_len = 64;
307
308 /* start out by storing key in pads */
309 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
310 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
311 memcpy(ctx->k_ipad, key, key_len);
312 memcpy(ctx->k_opad, key, key_len);
313
314 /* XOR key with ipad and opad values */
315 for (i = 0; i < 64; i++) {
316 ctx->k_ipad[i] ^= 0x36;
317 ctx->k_opad[i] ^= 0x5c;
318 }
319
320 cifs_MD5_init(&ctx->ctx);
321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322}
323
324/***********************************************************************
325 update hmac_md5 "inner" buffer
326***********************************************************************/
327void
328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx)
330{
331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332}
333
334/***********************************************************************
335 finish off hmac_md5 "inner" buffer and generate outer one.
336***********************************************************************/
337void
338hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{
340 struct MD5Context ctx_o;
341
342 cifs_MD5_final(digest, &ctx->ctx);
343
344 cifs_MD5_init(&ctx_o);
345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 cifs_MD5_update(&ctx_o, digest, 16);
347 cifs_MD5_final(digest, &ctx_o);
348}
349
350/***********************************************************
351 single function to calculate an HMAC MD5 digest from data.
352 use the microsoft hmacmd5 init method because the key is 16 bytes.
353************************************************************/
354#if 0 /* currently unused */
355static void
356hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
357 unsigned char *digest)
358{
359 struct HMACMD5Context ctx;
360 hmac_md5_init_limK_to_64(key, 16, &ctx);
361 if (data_len != 0)
362 hmac_md5_update(data, data_len, &ctx);
363
364 hmac_md5_final(digest, &ctx);
365}
366#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402fd..000000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#ifndef MD5_H
2#define MD5_H
3#ifndef HEADER_MD5_H
4/* Try to avoid clashes with OpenSSL */
5#define HEADER_MD5_H
6#endif
7
8struct MD5Context {
9 __u32 buf[4];
10 __u32 bits[2];
11 unsigned char in[64];
12};
13#endif /* !MD5_H */
14
15#ifndef _HMAC_MD5_H
16struct HMACMD5Context {
17 struct MD5Context ctx;
18 unsigned char k_ipad[65];
19 unsigned char k_opad[65];
20};
21#endif /* _HMAC_MD5_H */
22
23void cifs_MD5_init(struct MD5Context *context);
24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len);
26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27
28/* The following definitions come from lib/hmacmd5.c */
29
30/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
31 struct HMACMD5Context *ctx);*/
32void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
33 struct HMACMD5Context *ctx);
34void hmac_md5_update(const unsigned char *text, int text_len,
35 struct HMACMD5Context *ctx);
36void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
37/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
38 unsigned char *digest);*/
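
With symlink_hash() built on the kernel crypto API, the hand-rolled md4.c/md5.c/md5.h removed above have no remaining callers here (the signing and auth paths are presumably converted elsewhere in this series) and the three files are deleted outright. The replacement idiom, condensed into a one-shot digest sketch; struct sdesc is the small cifs wrapper already used by symlink_hash() above, with its definition assumed to live elsewhere in cifs:

	static int cifs_md5_digest(const u8 *data, unsigned int len, u8 *out)
	{
		struct crypto_shash *md5;
		struct sdesc *desc;
		int rc;

		md5 = crypto_alloc_shash("md5", 0, 0);
		if (IS_ERR(md5))
			return PTR_ERR(md5);

		desc = kmalloc(sizeof(struct shash_desc) +
			       crypto_shash_descsize(md5), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(md5);
			return -ENOMEM;
		}
		desc->shash.tfm = md5;
		desc->shash.flags = 0;

		/* init + update + final in a single call */
		rc = crypto_shash_digest(&desc->shash, data, len, out);

		kfree(desc);
		crypto_free_shash(md5);
		return rc;
	}
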
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3ccadc1326d6..2a930a752a78 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 236{
237 __u16 mid = 0; 237 __u16 mid = 0;
238 __u16 last_mid; 238 __u16 last_mid;
239 int collision; 239 bool collision;
240
241 if (server == NULL)
242 return mid;
243 240
244 spin_lock(&GlobalMid_Lock); 241 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 242 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 249 (and it would also have to have been a request that
253 did not time out) */ 250 did not time out) */
254 while (server->CurrentMid != last_mid) { 251 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 252 struct mid_q_entry *mid_entry;
253 unsigned int num_mids;
257 254
258 collision = 0; 255 collision = false;
259 if (server->CurrentMid == 0) 256 if (server->CurrentMid == 0)
260 server->CurrentMid++; 257 server->CurrentMid++;
261 258
262 list_for_each(tmp, &server->pending_mid_q) { 259 num_mids = 0;
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 260 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
264 261 ++num_mids;
265 if ((mid_entry->mid == server->CurrentMid) && 262 if (mid_entry->mid == server->CurrentMid &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) { 263 mid_entry->midState == MID_REQUEST_SUBMITTED) {
267 /* This mid is in use, try a different one */ 264 /* This mid is in use, try a different one */
268 collision = 1; 265 collision = true;
269 break; 266 break;
270 } 267 }
271 } 268 }
272 if (collision == 0) { 269
270 /*
271 * if we have more than 32k mids in the list, then something
272 * is very wrong. Possibly a local user is trying to DoS the
273 * box by issuing long-running calls and SIGKILL'ing them. If
274 * we get to 2^16 mids then we're in big trouble as this
275 * function could loop forever.
276 *
277 * Go ahead and assign out the mid in this situation, but force
278 * an eventual reconnect to clean out the pending_mid_q.
279 */
280 if (num_mids > 32768)
281 server->tcpStatus = CifsNeedReconnect;
282
283 if (!collision) {
273 mid = server->CurrentMid; 284 mid = server->CurrentMid;
274 break; 285 break;
275 } 286 }
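
Distilled, the allocation strategy is: advance a 16-bit counter, skip zero and any value still on the pending queue, and stop if a full lap brings us back to the starting mid. A simplified model (a sketch, not the cifs code; in_use() stands in for the pending_mid_q walk):

#include <stdbool.h>

typedef unsigned short u16;

bool in_use(u16 mid);			/* assumed: scans the pending queue */

static u16 get_next_mid_model(u16 *cur)
{
	u16 last = *cur;		/* remember where we started */

	do {
		(*cur)++;
		if (*cur == 0)		/* mid 0 is reserved */
			(*cur)++;
		if (!in_use(*cur))
			return *cur;	/* free slot found */
	} while (*cur != last);

	return *cur;			/* full wrap: every mid busy; the
					 * real code forces a reconnect */
}
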
@@ -347,7 +358,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
347 if (current_fsuid() != treeCon->ses->linux_uid) { 358 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, "Multiuser mode and UID " 359 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid"); 360 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 361 spin_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 362 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 363 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 364 if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +372,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
361 } 372 }
362 } 373 }
363 } 374 }
364 read_unlock(&cifs_tcp_ses_lock); 375 spin_unlock(&cifs_tcp_ses_lock);
365 } 376 }
366 } 377 }
367 } 378 }
@@ -381,29 +392,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 392}
382 393
383static int 394static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid) 395check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 396{
386 /* Make sure that this really is an SMB, that it is a response, 397 /* does it have the right SMB "signature" ? */
387 and that the message ids match */ 398 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && 399 cERROR(1, "Bad protocol string signature header 0x%x",
389 (mid == smb->Mid)) { 400 *(unsigned int *)smb->Protocol);
390 if (smb->Flags & SMBFLG_RESPONSE) 401 return 1;
391 return 0; 402 }
392 else { 403
393 /* only one valid case where server sends us request */ 404 /* Make sure that message ids match */
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 405 if (mid != smb->Mid) {
395 return 0; 406 cERROR(1, "Mids do not match. received=%u expected=%u",
396 else 407 smb->Mid, mid);
397 cERROR(1, "Received Request not response"); 408 return 1;
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 } 409 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); 410
411 /* if it's a response then accept */
412 if (smb->Flags & SMBFLG_RESPONSE)
413 return 0;
414
415 /* only one valid case where server sends us request */
416 if (smb->Command == SMB_COM_LOCKING_ANDX)
417 return 0;
418
419 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 420 return 1;
408} 421}
409 422
@@ -448,7 +461,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 461 return 1;
449 } 462 }
450 463
451 if (checkSMBhdr(smb, mid)) 464 if (check_smb_hdr(smb, mid))
452 return 1; 465 return 1;
453 clc_len = smbCalcSize_LE(smb); 466 clc_len = smbCalcSize_LE(smb);
454 467
@@ -465,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 478 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 479 return 0; /* bcc wrapped */
467 } 480 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 481 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 482 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 483
471 an illegal pad, at the end of byte range lock responses 484 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 485 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forgot to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 486 len, smb->Mid);
486 return 1; 487 return 1;
488 } else if (len > clc_len + 512) {
489 /*
490 * Some servers (Windows XP in particular) send more
491 * data than the lengths in the SMB packet would
492 * indicate on certain calls (byte range locks and
493 * trans2 find first calls in particular). While the
494 * client can handle such a frame by ignoring the
495 * trailing data, we choose to limit the amount of extra
496 * data to 512 bytes.
497 */
498 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
499 "than SMB for mid=%u", len, smb->Mid);
500 return 1;
487 } 501 }
488 } 502 }
489 return 0; 503 return 0;
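
Setting aside the bcc-wrap special case, the acceptance rule reduces to two comparisons: a frame shorter than the calculated SMB size is always bad, and up to 512 bytes of trailing slop is tolerated. A sketch (returns nonzero when the frame is acceptable; len is the RFC1001 payload length, clc_len the size computed from the SMB fields):

static int frame_ok(unsigned int len, unsigned int clc_len)
{
	if (4 + len < clc_len)		/* truncated frame: never ok */
		return 0;
	if (len > clc_len + 512)	/* > 512 bytes of trailing junk */
		return 0;
	return 1;			/* exact match or modest overrun */
}
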
@@ -551,7 +565,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
551 return false; 565 return false;
552 566
553 /* look up tcon based on tid & uid */ 567 /* look up tcon based on tid & uid */
554 read_lock(&cifs_tcp_ses_lock); 568 spin_lock(&cifs_tcp_ses_lock);
555 list_for_each(tmp, &srv->smb_ses_list) { 569 list_for_each(tmp, &srv->smb_ses_list) {
556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 570 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
557 list_for_each(tmp1, &ses->tcon_list) { 571 list_for_each(tmp1, &ses->tcon_list) {
@@ -560,51 +574,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
560 continue; 574 continue;
561 575
562 cifs_stats_inc(&tcon->num_oplock_brks); 576 cifs_stats_inc(&tcon->num_oplock_brks);
563 read_lock(&GlobalSMBSeslock); 577 spin_lock(&cifs_file_list_lock);
564 list_for_each(tmp2, &tcon->openFileList) { 578 list_for_each(tmp2, &tcon->openFileList) {
565 netfile = list_entry(tmp2, struct cifsFileInfo, 579 netfile = list_entry(tmp2, struct cifsFileInfo,
566 tlist); 580 tlist);
567 if (pSMB->Fid != netfile->netfid) 581 if (pSMB->Fid != netfile->netfid)
568 continue; 582 continue;
569 583
570 /*
571 * don't do anything if file is about to be
572 * closed anyway.
573 */
574 if (netfile->closePend) {
575 read_unlock(&GlobalSMBSeslock);
576 read_unlock(&cifs_tcp_ses_lock);
577 return true;
578 }
579
580 cFYI(1, "file id match, oplock break"); 584 cFYI(1, "file id match, oplock break");
581 pCifsInode = CIFS_I(netfile->pInode); 585 pCifsInode = CIFS_I(netfile->dentry->d_inode);
582 pCifsInode->clientCanCacheAll = false;
583 if (pSMB->OplockLevel == 0)
584 pCifsInode->clientCanCacheRead = false;
585 586
587 cifs_set_oplock_level(pCifsInode,
588 pSMB->OplockLevel ? OPLOCK_READ : 0);
586 /* 589 /*
587 * cifs_oplock_break_put() can't be called 590 * cifs_oplock_break_put() can't be called
588 * from here. Get reference after queueing 591 * from here. Get reference after queueing
589 * succeeded. cifs_oplock_break() will 592 * succeeded. cifs_oplock_break() will
590 * synchronize using GlobalSMSSeslock. 593 * synchronize using cifs_file_list_lock.
591 */ 594 */
592 if (queue_work(system_nrt_wq, 595 if (queue_work(system_nrt_wq,
593 &netfile->oplock_break)) 596 &netfile->oplock_break))
594 cifs_oplock_break_get(netfile); 597 cifs_oplock_break_get(netfile);
595 netfile->oplock_break_cancelled = false; 598 netfile->oplock_break_cancelled = false;
596 599
597 read_unlock(&GlobalSMBSeslock); 600 spin_unlock(&cifs_file_list_lock);
598 read_unlock(&cifs_tcp_ses_lock); 601 spin_unlock(&cifs_tcp_ses_lock);
599 return true; 602 return true;
600 } 603 }
601 read_unlock(&GlobalSMBSeslock); 604 spin_unlock(&cifs_file_list_lock);
602 read_unlock(&cifs_tcp_ses_lock); 605 spin_unlock(&cifs_tcp_ses_lock);
603 cFYI(1, "No matching file for oplock break"); 606 cFYI(1, "No matching file for oplock break");
604 return true; 607 return true;
605 } 608 }
606 } 609 }
607 read_unlock(&cifs_tcp_ses_lock); 610 spin_unlock(&cifs_tcp_ses_lock);
608 cFYI(1, "Can not process oplock break for non-existent connection"); 611 cFYI(1, "Can not process oplock break for non-existent connection");
609 return true; 612 return true;
610} 613}
@@ -648,77 +651,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
648 return; 651 return;
649} 652}
650 653
651/* Convert 16 bit Unicode pathname to wire format from string in current code
652 page. Conversion may involve remapping the seven characters that are
653 legal only in POSIX-like OSes (if they are present in the string). Path
654 names are little endian 16 bit Unicode on the wire */
655int
656cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
657 const struct nls_table *cp, int mapChars)
658{
659 int i, j, charlen;
660 int len_remaining = maxlen;
661 char src_char;
662 __u16 temp;
663
664 if (!mapChars)
665 return cifs_strtoUCS(target, source, PATH_MAX, cp);
666
667 for (i = 0, j = 0; i < maxlen; j++) {
668 src_char = source[i];
669 switch (src_char) {
670 case 0:
671 target[j] = 0;
672 goto ctoUCS_out;
673 case ':':
674 target[j] = cpu_to_le16(UNI_COLON);
675 break;
676 case '*':
677 target[j] = cpu_to_le16(UNI_ASTERIK);
678 break;
679 case '?':
680 target[j] = cpu_to_le16(UNI_QUESTION);
681 break;
682 case '<':
683 target[j] = cpu_to_le16(UNI_LESSTHAN);
684 break;
685 case '>':
686 target[j] = cpu_to_le16(UNI_GRTRTHAN);
687 break;
688 case '|':
689 target[j] = cpu_to_le16(UNI_PIPE);
690 break;
691 /* BB We can not handle remapping slash until
692 all the calls to build_path_from_dentry
693 are modified, as they use slash as separator BB */
694 /* case '\\':
695 target[j] = cpu_to_le16(UNI_SLASH);
696 break;*/
697 default:
698 charlen = cp->char2uni(source+i,
699 len_remaining, &temp);
700 /* if no match, use question mark, which
701 at least in some cases serves as a wild card */
702 if (charlen < 1) {
703 target[j] = cpu_to_le16(0x003f);
704 charlen = 1;
705 } else
706 target[j] = cpu_to_le16(temp);
707 len_remaining -= charlen;
708 /* character may take more than one byte in the
709 source string, but will take exactly two
710 bytes in the target string */
711 i += charlen;
712 continue;
713 }
714 i++; /* move to next char in source string */
715 len_remaining--;
716 }
717
718ctoUCS_out:
719 return i;
720}
721
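
The UNI_* constants this function maps to live in the Unicode private-use area; in cifs_unicode.h each is the ASCII code plus 0xF000, so ':' (0x3a) becomes 0xf03a and so on. The remap step alone, sketched (the real function also runs ordinary characters through cp->char2uni for multibyte input, and deliberately leaves '\\' alone as noted above):

/* shift a reserved POSIX-path character into the private-use area
 * the way UNI_COLON et al. are defined (ASCII + 0xF000) */
static unsigned short remap_reserved(char c)
{
	switch (c) {
	case ':': case '*': case '?':
	case '<': case '>': case '|':
		return 0xF000 + (unsigned char)c;
	default:
		return (unsigned char)c;	/* ASCII fast path only */
	}
}
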
722void 654void
723cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) 655cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
724{ 656{
@@ -729,6 +661,26 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
729 "properly. Hardlinks will not be recognized on this " 661 "properly. Hardlinks will not be recognized on this "
730 "mount. Consider mounting with the \"noserverino\" " 662 "mount. Consider mounting with the \"noserverino\" "
731 "option to silence this message.", 663 "option to silence this message.",
732 cifs_sb->tcon->treeName); 664 cifs_sb_master_tcon(cifs_sb)->treeName);
665 }
666}
667
668void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
669{
670 oplock &= 0xF;
671
672 if (oplock == OPLOCK_EXCLUSIVE) {
673 cinode->clientCanCacheAll = true;
674 cinode->clientCanCacheRead = true;
675 cFYI(1, "Exclusive Oplock granted on inode %p",
676 &cinode->vfs_inode);
677 } else if (oplock == OPLOCK_READ) {
678 cinode->clientCanCacheAll = false;
679 cinode->clientCanCacheRead = true;
680 cFYI(1, "Level II Oplock granted on inode %p",
681 &cinode->vfs_inode);
682 } else {
683 cinode->clientCanCacheAll = false;
684 cinode->clientCanCacheRead = false;
733 } 685 }
734} 686}
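
The new helper collapses the wire oplock byte into the two caching booleans, and the mapping is worth spelling out. A sketch with illustrative level values (the real constants come from cifspdu.h):

#include <stdbool.h>

enum { LVL_NONE = 0, LVL_EXCLUSIVE = 1, LVL_READ = 2 };	/* illustrative */

struct cache_bits { bool can_cache_all, can_cache_read; };

static struct cache_bits oplock_to_cache(int level)
{
	struct cache_bits b = { false, false };

	if (level == LVL_EXCLUSIVE)
		b.can_cache_all = b.can_cache_read = true; /* read+write */
	else if (level == LVL_READ)
		b.can_cache_read = true;	/* level II: reads only */
	return b;				/* anything else: nothing */
}

The oplock-break path above then becomes a one-liner: pass OPLOCK_READ or 0 depending on the level carried in the break notification.
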
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..8d9189f64477 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
899 } 899 }
900 /* else ERRHRD class errors or junk - return EIO */ 900 /* else ERRHRD class errors or junk - return EIO */
901 901
902 cFYI(1, "Mapping smb error code %d to POSIX err %d", 902 cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
903 smberrcode, rc); 903 le32_to_cpu(smb->Status.CifsError), rc);
904 904
905 /* generic corrective action e.g. reconnect SMB session on 905 /* generic corrective action e.g. reconnect SMB session on
906 * ERRbaduid could be added */ 906 * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
916smbCalcSize(struct smb_hdr *ptr) 916smbCalcSize(struct smb_hdr *ptr)
917{ 917{
918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
919 2 /* size of the bcc field */ + BCC(ptr)); 919 2 /* size of the bcc field */ + get_bcc(ptr));
920} 920}
921 921
922unsigned int 922unsigned int
923smbCalcSize_LE(struct smb_hdr *ptr) 923smbCalcSize_LE(struct smb_hdr *ptr)
924{ 924{
925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
926 2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr))); 926 2 /* size of the bcc field */ + get_bcc_le(ptr));
927} 927}
928 928
929/* The following are taken from fs/ntfs/util.c */ 929/* The following are taken from fs/ntfs/util.c */
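
get_bcc/get_bcc_le read the byte count that sits right after the parameter words, so the size formula is: header, plus two bytes per parameter word, plus the two-byte count field itself, plus the data bytes it counts. Spelled out as a sketch:

unsigned int smb_frame_size(unsigned int hdr_size,	/* sizeof(smb_hdr) */
			    unsigned char word_count,
			    unsigned short bcc)
{
	return hdr_size
	     + 2u * word_count		/* parameter words, 16 bits each */
	     + 2u			/* the byte-count field itself */
	     + bcc;			/* data bytes counted by bcc */
}

For example, a response with WordCount 10 and a bcc of 100 occupies hdr_size + 20 + 2 + 100 bytes.
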
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..5d52e4a3b1ed 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000 62#define NTLMSSP_NEGOTIATE_56 0x80000000
63 63
64/* Define AV Pair Field IDs */
65enum av_field_type {
66 NTLMSSP_AV_EOL = 0,
67 NTLMSSP_AV_NB_COMPUTER_NAME,
68 NTLMSSP_AV_NB_DOMAIN_NAME,
69 NTLMSSP_AV_DNS_COMPUTER_NAME,
70 NTLMSSP_AV_DNS_DOMAIN_NAME,
71 NTLMSSP_AV_DNS_TREE_NAME,
72 NTLMSSP_AV_FLAGS,
73 NTLMSSP_AV_TIMESTAMP,
74 NTLMSSP_AV_RESTRICTION,
75 NTLMSSP_AV_TARGET_NAME,
76 NTLMSSP_AV_CHANNEL_BINDINGS
77};
78
64/* Although typedefs are not commonly used for structure definitions */ 79/* Although typedefs are not commonly used for structure definitions */
65/* in the Linux kernel, in this particular case they are useful */ 80/* in the Linux kernel, in this particular case they are useful */
66/* to more closely match the standards document for NTLMSSP from */ 81/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d5e591fab475..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
79 cFYI(1, "For %s", name->name); 79 cFYI(1, "For %s", name->name);
80 80
81 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, parent->d_inode, name);
83 else 83 else
84 name->hash = full_name_hash(name->name, name->len); 84 name->hash = full_name_hash(name->name, name->len);
85 85
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
102 return NULL; 102 return NULL;
103 } 103 }
104 104
105 if (CIFS_SB(sb)->tcon->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops;
107 else
108 dentry->d_op = &cifs_dentry_ops;
109
110 alias = d_materialise_unique(dentry, inode); 105 alias = d_materialise_unique(dentry, inode);
111 if (alias != NULL) { 106 if (alias != NULL) {
112 dput(dentry); 107 dput(dentry);
@@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
160 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); 155 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
161 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 156 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
162 fattr->cf_bytes = le64_to_cpu(info->AllocationSize); 157 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
158 fattr->cf_createtime = le64_to_cpu(info->CreationTime);
163 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); 159 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
164 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); 160 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
165 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); 161 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -171,7 +167,7 @@ static void
171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 167cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
172 struct cifs_sb_info *cifs_sb) 168 struct cifs_sb_info *cifs_sb)
173{ 169{
174 int offset = cifs_sb->tcon->ses->server->timeAdj; 170 int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
175 171
176 memset(fattr, 0, sizeof(*fattr)); 172 memset(fattr, 0, sizeof(*fattr));
177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, 173 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +195,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
199 int len; 195 int len;
200 int oplock = 0; 196 int oplock = 0;
201 int rc; 197 int rc;
202 struct cifsTconInfo *ptcon = cifs_sb->tcon; 198 struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
203 char *tmpbuffer; 199 char *tmpbuffer;
204 200
205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 201 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +219,38 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
223static int initiate_cifs_search(const int xid, struct file *file) 219static int initiate_cifs_search(const int xid, struct file *file)
224{ 220{
225 int rc = 0; 221 int rc = 0;
226 char *full_path; 222 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 223 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb; 224 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
225 struct tcon_link *tlink = NULL;
229 struct cifsTconInfo *pTcon; 226 struct cifsTconInfo *pTcon;
230 227
231 if (file->private_data == NULL) { 228 if (file->private_data == NULL) {
232 file->private_data = 229 tlink = cifs_sb_tlink(cifs_sb);
233 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 230 if (IS_ERR(tlink))
231 return PTR_ERR(tlink);
232
233 cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
234 if (cifsFile == NULL) {
235 rc = -ENOMEM;
236 goto error_exit;
237 }
238 file->private_data = cifsFile;
239 cifsFile->tlink = cifs_get_tlink(tlink);
240 pTcon = tlink_tcon(tlink);
241 } else {
242 cifsFile = file->private_data;
243 pTcon = tlink_tcon(cifsFile->tlink);
234 } 244 }
235 245
236 if (file->private_data == NULL)
237 return -ENOMEM;
238 cifsFile = file->private_data;
239 cifsFile->invalidHandle = true; 246 cifsFile->invalidHandle = true;
240 cifsFile->srch_inf.endOfSearch = false; 247 cifsFile->srch_inf.endOfSearch = false;
241 248
242 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
243 if (cifs_sb == NULL)
244 return -EINVAL;
245
246 pTcon = cifs_sb->tcon;
247 if (pTcon == NULL)
248 return -EINVAL;
249
250 full_path = build_path_from_dentry(file->f_path.dentry); 249 full_path = build_path_from_dentry(file->f_path.dentry);
251 250 if (full_path == NULL) {
252 if (full_path == NULL) 251 rc = -ENOMEM;
253 return -ENOMEM; 252 goto error_exit;
253 }
254 254
255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 256
@@ -283,7 +283,9 @@ ffirst_retry:
283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
284 goto ffirst_retry; 284 goto ffirst_retry;
285 } 285 }
286error_exit:
286 kfree(full_path); 287 kfree(full_path);
288 cifs_put_tlink(tlink);
287 return rc; 289 return rc;
288} 290}
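
The restructuring above replaces scattered early returns with a single error_exit tail; that is safe because kfree(NULL) is a no-op and cifs_put_tlink() tolerates a NULL/error pointer. A model of the pattern (get_res/put_res/do_work are hypothetical; put_res(NULL) is assumed NULL-safe like the real helpers):

#include <stdlib.h>
#include <string.h>

void *get_res(void);			/* assumed acquire helper */
void put_res(void *r);			/* assumed release, NULL-safe */
int do_work(void *r, const char *path);

static int open_and_search(const char *name)
{
	void *res = NULL;
	char *path = NULL;
	int rc = 0;

	res = get_res();
	if (!res)
		return -1;		/* nothing acquired yet */

	path = strdup(name);		/* placeholder for path building */
	if (!path) {
		rc = -1;
		goto error_exit;
	}

	rc = do_work(res, path);
error_exit:
	free(path);			/* free(NULL) is a no-op */
	put_res(res);
	return rc;
}
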
289 291
@@ -525,14 +527,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 (index_to_find < first_entry_in_buffer)) { 527 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 528 /* close and restart search */
527 cFYI(1, "search backing up - close and restart search"); 529 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 530 spin_lock(&cifs_file_list_lock);
529 if (!cifsFile->srch_inf.endOfSearch && 531 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 532 !cifsFile->invalidHandle) {
531 cifsFile->invalidHandle = true; 533 cifsFile->invalidHandle = true;
532 write_unlock(&GlobalSMBSeslock); 534 spin_unlock(&cifs_file_list_lock);
533 CIFSFindClose(xid, pTcon, cifsFile->netfid); 535 CIFSFindClose(xid, pTcon, cifsFile->netfid);
534 } else 536 } else
535 write_unlock(&GlobalSMBSeslock); 537 spin_unlock(&cifs_file_list_lock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 538 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, "freeing SMB ff cache buf on search rewind"); 539 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 540 if (cifsFile->srch_inf.smallBuf)
@@ -738,24 +740,21 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
738 cifs_autodisable_serverino(cifs_sb); 740 cifs_autodisable_serverino(cifs_sb);
739 } 741 }
740 742
743 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
744 CIFSCouldBeMFSymlink(&fattr))
745 /*
746 * trying to get the type and mode can be slow,
747 * so just treat them as regular files for now, and mark
748 * for reval
749 */
750 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
751
741 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 752 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
742 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 753 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
743 754
744 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 755 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
745 ino, fattr.cf_dtype); 756 ino, fattr.cf_dtype);
746 757
747 /*
748 * we can not return filldir errors to the caller since they are
749 * "normal" when the stat blocksize is too small - we return remapped
750 * error instead
751 *
752 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
753 * case already. Why should we be clobbering other errors from it?
754 */
755 if (rc) {
756 cFYI(1, "filldir rc = %d", rc);
757 rc = -EOVERFLOW;
758 }
759 dput(tmp_dentry); 758 dput(tmp_dentry);
760 return rc; 759 return rc;
761} 760}
@@ -765,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
765{ 764{
766 int rc = 0; 765 int rc = 0;
767 int xid, i; 766 int xid, i;
768 struct cifs_sb_info *cifs_sb;
769 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
770 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
771 char *current_entry; 769 char *current_entry;
@@ -776,10 +774,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
776 774
777 xid = GetXid(); 775 xid = GetXid();
778 776
779 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 777 /*
780 pTcon = cifs_sb->tcon; 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
781 if (pTcon == NULL) 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
782 return -EINVAL; 780 */
781 if (file->private_data == NULL) {
782 rc = initiate_cifs_search(xid, file);
783 cFYI(1, "initiate cifs search rc %d", rc);
784 if (rc)
785 goto rddir2_exit;
786 }
783 787
784 switch ((int) file->f_pos) { 788 switch ((int) file->f_pos) {
785 case 0: 789 case 0:
@@ -805,14 +809,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
805 if after then keep searching till find it */ 809 if after then keep searching till find it */
806 810
807 if (file->private_data == NULL) { 811 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) {
811 FreeXid(xid);
812 return rc;
813 }
814 }
815 if (file->private_data == NULL) {
816 rc = -EINVAL; 812 rc = -EINVAL;
817 FreeXid(xid); 813 FreeXid(xid);
818 return rc; 814 return rc;
@@ -829,6 +825,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
829 CIFSFindClose(xid, pTcon, cifsFile->netfid); 825 CIFSFindClose(xid, pTcon, cifsFile->netfid);
830 } */ 826 } */
831 827
828 pTcon = tlink_tcon(cifsFile->tlink);
832 rc = find_cifs_entry(xid, pTcon, file, 829 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 830 &current_entry, &num_to_fill);
834 if (rc) { 831 if (rc) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..1adc9625a344 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include "cifs_spnego.h" 33#include "cifs_spnego.h"
34 34
35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
36 unsigned char *p24);
37
38/* 35/*
39 * Checks if this is the first smb session to be reconnected after 36 * Checks if this is the first smb session to be reconnected after
40 * the socket has been reestablished (so we know whether to use vc 0). 37 * the socket has been reestablished (so we know whether to use vc 0).
@@ -80,7 +77,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
80 if (max_vcs < 2) 77 if (max_vcs < 2)
81 max_vcs = 0xFFFF; 78 max_vcs = 0xFFFF;
82 79
83 write_lock(&cifs_tcp_ses_lock); 80 spin_lock(&cifs_tcp_ses_lock);
84 if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) 81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
85 goto get_vc_num_exit; /* vcnum will be zero */ 82 goto get_vc_num_exit; /* vcnum will be zero */
86 for (i = ses->server->srv_count - 1; i < max_vcs; i++) { 83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +109,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
112 vcnum = i; 109 vcnum = i;
113 ses->vcnum = vcnum; 110 ses->vcnum = vcnum;
114get_vc_num_exit: 111get_vc_num_exit:
115 write_unlock(&cifs_tcp_ses_lock); 112 spin_unlock(&cifs_tcp_ses_lock);
116 113
117 return cpu_to_le16(vcnum); 114 return cpu_to_le16(vcnum);
118} 115}
@@ -280,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
280} 277}
281 278
282static void 279static void
283decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, 280decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
284 const struct nls_table *nls_cp) 281 const struct nls_table *nls_cp)
285{ 282{
286 int len; 283 int len;
@@ -326,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
326 return; 323 return;
327} 324}
328 325
329static int decode_ascii_ssetup(char **pbcc_area, int bleft, 326static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
330 struct cifsSesInfo *ses, 327 struct cifsSesInfo *ses,
331 const struct nls_table *nls_cp) 328 const struct nls_table *nls_cp)
332{ 329{
@@ -383,6 +380,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 380static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
384 struct cifsSesInfo *ses) 381 struct cifsSesInfo *ses)
385{ 382{
383 unsigned int tioffset; /* challenge message target info area */
384 unsigned int tilen; /* challenge message target info area length */
385
386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
387 387
388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,16 +399,27 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
399 return -EINVAL; 399 return -EINVAL;
400 } 400 }
401 401
402 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); 402 memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
403 /* BB we could decode pblob->NegotiateFlags; some may be useful */ 403 /* BB we could decode pblob->NegotiateFlags; some may be useful */
404 /* In particular we can examine sign flags */ 404 /* In particular we can examine sign flags */
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 406 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
408 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
409 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
410 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) {
413 cERROR(1, "Challenge target info allocation failure");
414 return -ENOMEM;
415 }
416 memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
417 ses->auth_key.len = tilen;
418 }
407 419
408 return 0; 420 return 0;
409} 421}
410 422
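
TargetInfoArray is a standard NTLMSSP SECURITY_BUFFER: a 16-bit length, a 16-bit maximum length, and a 32-bit offset measured from the start of the message. A sketch of extracting such a payload, with an explicit bounds check the code above leaves to its caller (endianness conversion omitted; field names mirror ntlmssp.h):

#include <stddef.h>

struct sec_buf {
	unsigned short len, maxlen;	/* __le16 on the wire */
	unsigned int   off;		/* __le32, from message start */
};

static const char *pull_sec_buf(const char *msg, unsigned int msg_len,
				const struct sec_buf *b,
				unsigned int *out_len)
{
	if (b->off > msg_len || b->len > msg_len - b->off)
		return NULL;		/* offset/length outside the blob */
	*out_len = b->len;
	return msg + b->off;		/* points into the challenge blob */
}
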
411#ifdef CONFIG_CIFS_EXPERIMENTAL
412/* BB Move to ntlmssp.c eventually */ 423/* BB Move to ntlmssp.c eventually */
413 424
414/* We do not malloc the blob, it is passed in pbuffer, because 425/* We do not malloc the blob, it is passed in pbuffer, because
@@ -419,20 +430,23 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
419 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; 430 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
420 __u32 flags; 431 __u32 flags;
421 432
433 memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
422 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 434 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
423 sec_blob->MessageType = NtLmNegotiate; 435 sec_blob->MessageType = NtLmNegotiate;
424 436
425 /* BB is NTLMV2 session security format easier to use here? */ 437 /* BB is NTLMV2 session security format easier to use here? */
426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
428 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 440 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
429 if (ses->server->secMode & 441 if (ses->server->secMode &
430 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
431 flags |= NTLMSSP_NEGOTIATE_SIGN; 443 flags |= NTLMSSP_NEGOTIATE_SIGN;
432 if (ses->server->secMode & SECMODE_SIGN_REQUIRED) 444 if (!ses->server->session_estab)
433 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 445 flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
446 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
447 }
434 448
435 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 449 sec_blob->NegotiateFlags = cpu_to_le32(flags);
436 450
437 sec_blob->WorkstationName.BufferOffset = 0; 451 sec_blob->WorkstationName.BufferOffset = 0;
438 sec_blob->WorkstationName.Length = 0; 452 sec_blob->WorkstationName.Length = 0;
@@ -448,13 +462,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
448 maximum possible size is fixed and small, making this approach cleaner. 462 maximum possible size is fixed and small, making this approach cleaner.
449 This function returns the length of the data in the blob */ 463 This function returns the length of the data in the blob */
450static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 464static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
465 u16 *buflen,
451 struct cifsSesInfo *ses, 466 struct cifsSesInfo *ses,
452 const struct nls_table *nls_cp, bool first) 467 const struct nls_table *nls_cp)
453{ 468{
469 int rc;
454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 470 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
455 __u32 flags; 471 __u32 flags;
456 unsigned char *tmp; 472 unsigned char *tmp;
457 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
458 473
459 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 474 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
460 sec_blob->MessageType = NtLmAuthenticate; 475 sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
462 flags = NTLMSSP_NEGOTIATE_56 | 477 flags = NTLMSSP_NEGOTIATE_56 |
463 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
464 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
465 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 480 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
466 if (ses->server->secMode & 481 if (ses->server->secMode &
467 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
468 flags |= NTLMSSP_NEGOTIATE_SIGN; 483 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -470,26 +485,27 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
470 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 485 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
471 486
472 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); 487 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
473 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 488 sec_blob->NegotiateFlags = cpu_to_le32(flags);
474 489
475 sec_blob->LmChallengeResponse.BufferOffset = 490 sec_blob->LmChallengeResponse.BufferOffset =
476 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); 491 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
477 sec_blob->LmChallengeResponse.Length = 0; 492 sec_blob->LmChallengeResponse.Length = 0;
478 sec_blob->LmChallengeResponse.MaximumLength = 0; 493 sec_blob->LmChallengeResponse.MaximumLength = 0;
479 494
480 /* calculate session key, BB what about adding similar ntlmv2 path? */
481 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
482 if (first)
483 cifs_calculate_mac_key(&ses->server->mac_signing_key,
484 ntlm_session_key, ses->password);
485
486 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); 495 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
488 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); 496 rc = setup_ntlmv2_rsp(ses, nls_cp);
489 sec_blob->NtChallengeResponse.MaximumLength = 497 if (rc) {
490 cpu_to_le16(CIFS_SESS_KEY_SIZE); 498 cERROR(1, "Error %d during NTLMSSP authentication", rc);
499 goto setup_ntlmv2_ret;
500 }
501 memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
502 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
503 tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
491 504
492 tmp += CIFS_SESS_KEY_SIZE; 505 sec_blob->NtChallengeResponse.Length =
506 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
507 sec_blob->NtChallengeResponse.MaximumLength =
508 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
493 509
494 if (ses->domainName == NULL) { 510 if (ses->domainName == NULL) {
495 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 511 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +517,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, 517 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
502 MAX_USERNAME_SIZE, nls_cp); 518 MAX_USERNAME_SIZE, nls_cp);
503 len *= 2; /* unicode is 2 bytes each */ 519 len *= 2; /* unicode is 2 bytes each */
504 len += 2; /* trailing null */
505 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 520 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
506 sec_blob->DomainName.Length = cpu_to_le16(len); 521 sec_blob->DomainName.Length = cpu_to_le16(len);
507 sec_blob->DomainName.MaximumLength = cpu_to_le16(len); 522 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +533,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
518 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
519 MAX_USERNAME_SIZE, nls_cp); 534 MAX_USERNAME_SIZE, nls_cp);
520 len *= 2; /* unicode is 2 bytes each */ 535 len *= 2; /* unicode is 2 bytes each */
521 len += 2; /* trailing null */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 537 sec_blob->UserName.Length = cpu_to_le16(len);
524 sec_blob->UserName.MaximumLength = cpu_to_le16(len); 538 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,35 +544,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
530 sec_blob->WorkstationName.MaximumLength = 0; 544 sec_blob->WorkstationName.MaximumLength = 0;
531 tmp += 2; 545 tmp += 2;
532 546
533 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 547 if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
534 sec_blob->SessionKey.Length = 0; 548 (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
535 sec_blob->SessionKey.MaximumLength = 0; 549 && !calc_seckey(ses)) {
536 return tmp - pbuffer; 550 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
537} 551 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
538 552 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
539 553 sec_blob->SessionKey.MaximumLength =
540static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, 554 cpu_to_le16(CIFS_CPHTXT_SIZE);
541 struct cifsSesInfo *ses) 555 tmp += CIFS_CPHTXT_SIZE;
542{ 556 } else {
543 build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses); 557 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
544 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); 558 sec_blob->SessionKey.Length = 0;
545 559 sec_blob->SessionKey.MaximumLength = 0;
546 return; 560 }
547}
548
549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
550 struct cifsSesInfo *ses,
551 const struct nls_table *nls, bool first_time)
552{
553 int bloblen;
554
555 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
556 first_time);
557 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
558 561
559 return bloblen; 562setup_ntlmv2_ret:
563 *buflen = tmp - pbuffer;
564 return rc;
560} 565}
561#endif
562 566
563int 567int
564CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 568CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
@@ -571,26 +575,30 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
571 char *str_area; 575 char *str_area;
572 SESSION_SETUP_ANDX *pSMB; 576 SESSION_SETUP_ANDX *pSMB;
573 __u32 capabilities; 577 __u32 capabilities;
574 int count; 578 __u16 count;
575 int resp_buf_type; 579 int resp_buf_type;
576 struct kvec iov[3]; 580 struct kvec iov[3];
577 enum securityEnum type; 581 enum securityEnum type;
578 __u16 action; 582 __u16 action, bytes_remaining;
579 int bytes_remaining;
580 struct key *spnego_key = NULL; 583 struct key *spnego_key = NULL;
581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 584 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time; 585 u16 blob_len;
586 char *ntlmsspblob = NULL;
583 587
584 if (ses == NULL) 588 if (ses == NULL)
585 return -EINVAL; 589 return -EINVAL;
586 590
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
591 type = ses->server->secType; 591 type = ses->server->secType;
592
593 cFYI(1, "sess setup type %d", type); 592 cFYI(1, "sess setup type %d", type);
593 if (type == RawNTLMSSP) {
594 /* if memory allocation is successful, the caller of this
595 * function frees it.
596 */
597 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
598 if (!ses->ntlmssp)
599 return -ENOMEM;
600 }
601
594ssetup_ntlmssp_authenticate: 602ssetup_ntlmssp_authenticate:
595 if (phase == NtLmChallenge) 603 if (phase == NtLmChallenge)
596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 604 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -655,10 +663,14 @@ ssetup_ntlmssp_authenticate:
655 /* no capabilities flags in old lanman negotiation */ 663 /* no capabilities flags in old lanman negotiation */
656 664
657 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
658 /* BB calculate hash with password */
659 /* and copy into bcc */
660 666
661 calc_lanman_hash(ses->password, ses->server->cryptKey, 667 /* Calculate hash with password and copy into bcc_ptr.
668 * Encryption Key (stored in cryptkey) gets used if the
669 * security mode bit in the Negotiate Protocol response states
670 * to use challenge/response method (i.e. Password bit is 1).
671 */
672
673 calc_lanman_hash(ses->password, ses->server->cryptkey,
662 ses->server->secMode & SECMODE_PW_ENCRYPT ? 674 ses->server->secMode & SECMODE_PW_ENCRYPT ?
663 true : false, lnm_session_key); 675 true : false, lnm_session_key);
664 676
@@ -676,28 +688,27 @@ ssetup_ntlmssp_authenticate:
676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 688 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
677#endif 689#endif
678 } else if (type == NTLM) { 690 } else if (type == NTLM) {
679 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
680
681 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 691 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
682 pSMB->req_no_secext.CaseInsensitivePasswordLength = 692 pSMB->req_no_secext.CaseInsensitivePasswordLength =
683 cpu_to_le16(CIFS_SESS_KEY_SIZE); 693 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 pSMB->req_no_secext.CaseSensitivePasswordLength = 694 pSMB->req_no_secext.CaseSensitivePasswordLength =
685 cpu_to_le16(CIFS_SESS_KEY_SIZE); 695 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
686 696
687 /* calculate session key */ 697 /* calculate ntlm response and session key */
688 SMBNTencrypt(ses->password, ses->server->cryptKey, 698 rc = setup_ntlm_response(ses);
689 ntlm_session_key); 699 if (rc) {
700 cERROR(1, "Error %d during NTLM authentication", rc);
701 goto ssetup_exit;
702 }
690 703
691 if (first_time) /* should this be moved into common code 704 /* copy ntlm response */
692 with similar ntlmv2 path? */ 705 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
693 cifs_calculate_mac_key(&ses->server->mac_signing_key, 706 CIFS_AUTH_RESP_SIZE);
694 ntlm_session_key, ses->password); 707 bcc_ptr += CIFS_AUTH_RESP_SIZE;
695 /* copy session key */ 708 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
709 CIFS_AUTH_RESP_SIZE);
710 bcc_ptr += CIFS_AUTH_RESP_SIZE;
696 711
697 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
698 bcc_ptr += CIFS_SESS_KEY_SIZE;
699 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
700 bcc_ptr += CIFS_SESS_KEY_SIZE;
701 if (ses->capabilities & CAP_UNICODE) { 712 if (ses->capabilities & CAP_UNICODE) {
702 /* unicode strings must be word aligned */ 713 /* unicode strings must be word aligned */
703 if (iov[0].iov_len % 2) { 714 if (iov[0].iov_len % 2) {
@@ -708,33 +719,27 @@ ssetup_ntlmssp_authenticate:
708 } else 719 } else
709 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 720 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
710 } else if (type == NTLMv2) { 721 } else if (type == NTLMv2) {
711 char *v2_sess_key =
712 kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
713
714 /* BB FIXME change all users of v2_sess_key to
715 struct ntlmv2_resp */
716
717 if (v2_sess_key == NULL) {
718 rc = -ENOMEM;
719 goto ssetup_exit;
720 }
721
722 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 722 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
723 723
724 /* LM2 password would be here if we supported it */ 724 /* LM2 password would be here if we supported it */
725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; 725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
726 /* cpu_to_le16(LM2_SESS_KEY_SIZE); */
727 726
727 /* calculate nlmv2 response and session key */
728 rc = setup_ntlmv2_rsp(ses, nls_cp);
729 if (rc) {
730 cERROR(1, "Error %d during NTLMv2 authentication", rc);
731 goto ssetup_exit;
732 }
733 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
734 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
735 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
736
737 /* set case sensitive password length after tilen may have
738 * been assigned; tilen is 0 otherwise.
739 */
728 pSMB->req_no_secext.CaseSensitivePasswordLength = 740 pSMB->req_no_secext.CaseSensitivePasswordLength =
729 cpu_to_le16(sizeof(struct ntlmv2_resp)); 741 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
730 742
731 /* calculate session key */
732 setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
733 /* FIXME: calculate MAC key */
734 memcpy(bcc_ptr, (char *)v2_sess_key,
735 sizeof(struct ntlmv2_resp));
736 bcc_ptr += sizeof(struct ntlmv2_resp);
737 kfree(v2_sess_key);
738 if (ses->capabilities & CAP_UNICODE) { 743 if (ses->capabilities & CAP_UNICODE) {
739 if (iov[0].iov_len % 2) { 744 if (iov[0].iov_len % 2) {
740 *bcc_ptr = 0; 745 *bcc_ptr = 0;
@@ -746,6 +751,7 @@ ssetup_ntlmssp_authenticate:
746 } else if (type == Kerberos) { 751 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 752#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 753 struct cifs_spnego_msg *msg;
754
749 spnego_key = cifs_get_spnego_key(ses); 755 spnego_key = cifs_get_spnego_key(ses);
750 if (IS_ERR(spnego_key)) { 756 if (IS_ERR(spnego_key)) {
751 rc = PTR_ERR(spnego_key); 757 rc = PTR_ERR(spnego_key);
@@ -763,19 +769,17 @@ ssetup_ntlmssp_authenticate:
763 rc = -EKEYREJECTED; 769 rc = -EKEYREJECTED;
764 goto ssetup_exit; 770 goto ssetup_exit;
765 } 771 }
766 /* bail out if key is too long */ 772
767 if (msg->sesskey_len > 773 ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 774 if (!ses->auth_key.response) {
769 cERROR(1, "Kerberos signing key too long (%u bytes)", 775 cERROR(1, "Kerberos can't allocate (%u bytes) memory",
770 msg->sesskey_len); 776 msg->sesskey_len);
771 rc = -EOVERFLOW; 777 rc = -ENOMEM;
772 goto ssetup_exit; 778 goto ssetup_exit;
773 } 779 }
774 if (first_time) { 780 memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
775 ses->server->mac_signing_key.len = msg->sesskey_len; 781 ses->auth_key.len = msg->sesskey_len;
776 memcpy(ses->server->mac_signing_key.data.krb5, 782
777 msg->data, msg->sesskey_len);
778 }
779 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 783 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
780 capabilities |= CAP_EXTENDED_SECURITY; 784 capabilities |= CAP_EXTENDED_SECURITY;
781 pSMB->req.Capabilities = cpu_to_le32(capabilities); 785 pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -799,54 +803,70 @@ ssetup_ntlmssp_authenticate:
799 rc = -ENOSYS; 803 rc = -ENOSYS;
800 goto ssetup_exit; 804 goto ssetup_exit;
801#endif /* CONFIG_CIFS_UPCALL */ 805#endif /* CONFIG_CIFS_UPCALL */
802 } else { 806 } else if (type == RawNTLMSSP) {
803#ifdef CONFIG_CIFS_EXPERIMENTAL 807 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
804 if (type == RawNTLMSSP) { 808 cERROR(1, "NTLMSSP requires Unicode support");
805 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 809 rc = -ENOSYS;
806 cERROR(1, "NTLMSSP requires Unicode support"); 810 goto ssetup_exit;
807 rc = -ENOSYS; 811 }
812
813 cFYI(1, "ntlmssp session setup phase %d", phase);
814 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
815 capabilities |= CAP_EXTENDED_SECURITY;
816 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
817 switch(phase) {
818 case NtLmNegotiate:
819 build_ntlmssp_negotiate_blob(
820 pSMB->req.SecurityBlob, ses);
821 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
822 iov[1].iov_base = pSMB->req.SecurityBlob;
823 pSMB->req.SecurityBlobLength =
824 cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
825 break;
826 case NtLmAuthenticate:
827 /*
828 * 5 is an empirical value, large enough to hold
829 * authenticate message plus a max of 10 av pairs,
830 * domain, user, workstation names, flags, etc.
831 */
832 ntlmsspblob = kzalloc(
833 5*sizeof(struct _AUTHENTICATE_MESSAGE),
834 GFP_KERNEL);
835 if (!ntlmsspblob) {
836 cERROR(1, "Can't allocate NTLMSSP blob");
837 rc = -ENOMEM;
808 goto ssetup_exit; 838 goto ssetup_exit;
809 } 839 }
810 840
811 cFYI(1, "ntlmssp session setup phase %d", phase); 841 rc = build_ntlmssp_auth_blob(ntlmsspblob,
812 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 842 &blob_len, ses, nls_cp);
813 capabilities |= CAP_EXTENDED_SECURITY; 843 if (rc)
814 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
815 if (phase == NtLmNegotiate) {
816 setup_ntlmssp_neg_req(pSMB, ses);
817 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
818 } else if (phase == NtLmAuthenticate) {
819 int blob_len;
820 blob_len = setup_ntlmssp_auth_req(pSMB, ses,
821 nls_cp,
822 first_time);
823 iov[1].iov_len = blob_len;
824 /* Make sure that we tell the server that we
825 are using the uid that it just gave us back
826 on the response (challenge) */
827 smb_buf->Uid = ses->Suid;
828 } else {
829 cERROR(1, "invalid phase %d", phase);
830 rc = -ENOSYS;
831 goto ssetup_exit; 844 goto ssetup_exit;
832 } 845 iov[1].iov_len = blob_len;
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0]; 846 iov[1].iov_base = ntlmsspblob;
834 /* unicode strings must be word aligned */ 847 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
835 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 848 /*
836 *bcc_ptr = 0; 849 * Make sure that we tell the server that we are using
837 bcc_ptr++; 850 * the uid that it just gave us back on the response
838 } 851 * (challenge)
839 unicode_oslm_strings(&bcc_ptr, nls_cp); 852 */
840 } else { 853 smb_buf->Uid = ses->Suid;
841 cERROR(1, "secType %d not supported!", type); 854 break;
855 default:
856 cERROR(1, "invalid phase %d", phase);
842 rc = -ENOSYS; 857 rc = -ENOSYS;
843 goto ssetup_exit; 858 goto ssetup_exit;
844 } 859 }
845#else 860 /* unicode strings must be word aligned */
861 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
862 *bcc_ptr = 0;
863 bcc_ptr++;
864 }
865 unicode_oslm_strings(&bcc_ptr, nls_cp);
866 } else {
846 cERROR(1, "secType %d not supported!", type); 867 cERROR(1, "secType %d not supported!", type);
847 rc = -ENOSYS; 868 rc = -ENOSYS;
848 goto ssetup_exit; 869 goto ssetup_exit;
849#endif
850 } 870 }
851 871
852 iov[2].iov_base = str_area; 872 iov[2].iov_base = str_area;
@@ -855,14 +875,12 @@ ssetup_ntlmssp_authenticate:
855 count = iov[1].iov_len + iov[2].iov_len; 875 count = iov[1].iov_len + iov[2].iov_len;
856 smb_buf->smb_buf_length += count; 876 smb_buf->smb_buf_length += count;
857 877
858 BCC_LE(smb_buf) = cpu_to_le16(count); 878 put_bcc_le(count, smb_buf);
859 879
860 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 880 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
861 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 881 CIFS_LOG_ERROR);
862 /* SMB request buf freed in SendReceive2 */ 882 /* SMB request buf freed in SendReceive2 */
863 883
864 cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
865
866 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 884 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
867 smb_buf = (struct smb_hdr *)iov[0].iov_base; 885 smb_buf = (struct smb_hdr *)iov[0].iov_base;
868 886
@@ -891,11 +909,10 @@ ssetup_ntlmssp_authenticate:
891 cFYI(1, "UID = %d ", ses->Suid); 909 cFYI(1, "UID = %d ", ses->Suid);
892 /* response can have either 3 or 4 word count - Samba sends 3 */ 910 /* response can have either 3 or 4 word count - Samba sends 3 */
893 /* and lanman response is 3 */ 911 /* and lanman response is 3 */
894 bytes_remaining = BCC(smb_buf); 912 bytes_remaining = get_bcc(smb_buf);
895 bcc_ptr = pByteArea(smb_buf); 913 bcc_ptr = pByteArea(smb_buf);
896 914
897 if (smb_buf->WordCount == 4) { 915 if (smb_buf->WordCount == 4) {
898 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 916 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 917 if (blob_len > bytes_remaining) {
901 cERROR(1, "bad security blob length %d", blob_len); 918 cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +948,8 @@ ssetup_exit:
931 key_put(spnego_key); 948 key_put(spnego_key);
932 } 949 }
933 kfree(str_area); 950 kfree(str_area);
951 kfree(ntlmsspblob);
952 ntlmsspblob = NULL;
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 953 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); 954 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 955 cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500bf..04721485925d 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
45 up with a different answer to the one above) 45 up with a different answer to the one above)
46*/ 46*/
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include "cifsencrypt.h"
49#define uchar unsigned char 48#define uchar unsigned char
50 49
51static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, 50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20f..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
32#include "cifs_unicode.h" 32#include "cifs_unicode.h"
33#include "cifspdu.h" 33#include "cifspdu.h"
34#include "cifsglob.h" 34#include "cifsglob.h"
35#include "md5.h"
36#include "cifs_debug.h" 35#include "cifs_debug.h"
37#include "cifsencrypt.h" 36#include "cifsproto.h"
38 37
39#ifndef false 38#ifndef false
40#define false 0 39#define false 0
@@ -48,14 +47,58 @@
48#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
49#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
50 49
51/*The following definitions come from libsmb/smbencrypt.c */ 50/* produce a md4 message digest from data of length n bytes */
51int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
53{
54 int rc;
55 unsigned int size;
56 struct crypto_shash *md4;
57 struct sdesc *sdescmd4;
58
59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
63 return rc;
64 }
65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
66 sdescmd4 = kmalloc(size, GFP_KERNEL);
67 if (!sdescmd4) {
68 rc = -ENOMEM;
69 cERROR(1, "%s: Memory allocation failure\n", __func__);
70 goto mdfour_err;
71 }
72 sdescmd4->shash.tfm = md4;
73 sdescmd4->shash.flags = 0x0;
74
75 rc = crypto_shash_init(&sdescmd4->shash);
76 if (rc) {
77 cERROR(1, "%s: Could not init md4 shash\n", __func__);
78 goto mdfour_err;
79 }
80 crypto_shash_update(&sdescmd4->shash, link_str, link_len);
81 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
52 82
53void SMBencrypt(unsigned char *passwd, const unsigned char *c8, 83mdfour_err:
54 unsigned char *p24); 84 crypto_free_shash(md4);
55void E_md4hash(const unsigned char *passwd, unsigned char *p16); 85 kfree(sdescmd4);
56static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, 86
57 unsigned char p24[24]); 87 return rc;
58void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 88}
89
90/* Does the des encryption from the NT or LM MD4 hash. */
91static void
92SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
93 unsigned char p24[24])
94{
95 unsigned char p21[21];
96
97 memset(p21, '\0', 21);
98
99 memcpy(p21, passwd, 16);
100 E_P24(p21, c8, p24);
101}
59 102
60/* 103/*
61 This implements the X/Open SMB password encryption 104 This implements the X/Open SMB password encryption
@@ -118,9 +161,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
118 * Creates the MD4 Hash of the users password in NT UNICODE. 161 * Creates the MD4 Hash of the users password in NT UNICODE.
119 */ 162 */
120 163
121void 164int
122E_md4hash(const unsigned char *passwd, unsigned char *p16) 165E_md4hash(const unsigned char *passwd, unsigned char *p16)
123{ 166{
167 int rc;
124 int len; 168 int len;
125 __u16 wpwd[129]; 169 __u16 wpwd[129];
126 170
@@ -139,8 +183,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
139 /* Calculate length in bytes */ 183 /* Calculate length in bytes */
140 len = _my_wcslen(wpwd) * sizeof(__u16); 184 len = _my_wcslen(wpwd) * sizeof(__u16);
141 185
142 mdfour(p16, (unsigned char *) wpwd, len); 186 rc = mdfour(p16, (unsigned char *) wpwd, len);
143 memset(wpwd, 0, 129 * 2); 187 memset(wpwd, 0, 129 * 2);
188
189 return rc;
144} 190}
145 191
146#if 0 /* currently unused */ 192#if 0 /* currently unused */
@@ -212,19 +258,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212} 258}
213#endif 259#endif
214 260
215/* Does the des encryption from the NT or LM MD4 hash. */
216static void
217SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
218 unsigned char p24[24])
219{
220 unsigned char p21[21];
221
222 memset(p21, '\0', 21);
223
224 memcpy(p21, passwd, 16);
225 E_P24(p21, c8, p24);
226}
227
228/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ 261/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
229#if 0 /* currently unused */ 262#if 0 /* currently unused */
230static void 263static void
@@ -242,16 +275,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
242#endif 275#endif
243 276
244/* Does the NT MD4 hash then des encryption. */ 277/* Does the NT MD4 hash then des encryption. */
245 278int
246void
247SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 279SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
248{ 280{
281 int rc;
249 unsigned char p21[21]; 282 unsigned char p21[21];
250 283
251 memset(p21, '\0', 21); 284 memset(p21, '\0', 21);
252 285
253 E_md4hash(passwd, p21); 286 rc = E_md4hash(passwd, p21);
287 if (rc) {
288 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
289 return rc;
290 }
254 SMBOWFencrypt(p21, c8, p24); 291 SMBOWFencrypt(p21, c8, p24);
292 return rc;
255} 293}
256 294
257 295
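For orientation, the NT challenge/response flow this file implements is, in essence, the body of SMBNTencrypt() above; a minimal restatement follows, where "challenge" stands in for the server's 8-byte challenge and "password" for the user's password:

	int rc;
	unsigned char p21[21];	/* 16-byte MD4 hash, zero-padded to 21 bytes */
	unsigned char p24[24];	/* 24-byte NT response returned to the server */

	memset(p21, 0, sizeof(p21));
	rc = E_md4hash(password, p21);		/* MD4 over the UTF-16LE password */
	if (rc == 0)
		E_P24(p21, challenge, p24);	/* three DES encryptions of the
						   challenge, keyed from consecutive
						   7-byte slices of p21 */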
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..fbc5aace54b1 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38 38
39static struct mid_q_entry * 39static void
40wake_up_task(struct mid_q_entry *mid)
41{
42 wake_up_process(mid->callback_data);
43}
44
45struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 46AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 47{
42 struct mid_q_entry *temp; 48 struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 64 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 65 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 66 temp->when_alloc = jiffies;
61 temp->tsk = current; 67
68 /*
69 * The default is for the mid to be synchronous, so the
70 * default callback just wakes up the current task.
71 */
72 temp->callback = wake_up_task;
73 temp->callback_data = current;
62 } 74 }
63 75
64 spin_lock(&GlobalMid_Lock);
65 list_add_tail(&temp->qhead, &server->pending_mid_q);
66 atomic_inc(&midCount); 76 atomic_inc(&midCount);
67 temp->midState = MID_REQUEST_ALLOCATED; 77 temp->midState = MID_REQUEST_ALLOCATED;
68 spin_unlock(&GlobalMid_Lock);
69 return temp; 78 return temp;
70} 79}
71 80
72static void 81void
73DeleteMidQEntry(struct mid_q_entry *midEntry) 82DeleteMidQEntry(struct mid_q_entry *midEntry)
74{ 83{
75#ifdef CONFIG_CIFS_STATS2 84#ifdef CONFIG_CIFS_STATS2
76 unsigned long now; 85 unsigned long now;
77#endif 86#endif
78 spin_lock(&GlobalMid_Lock);
79 midEntry->midState = MID_FREE; 87 midEntry->midState = MID_FREE;
80 list_del(&midEntry->qhead);
81 atomic_dec(&midCount); 88 atomic_dec(&midCount);
82 spin_unlock(&GlobalMid_Lock);
83 if (midEntry->largeBuf) 89 if (midEntry->largeBuf)
84 cifs_buf_release(midEntry->resp_buf); 90 cifs_buf_release(midEntry->resp_buf);
85 else 91 else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 109 mempool_free(midEntry, cifs_mid_poolp);
104} 110}
105 111
112static void
113delete_mid(struct mid_q_entry *mid)
114{
115 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead);
117 spin_unlock(&GlobalMid_Lock);
118
119 DeleteMidQEntry(mid);
120}
121
106static int 122static int
107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
108{ 124{
@@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
119 if (ssocket == NULL) 135 if (ssocket == NULL)
120 return -ENOTSOCK; /* BB eventually add reconnect code here */ 136 return -ENOTSOCK; /* BB eventually add reconnect code here */
121 137
122 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; 138 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
123 smb_msg.msg_namelen = sizeof(struct sockaddr); 139 smb_msg.msg_namelen = sizeof(struct sockaddr);
124 smb_msg.msg_control = NULL; 140 smb_msg.msg_control = NULL;
125 smb_msg.msg_controllen = 0; 141 smb_msg.msg_controllen = 0;
@@ -220,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
220 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
221 } 237 }
222 238
223 if (rc < 0) { 239 if (rc < 0 && rc != -EINTR)
224 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
225 } else 241 else
226 rc = 0; 242 rc = 0;
227 243
228 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
244 return smb_sendv(server, &iov, 1); 260 return smb_sendv(server, &iov, 1);
245} 261}
246 262
247static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 263static int wait_for_free_request(struct TCP_Server_Info *server,
264 const int long_op)
248{ 265{
249 if (long_op == CIFS_ASYNC_OP) { 266 if (long_op == CIFS_ASYNC_OP) {
250 /* oplock breaks must not be held up */ 267 /* oplock breaks must not be held up */
251 atomic_inc(&ses->server->inFlight); 268 atomic_inc(&server->inFlight);
252 return 0; 269 return 0;
253 } 270 }
254 271
255 spin_lock(&GlobalMid_Lock); 272 spin_lock(&GlobalMid_Lock);
256 while (1) { 273 while (1) {
257 if (atomic_read(&ses->server->inFlight) >= 274 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
258 cifs_max_pending){
259 spin_unlock(&GlobalMid_Lock); 275 spin_unlock(&GlobalMid_Lock);
260#ifdef CONFIG_CIFS_STATS2 276#ifdef CONFIG_CIFS_STATS2
261 atomic_inc(&ses->server->num_waiters); 277 atomic_inc(&server->num_waiters);
262#endif 278#endif
263 wait_event(ses->server->request_q, 279 wait_event(server->request_q,
264 atomic_read(&ses->server->inFlight) 280 atomic_read(&server->inFlight)
265 < cifs_max_pending); 281 < cifs_max_pending);
266#ifdef CONFIG_CIFS_STATS2 282#ifdef CONFIG_CIFS_STATS2
267 atomic_dec(&ses->server->num_waiters); 283 atomic_dec(&server->num_waiters);
268#endif 284#endif
269 spin_lock(&GlobalMid_Lock); 285 spin_lock(&GlobalMid_Lock);
270 } else { 286 } else {
271 if (ses->server->tcpStatus == CifsExiting) { 287 if (server->tcpStatus == CifsExiting) {
272 spin_unlock(&GlobalMid_Lock); 288 spin_unlock(&GlobalMid_Lock);
273 return -ENOENT; 289 return -ENOENT;
274 } 290 }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
278 294
279 /* update # of requests on the wire to server */ 295 /* update # of requests on the wire to server */
280 if (long_op != CIFS_BLOCKING_OP) 296 if (long_op != CIFS_BLOCKING_OP)
281 atomic_inc(&ses->server->inFlight); 297 atomic_inc(&server->inFlight);
282 spin_unlock(&GlobalMid_Lock); 298 spin_unlock(&GlobalMid_Lock);
283 break; 299 break;
284 } 300 }
@@ -308,53 +324,85 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
308 *ppmidQ = AllocMidQEntry(in_buf, ses->server); 324 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
309 if (*ppmidQ == NULL) 325 if (*ppmidQ == NULL)
310 return -ENOMEM; 326 return -ENOMEM;
327 spin_lock(&GlobalMid_Lock);
328 list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
329 spin_unlock(&GlobalMid_Lock);
311 return 0; 330 return 0;
312} 331}
313 332
314static int wait_for_response(struct cifsSesInfo *ses, 333static int
315 struct mid_q_entry *midQ, 334wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
316 unsigned long timeout,
317 unsigned long time_to_wait)
318{ 335{
319 unsigned long curr_timeout; 336 int error;
320 337
321 for (;;) { 338 error = wait_event_killable(server->response_q,
322 curr_timeout = timeout + jiffies; 339 midQ->midState != MID_REQUEST_SUBMITTED);
323 wait_event_timeout(ses->server->response_q, 340 if (error < 0)
324 midQ->midState != MID_REQUEST_SUBMITTED, timeout); 341 return -ERESTARTSYS;
325 342
326 if (time_after(jiffies, curr_timeout) && 343 return 0;
327 (midQ->midState == MID_REQUEST_SUBMITTED) && 344}
328 ((ses->server->tcpStatus == CifsGood) ||
329 (ses->server->tcpStatus == CifsNew))) {
330 345
331 unsigned long lrt;
332 346
333 /* We timed out. Is the server still 347/*
334 sending replies ? */ 348 * Send a SMB request and set the callback function in the mid to handle
335 spin_lock(&GlobalMid_Lock); 349 * the result. Caller is responsible for dealing with timeouts.
336 lrt = ses->server->lstrp; 350 */
337 spin_unlock(&GlobalMid_Lock); 351int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
353 mid_callback_t *callback, void *cbdata)
354{
355 int rc;
356 struct mid_q_entry *mid;
338 357
339 /* Calculate time_to_wait past last receive time. 358 rc = wait_for_free_request(server, CIFS_ASYNC_OP);
340 Although we prefer not to time out if the 359 if (rc)
341 server is still responding - we will time 360 return rc;
342 out if the server takes more than 15 (or 45 361
343 or 180) seconds to respond to this request 362 /* enable signing if server requires it */
344 and has not responded to any request from 363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
345 other threads on the client within 10 seconds */ 364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
346 lrt += time_to_wait; 365
347 if (time_after(jiffies, lrt)) { 366 mutex_lock(&server->srv_mutex);
348 /* No replies for time_to_wait. */ 367 mid = AllocMidQEntry(in_buf, server);
349 cERROR(1, "server not responding"); 368 if (mid == NULL) {
350 return -1; 369 mutex_unlock(&server->srv_mutex);
351 } 370 return -ENOMEM;
352 } else {
353 return 0;
354 }
355 } 371 }
356}
357 372
373 /* put it on the pending_mid_q */
374 spin_lock(&GlobalMid_Lock);
375 list_add_tail(&mid->qhead, &server->pending_mid_q);
376 spin_unlock(&GlobalMid_Lock);
377
378 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
379 if (rc) {
380 mutex_unlock(&server->srv_mutex);
381 goto out_err;
382 }
383
384 mid->callback = callback;
385 mid->callback_data = cbdata;
386 mid->midState = MID_REQUEST_SUBMITTED;
387#ifdef CONFIG_CIFS_STATS2
388 atomic_inc(&server->inSend);
389#endif
390 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
391#ifdef CONFIG_CIFS_STATS2
392 atomic_dec(&server->inSend);
393 mid->when_sent = jiffies;
394#endif
395 mutex_unlock(&server->srv_mutex);
396 if (rc)
397 goto out_err;
398
399 return rc;
400out_err:
401 delete_mid(mid);
402 atomic_dec(&server->inFlight);
403 wake_up(&server->request_q);
404 return rc;
405}
358 406
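A usage sketch for cifs_call_async() (not part of this patch; handle_echo_response is a hypothetical name): the callback runs from the demultiplex thread when the response arrives, so it must not sleep, and it owns both the mid and the in-flight slot it was charged:

	static void
	handle_echo_response(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;

		/* inspect mid->resp_buf / mid->midState here, then clean up;
		 * DeleteMidQEntry also releases the response buffer */
		DeleteMidQEntry(mid);
		atomic_dec(&server->inFlight);
		wake_up(&server->request_q);
	}

	rc = cifs_call_async(server, in_buf, handle_echo_response, server);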
359/* 407/*
360 * 408 *
@@ -382,6 +430,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
382 return rc; 430 return rc;
383} 431}
384 432
433static int
434sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
435{
436 int rc = 0;
437
438 cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
439 mid->mid, mid->midState);
440
441 spin_lock(&GlobalMid_Lock);
442 /* ensure that it's no longer on the pending_mid_q */
443 list_del_init(&mid->qhead);
444
445 switch (mid->midState) {
446 case MID_RESPONSE_RECEIVED:
447 spin_unlock(&GlobalMid_Lock);
448 return rc;
449 case MID_REQUEST_SUBMITTED:
450 /* socket is going down, reject all calls */
451 if (server->tcpStatus == CifsExiting) {
452 cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
453 __func__, mid->mid, mid->command, mid->midState);
454 rc = -EHOSTDOWN;
455 break;
456 }
457 case MID_RETRY_NEEDED:
458 rc = -EAGAIN;
459 break;
460 default:
461 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
462 mid->mid, mid->midState);
463 rc = -EIO;
464 }
465 spin_unlock(&GlobalMid_Lock);
466
467 DeleteMidQEntry(mid);
468 return rc;
469}
470
471/*
472 * An NT cancel request header looks just like the original request except:
473 *
474 * The Command is SMB_COM_NT_CANCEL
475 * The WordCount is zeroed out
476 * The ByteCount is zeroed out
477 *
478 * This function mangles an existing request buffer into a
479 * SMB_COM_NT_CANCEL request and then sends it.
480 */
481static int
482send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
483 struct mid_q_entry *mid)
484{
485 int rc = 0;
486
487 /* -4 for RFC1001 length and +2 for BCC field */
488 in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
489 in_buf->Command = SMB_COM_NT_CANCEL;
490 in_buf->WordCount = 0;
491 put_bcc_le(0, in_buf);
492
493 mutex_lock(&server->srv_mutex);
494 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
495 if (rc) {
496 mutex_unlock(&server->srv_mutex);
497 return rc;
498 }
499 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
500 mutex_unlock(&server->srv_mutex);
501
502 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
503 in_buf->Mid, rc);
504
505 return rc;
506}
507
385int 508int
386SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 509SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
387 struct kvec *iov, int n_vec, int *pRespBufType /* ret */, 510 struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +513,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
390 int rc = 0; 513 int rc = 0;
391 int long_op; 514 int long_op;
392 unsigned int receive_len; 515 unsigned int receive_len;
393 unsigned long timeout;
394 struct mid_q_entry *midQ; 516 struct mid_q_entry *midQ;
395 struct smb_hdr *in_buf = iov[0].iov_base; 517 struct smb_hdr *in_buf = iov[0].iov_base;
396 518
@@ -413,7 +535,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
413 to the same server. We may make this configurable later or 535 to the same server. We may make this configurable later or
414 use ses->maxReq */ 536 use ses->maxReq */
415 537
416 rc = wait_for_free_request(ses, long_op); 538 rc = wait_for_free_request(ses->server, long_op);
417 if (rc) { 539 if (rc) {
418 cifs_small_buf_release(in_buf); 540 cifs_small_buf_release(in_buf);
419 return rc; 541 return rc;
@@ -452,70 +574,41 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
452#endif 574#endif
453 575
454 mutex_unlock(&ses->server->srv_mutex); 576 mutex_unlock(&ses->server->srv_mutex);
455 cifs_small_buf_release(in_buf);
456 577
457 if (rc < 0) 578 if (rc < 0) {
458 goto out; 579 cifs_small_buf_release(in_buf);
459
460 if (long_op == CIFS_STD_OP)
461 timeout = 15 * HZ;
462 else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
463 timeout = 180 * HZ;
464 else if (long_op == CIFS_LONG_OP)
465 timeout = 45 * HZ; /* should be greater than
466 server's oplock break timeout (about 43 seconds) */
467 else if (long_op == CIFS_ASYNC_OP)
468 goto out;
469 else if (long_op == CIFS_BLOCKING_OP)
470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
471 else {
472 cERROR(1, "unknown timeout flag %d", long_op);
473 rc = -EIO;
474 goto out; 580 goto out;
475 } 581 }
476 582
477 /* wait for 15 seconds or until woken up due to response arriving or 583 if (long_op == CIFS_ASYNC_OP) {
478 due to last connection to this server being unmounted */ 584 cifs_small_buf_release(in_buf);
479 if (signal_pending(current)) { 585 goto out;
480 /* if signal pending do not hold up user for full smb timeout
481 but we still give response a chance to complete */
482 timeout = 2 * HZ;
483 } 586 }
484 587
485 /* No user interrupts in wait - wreaks havoc with performance */ 588 rc = wait_for_response(ses->server, midQ);
486 wait_for_response(ses, midQ, timeout, 10 * HZ); 589 if (rc != 0) {
487 590 send_nt_cancel(ses->server, in_buf, midQ);
488 spin_lock(&GlobalMid_Lock); 591 spin_lock(&GlobalMid_Lock);
489
490 if (midQ->resp_buf == NULL) {
491 cERROR(1, "No response to cmd %d mid %d",
492 midQ->command, midQ->mid);
493 if (midQ->midState == MID_REQUEST_SUBMITTED) { 592 if (midQ->midState == MID_REQUEST_SUBMITTED) {
494 if (ses->server->tcpStatus == CifsExiting) 593 midQ->callback = DeleteMidQEntry;
495 rc = -EHOSTDOWN; 594 spin_unlock(&GlobalMid_Lock);
496 else { 595 cifs_small_buf_release(in_buf);
497 ses->server->tcpStatus = CifsNeedReconnect; 596 atomic_dec(&ses->server->inFlight);
498 midQ->midState = MID_RETRY_NEEDED; 597 wake_up(&ses->server->request_q);
499 } 598 return rc;
500 }
501
502 if (rc != -EHOSTDOWN) {
503 if (midQ->midState == MID_RETRY_NEEDED) {
504 rc = -EAGAIN;
505 cFYI(1, "marking request for retry");
506 } else {
507 rc = -EIO;
508 }
509 } 599 }
510 spin_unlock(&GlobalMid_Lock); 600 spin_unlock(&GlobalMid_Lock);
511 DeleteMidQEntry(midQ); 601 }
512 /* Update # of requests on wire to server */ 602
603 cifs_small_buf_release(in_buf);
604
605 rc = sync_mid_result(midQ, ses->server);
606 if (rc != 0) {
513 atomic_dec(&ses->server->inFlight); 607 atomic_dec(&ses->server->inFlight);
514 wake_up(&ses->server->request_q); 608 wake_up(&ses->server->request_q);
515 return rc; 609 return rc;
516 } 610 }
517 611
518 spin_unlock(&GlobalMid_Lock);
519 receive_len = midQ->resp_buf->smb_buf_length; 612 receive_len = midQ->resp_buf->smb_buf_length;
520 613
521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 614 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -543,7 +636,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 636 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
544 SECMODE_SIGN_ENABLED))) { 637 SECMODE_SIGN_ENABLED))) {
545 rc = cifs_verify_signature(midQ->resp_buf, 638 rc = cifs_verify_signature(midQ->resp_buf,
546 &ses->server->mac_signing_key, 639 ses->server,
547 midQ->sequence_number+1); 640 midQ->sequence_number+1);
548 if (rc) { 641 if (rc) {
549 cERROR(1, "Unexpected SMB signature"); 642 cERROR(1, "Unexpected SMB signature");
@@ -559,19 +652,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
559 if (receive_len >= sizeof(struct smb_hdr) - 4 652 if (receive_len >= sizeof(struct smb_hdr) - 4
560 /* do not count RFC1001 header */ + 653 /* do not count RFC1001 header */ +
561 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) 654 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
562 BCC(midQ->resp_buf) = 655 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
563 le16_to_cpu(BCC_LE(midQ->resp_buf));
564 if ((flags & CIFS_NO_RESP) == 0) 656 if ((flags & CIFS_NO_RESP) == 0)
565 midQ->resp_buf = NULL; /* mark it so buf will 657 midQ->resp_buf = NULL; /* mark it so buf will
566 not be freed by 658 not be freed by
567 DeleteMidQEntry */ 659 delete_mid */
568 } else { 660 } else {
569 rc = -EIO; 661 rc = -EIO;
570 cFYI(1, "Bad MID state?"); 662 cFYI(1, "Bad MID state?");
571 } 663 }
572 664
573out: 665out:
574 DeleteMidQEntry(midQ); 666 delete_mid(midQ);
575 atomic_dec(&ses->server->inFlight); 667 atomic_dec(&ses->server->inFlight);
576 wake_up(&ses->server->request_q); 668 wake_up(&ses->server->request_q);
577 669
@@ -585,7 +677,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
585{ 677{
586 int rc = 0; 678 int rc = 0;
587 unsigned int receive_len; 679 unsigned int receive_len;
588 unsigned long timeout;
589 struct mid_q_entry *midQ; 680 struct mid_q_entry *midQ;
590 681
591 if (ses == NULL) { 682 if (ses == NULL) {
@@ -610,7 +701,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
610 return -EIO; 701 return -EIO;
611 } 702 }
612 703
613 rc = wait_for_free_request(ses, long_op); 704 rc = wait_for_free_request(ses->server, long_op);
614 if (rc) 705 if (rc)
615 return rc; 706 return rc;
616 707
@@ -649,64 +740,31 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
649 if (rc < 0) 740 if (rc < 0)
650 goto out; 741 goto out;
651 742
652 if (long_op == CIFS_STD_OP) 743 if (long_op == CIFS_ASYNC_OP)
653 timeout = 15 * HZ;
654 /* wait for 15 seconds or until woken up due to response arriving or
655 due to last connection to this server being unmounted */
656 else if (long_op == CIFS_ASYNC_OP)
657 goto out;
658 else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
659 timeout = 180 * HZ;
660 else if (long_op == CIFS_LONG_OP)
661 timeout = 45 * HZ; /* should be greater than
662 server's oplock break timeout (about 43 seconds) */
663 else if (long_op == CIFS_BLOCKING_OP)
664 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
665 else {
666 cERROR(1, "unknown timeout flag %d", long_op);
667 rc = -EIO;
668 goto out; 744 goto out;
669 }
670
671 if (signal_pending(current)) {
672 /* if signal pending do not hold up user for full smb timeout
673 but we still give response a chance to complete */
674 timeout = 2 * HZ;
675 }
676
677 /* No user interrupts in wait - wreaks havoc with performance */
678 wait_for_response(ses, midQ, timeout, 10 * HZ);
679 745
680 spin_lock(&GlobalMid_Lock); 746 rc = wait_for_response(ses->server, midQ);
681 if (midQ->resp_buf == NULL) { 747 if (rc != 0) {
682 cERROR(1, "No response for cmd %d mid %d", 748 send_nt_cancel(ses->server, in_buf, midQ);
683 midQ->command, midQ->mid); 749 spin_lock(&GlobalMid_Lock);
684 if (midQ->midState == MID_REQUEST_SUBMITTED) { 750 if (midQ->midState == MID_REQUEST_SUBMITTED) {
685 if (ses->server->tcpStatus == CifsExiting) 751 /* no longer considered to be "in-flight" */
686 rc = -EHOSTDOWN; 752 midQ->callback = DeleteMidQEntry;
687 else { 753 spin_unlock(&GlobalMid_Lock);
688 ses->server->tcpStatus = CifsNeedReconnect; 754 atomic_dec(&ses->server->inFlight);
689 midQ->midState = MID_RETRY_NEEDED; 755 wake_up(&ses->server->request_q);
690 } 756 return rc;
691 }
692
693 if (rc != -EHOSTDOWN) {
694 if (midQ->midState == MID_RETRY_NEEDED) {
695 rc = -EAGAIN;
696 cFYI(1, "marking request for retry");
697 } else {
698 rc = -EIO;
699 }
700 } 757 }
701 spin_unlock(&GlobalMid_Lock); 758 spin_unlock(&GlobalMid_Lock);
702 DeleteMidQEntry(midQ); 759 }
703 /* Update # of requests on wire to server */ 760
761 rc = sync_mid_result(midQ, ses->server);
762 if (rc != 0) {
704 atomic_dec(&ses->server->inFlight); 763 atomic_dec(&ses->server->inFlight);
705 wake_up(&ses->server->request_q); 764 wake_up(&ses->server->request_q);
706 return rc; 765 return rc;
707 } 766 }
708 767
709 spin_unlock(&GlobalMid_Lock);
710 receive_len = midQ->resp_buf->smb_buf_length; 768 receive_len = midQ->resp_buf->smb_buf_length;
711 769
712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 770 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -731,7 +789,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 789 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
732 SECMODE_SIGN_ENABLED))) { 790 SECMODE_SIGN_ENABLED))) {
733 rc = cifs_verify_signature(out_buf, 791 rc = cifs_verify_signature(out_buf,
734 &ses->server->mac_signing_key, 792 ses->server,
735 midQ->sequence_number+1); 793 midQ->sequence_number+1);
736 if (rc) { 794 if (rc) {
737 cERROR(1, "Unexpected SMB signature"); 795 cERROR(1, "Unexpected SMB signature");
@@ -748,43 +806,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
748 if (receive_len >= sizeof(struct smb_hdr) - 4 806 if (receive_len >= sizeof(struct smb_hdr) - 4
749 /* do not count RFC1001 header */ + 807 /* do not count RFC1001 header */ +
750 (2 * out_buf->WordCount) + 2 /* bcc */ ) 808 (2 * out_buf->WordCount) + 2 /* bcc */ )
751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 809 put_bcc(get_bcc_le(out_buf), out_buf);
752 } else { 810 } else {
753 rc = -EIO; 811 rc = -EIO;
754 cERROR(1, "Bad MID state?"); 812 cERROR(1, "Bad MID state?");
755 } 813 }
756 814
757out: 815out:
758 DeleteMidQEntry(midQ); 816 delete_mid(midQ);
759 atomic_dec(&ses->server->inFlight); 817 atomic_dec(&ses->server->inFlight);
760 wake_up(&ses->server->request_q); 818 wake_up(&ses->server->request_q);
761 819
762 return rc; 820 return rc;
763} 821}
764 822
765/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
766
767static int
768send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
769 struct mid_q_entry *midQ)
770{
771 int rc = 0;
772 struct cifsSesInfo *ses = tcon->ses;
773 __u16 mid = in_buf->Mid;
774
775 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
776 in_buf->Mid = mid;
777 mutex_lock(&ses->server->srv_mutex);
778 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
779 if (rc) {
780 mutex_unlock(&ses->server->srv_mutex);
781 return rc;
782 }
783 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
784 mutex_unlock(&ses->server->srv_mutex);
785 return rc;
786}
787
788/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows 823/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
789 blocking lock to return. */ 824 blocking lock to return. */
790 825
@@ -807,7 +842,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
807 pSMB->hdr.Mid = GetNextMid(ses->server); 842 pSMB->hdr.Mid = GetNextMid(ses->server);
808 843
809 return SendReceive(xid, ses, in_buf, out_buf, 844 return SendReceive(xid, ses, in_buf, out_buf,
810 &bytes_returned, CIFS_STD_OP); 845 &bytes_returned, 0);
811} 846}
812 847
813int 848int
@@ -845,7 +880,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
845 return -EIO; 880 return -EIO;
846 } 881 }
847 882
848 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); 883 rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
849 if (rc) 884 if (rc)
850 return rc; 885 return rc;
851 886
@@ -863,7 +898,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
863 898
864 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 899 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
865 if (rc) { 900 if (rc) {
866 DeleteMidQEntry(midQ); 901 delete_mid(midQ);
867 mutex_unlock(&ses->server->srv_mutex); 902 mutex_unlock(&ses->server->srv_mutex);
868 return rc; 903 return rc;
869 } 904 }
@@ -880,7 +915,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
880 mutex_unlock(&ses->server->srv_mutex); 915 mutex_unlock(&ses->server->srv_mutex);
881 916
882 if (rc < 0) { 917 if (rc < 0) {
883 DeleteMidQEntry(midQ); 918 delete_mid(midQ);
884 return rc; 919 return rc;
885 } 920 }
886 921
@@ -899,10 +934,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
899 if (in_buf->Command == SMB_COM_TRANSACTION2) { 934 if (in_buf->Command == SMB_COM_TRANSACTION2) {
900 /* POSIX lock. We send a NT_CANCEL SMB to cause the 935 /* POSIX lock. We send a NT_CANCEL SMB to cause the
901 blocking lock to return. */ 936 blocking lock to return. */
902 937 rc = send_nt_cancel(ses->server, in_buf, midQ);
903 rc = send_nt_cancel(tcon, in_buf, midQ);
904 if (rc) { 938 if (rc) {
905 DeleteMidQEntry(midQ); 939 delete_mid(midQ);
906 return rc; 940 return rc;
907 } 941 }
908 } else { 942 } else {
@@ -914,47 +948,33 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
914 /* If we get -ENOLCK back the lock may have 948 /* If we get -ENOLCK back the lock may have
915 already been removed. Don't exit in this case. */ 949 already been removed. Don't exit in this case. */
916 if (rc && rc != -ENOLCK) { 950 if (rc && rc != -ENOLCK) {
917 DeleteMidQEntry(midQ); 951 delete_mid(midQ);
918 return rc; 952 return rc;
919 } 953 }
920 } 954 }
921 955
922 /* Wait 5 seconds for the response. */ 956 rc = wait_for_response(ses->server, midQ);
923 if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) { 957 if (rc) {
924 /* We got the response - restart system call. */ 958 send_nt_cancel(ses->server, in_buf, midQ);
925 rstart = 1; 959 spin_lock(&GlobalMid_Lock);
926 } 960 if (midQ->midState == MID_REQUEST_SUBMITTED) {
927 } 961 /* no longer considered to be "in-flight" */
928 962 midQ->callback = DeleteMidQEntry;
929 spin_lock(&GlobalMid_Lock); 963 spin_unlock(&GlobalMid_Lock);
930 if (midQ->resp_buf) { 964 return rc;
931 spin_unlock(&GlobalMid_Lock);
932 receive_len = midQ->resp_buf->smb_buf_length;
933 } else {
934 cERROR(1, "No response for cmd %d mid %d",
935 midQ->command, midQ->mid);
936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
937 if (ses->server->tcpStatus == CifsExiting)
938 rc = -EHOSTDOWN;
939 else {
940 ses->server->tcpStatus = CifsNeedReconnect;
941 midQ->midState = MID_RETRY_NEEDED;
942 } 965 }
966 spin_unlock(&GlobalMid_Lock);
943 } 967 }
944 968
945 if (rc != -EHOSTDOWN) { 969 /* We got the response - restart system call. */
946 if (midQ->midState == MID_RETRY_NEEDED) { 970 rstart = 1;
947 rc = -EAGAIN;
948 cFYI(1, "marking request for retry");
949 } else {
950 rc = -EIO;
951 }
952 }
953 spin_unlock(&GlobalMid_Lock);
954 DeleteMidQEntry(midQ);
955 return rc;
956 } 971 }
957 972
973 rc = sync_mid_result(midQ, ses->server);
974 if (rc != 0)
975 return rc;
976
977 receive_len = midQ->resp_buf->smb_buf_length;
958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 978 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
959 cERROR(1, "Frame too large received. Length: %d Xid: %d", 979 cERROR(1, "Frame too large received. Length: %d Xid: %d",
960 receive_len, xid); 980 receive_len, xid);
@@ -981,7 +1001,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 1001 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
982 SECMODE_SIGN_ENABLED))) { 1002 SECMODE_SIGN_ENABLED))) {
983 rc = cifs_verify_signature(out_buf, 1003 rc = cifs_verify_signature(out_buf,
984 &ses->server->mac_signing_key, 1004 ses->server,
985 midQ->sequence_number+1); 1005 midQ->sequence_number+1);
986 if (rc) { 1006 if (rc) {
987 cERROR(1, "Unexpected SMB signature"); 1007 cERROR(1, "Unexpected SMB signature");
@@ -998,10 +1018,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
998 if (receive_len >= sizeof(struct smb_hdr) - 4 1018 if (receive_len >= sizeof(struct smb_hdr) - 4
999 /* do not count RFC1001 header */ + 1019 /* do not count RFC1001 header */ +
1000 (2 * out_buf->WordCount) + 2 /* bcc */ ) 1020 (2 * out_buf->WordCount) + 2 /* bcc */ )
1001 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 1021 put_bcc(get_bcc_le(out_buf), out_buf);
1002 1022
1003out: 1023out:
1004 DeleteMidQEntry(midQ); 1024 delete_mid(midQ);
1005 if (rstart && rc == -EACCES) 1025 if (rstart && rc == -EACCES)
1006 return -ERESTARTSYS; 1026 return -ERESTARTSYS;
1007 return rc; 1027 return rc;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa6..eae2a1491608 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
30 30
31#define MAX_EA_VALUE_SIZE 65535 31#define MAX_EA_VALUE_SIZE 65535
32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" 32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
33#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
33#define CIFS_XATTR_USER_PREFIX "user." 34#define CIFS_XATTR_USER_PREFIX "user."
34#define CIFS_XATTR_SYSTEM_PREFIX "system." 35#define CIFS_XATTR_SYSTEM_PREFIX "system."
35#define CIFS_XATTR_OS2_PREFIX "os2." 36#define CIFS_XATTR_OS2_PREFIX "os2."
36#define CIFS_XATTR_SECURITY_PREFIX ".security" 37#define CIFS_XATTR_SECURITY_PREFIX "security."
37#define CIFS_XATTR_TRUSTED_PREFIX "trusted." 38#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
38#define XATTR_TRUSTED_PREFIX_LEN 8 39#define XATTR_TRUSTED_PREFIX_LEN 8
39#define XATTR_SECURITY_PREFIX_LEN 9 40#define XATTR_SECURITY_PREFIX_LEN 9
@@ -47,9 +48,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
47#ifdef CONFIG_CIFS_XATTR 48#ifdef CONFIG_CIFS_XATTR
48 int xid; 49 int xid;
49 struct cifs_sb_info *cifs_sb; 50 struct cifs_sb_info *cifs_sb;
51 struct tcon_link *tlink;
50 struct cifsTconInfo *pTcon; 52 struct cifsTconInfo *pTcon;
51 struct super_block *sb; 53 struct super_block *sb;
52 char *full_path; 54 char *full_path = NULL;
53 55
54 if (direntry == NULL) 56 if (direntry == NULL)
55 return -EIO; 57 return -EIO;
@@ -58,16 +60,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
58 sb = direntry->d_inode->i_sb; 60 sb = direntry->d_inode->i_sb;
59 if (sb == NULL) 61 if (sb == NULL)
60 return -EIO; 62 return -EIO;
61 xid = GetXid();
62 63
63 cifs_sb = CIFS_SB(sb); 64 cifs_sb = CIFS_SB(sb);
64 pTcon = cifs_sb->tcon; 65 tlink = cifs_sb_tlink(cifs_sb);
66 if (IS_ERR(tlink))
67 return PTR_ERR(tlink);
68 pTcon = tlink_tcon(tlink);
69
70 xid = GetXid();
65 71
66 full_path = build_path_from_dentry(direntry); 72 full_path = build_path_from_dentry(direntry);
67 if (full_path == NULL) { 73 if (full_path == NULL) {
68 rc = -ENOMEM; 74 rc = -ENOMEM;
69 FreeXid(xid); 75 goto remove_ea_exit;
70 return rc;
71 } 76 }
72 if (ea_name == NULL) { 77 if (ea_name == NULL) {
73 cFYI(1, "Null xattr names not supported"); 78 cFYI(1, "Null xattr names not supported");
@@ -91,6 +96,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
91remove_ea_exit: 96remove_ea_exit:
92 kfree(full_path); 97 kfree(full_path);
93 FreeXid(xid); 98 FreeXid(xid);
99 cifs_put_tlink(tlink);
94#endif 100#endif
95 return rc; 101 return rc;
96} 102}
@@ -102,6 +108,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
102#ifdef CONFIG_CIFS_XATTR 108#ifdef CONFIG_CIFS_XATTR
103 int xid; 109 int xid;
104 struct cifs_sb_info *cifs_sb; 110 struct cifs_sb_info *cifs_sb;
111 struct tcon_link *tlink;
105 struct cifsTconInfo *pTcon; 112 struct cifsTconInfo *pTcon;
106 struct super_block *sb; 113 struct super_block *sb;
107 char *full_path; 114 char *full_path;
@@ -113,16 +120,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
113 sb = direntry->d_inode->i_sb; 120 sb = direntry->d_inode->i_sb;
114 if (sb == NULL) 121 if (sb == NULL)
115 return -EIO; 122 return -EIO;
116 xid = GetXid();
117 123
118 cifs_sb = CIFS_SB(sb); 124 cifs_sb = CIFS_SB(sb);
119 pTcon = cifs_sb->tcon; 125 tlink = cifs_sb_tlink(cifs_sb);
126 if (IS_ERR(tlink))
127 return PTR_ERR(tlink);
128 pTcon = tlink_tcon(tlink);
129
130 xid = GetXid();
120 131
121 full_path = build_path_from_dentry(direntry); 132 full_path = build_path_from_dentry(direntry);
122 if (full_path == NULL) { 133 if (full_path == NULL) {
123 rc = -ENOMEM; 134 rc = -ENOMEM;
124 FreeXid(xid); 135 goto set_ea_exit;
125 return rc;
126 } 136 }
127 /* return dos attributes as pseudo xattr */ 137 /* return dos attributes as pseudo xattr */
128 /* return alt name if available as pseudo attr */ 138 /* return alt name if available as pseudo attr */
@@ -132,9 +142,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
132 returns as xattrs */ 142 returns as xattrs */
133 if (value_size > MAX_EA_VALUE_SIZE) { 143 if (value_size > MAX_EA_VALUE_SIZE) {
134 cFYI(1, "size of EA value too large"); 144 cFYI(1, "size of EA value too large");
135 kfree(full_path); 145 rc = -EOPNOTSUPP;
136 FreeXid(xid); 146 goto set_ea_exit;
137 return -EOPNOTSUPP;
138 } 147 }
139 148
140 if (ea_name == NULL) { 149 if (ea_name == NULL) {
@@ -198,6 +207,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
198set_ea_exit: 207set_ea_exit:
199 kfree(full_path); 208 kfree(full_path);
200 FreeXid(xid); 209 FreeXid(xid);
210 cifs_put_tlink(tlink);
201#endif 211#endif
202 return rc; 212 return rc;
203} 213}
@@ -209,6 +219,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
209#ifdef CONFIG_CIFS_XATTR 219#ifdef CONFIG_CIFS_XATTR
210 int xid; 220 int xid;
211 struct cifs_sb_info *cifs_sb; 221 struct cifs_sb_info *cifs_sb;
222 struct tcon_link *tlink;
212 struct cifsTconInfo *pTcon; 223 struct cifsTconInfo *pTcon;
213 struct super_block *sb; 224 struct super_block *sb;
214 char *full_path; 225 char *full_path;
@@ -221,16 +232,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
221 if (sb == NULL) 232 if (sb == NULL)
222 return -EIO; 233 return -EIO;
223 234
224 xid = GetXid();
225
226 cifs_sb = CIFS_SB(sb); 235 cifs_sb = CIFS_SB(sb);
227 pTcon = cifs_sb->tcon; 236 tlink = cifs_sb_tlink(cifs_sb);
237 if (IS_ERR(tlink))
238 return PTR_ERR(tlink);
239 pTcon = tlink_tcon(tlink);
240
241 xid = GetXid();
228 242
229 full_path = build_path_from_dentry(direntry); 243 full_path = build_path_from_dentry(direntry);
230 if (full_path == NULL) { 244 if (full_path == NULL) {
231 rc = -ENOMEM; 245 rc = -ENOMEM;
232 FreeXid(xid); 246 goto get_ea_exit;
233 return rc;
234 } 247 }
235 /* return dos attributes as pseudo xattr */ 248 /* return dos attributes as pseudo xattr */
236 /* return alt name if available as pseudo attr */ 249 /* return alt name if available as pseudo attr */
@@ -265,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
265 cifs_sb->local_nls, 278 cifs_sb->local_nls,
266 cifs_sb->mnt_cifs_flags & 279 cifs_sb->mnt_cifs_flags &
267 CIFS_MOUNT_MAP_SPECIAL_CHR); 280 CIFS_MOUNT_MAP_SPECIAL_CHR);
268#ifdef CONFIG_CIFS_EXPERIMENTAL
269 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
270 __u16 fid;
271 int oplock = 0;
272 struct cifs_ntsd *pacl = NULL;
273 __u32 buflen = 0;
274 if (experimEnabled)
275 rc = CIFSSMBOpen(xid, pTcon, full_path,
276 FILE_OPEN, GENERIC_READ, 0, &fid,
277 &oplock, NULL, cifs_sb->local_nls,
278 cifs_sb->mnt_cifs_flags &
279 CIFS_MOUNT_MAP_SPECIAL_CHR);
280 /* else rc is EOPNOTSUPP from above */
281
282 if (rc == 0) {
283 rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
284 &buflen);
285 CIFSSMBClose(xid, pTcon, fid);
286 }
287 }
288#endif /* EXPERIMENTAL */
289#else 281#else
290 cFYI(1, "query POSIX ACL not supported yet"); 282 cFYI(1, "Query POSIX ACL not supported yet");
291#endif /* CONFIG_CIFS_POSIX */ 283#endif /* CONFIG_CIFS_POSIX */
292 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 284 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
293 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 285 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -299,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
299 cifs_sb->mnt_cifs_flags & 291 cifs_sb->mnt_cifs_flags &
300 CIFS_MOUNT_MAP_SPECIAL_CHR); 292 CIFS_MOUNT_MAP_SPECIAL_CHR);
301#else 293#else
302 cFYI(1, "query POSIX default ACL not supported yet"); 294 cFYI(1, "Query POSIX default ACL not supported yet");
303#endif 295#endif /* CONFIG_CIFS_POSIX */
296 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
297 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
298#ifdef CONFIG_CIFS_ACL
299 u32 acllen;
300 struct cifs_ntsd *pacl;
301
302 pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
303 full_path, &acllen);
304 if (IS_ERR(pacl)) {
305 rc = PTR_ERR(pacl);
306 cERROR(1, "%s: error %zd getting sec desc",
307 __func__, rc);
308 } else {
309 if (ea_value) {
310 if (acllen > buf_size)
311 acllen = -ERANGE;
312 else
313 memcpy(ea_value, pacl, acllen);
314 }
315 rc = acllen;
316 kfree(pacl);
317 }
318#else
319 cFYI(1, "Query CIFS ACL not supported yet");
320#endif /* CONFIG_CIFS_ACL */
304 } else if (strncmp(ea_name, 321 } else if (strncmp(ea_name,
305 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 322 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
306 cFYI(1, "Trusted xattr namespace not supported yet"); 323 cFYI(1, "Trusted xattr namespace not supported yet");
@@ -323,6 +340,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
323get_ea_exit: 340get_ea_exit:
324 kfree(full_path); 341 kfree(full_path);
325 FreeXid(xid); 342 FreeXid(xid);
343 cifs_put_tlink(tlink);
326#endif 344#endif
327 return rc; 345 return rc;
328} 346}
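From userspace, the new "system.cifs_acl" handler above can be exercised with getxattr(2); a minimal sketch (the mount path and buffer size are assumptions). Since the handler returns -ERANGE when the descriptor does not fit, the usual pattern is to query the required length first by passing a zero-sized buffer:

	#include <sys/xattr.h>

	char buf[4096];		/* assumed large enough for the descriptor */
	ssize_t len = getxattr("/mnt/cifs/file", "system.cifs_acl",
			       buf, sizeof(buf));
	if (len < 0)
		perror("getxattr");	/* ERANGE => retry with a larger buffer */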
@@ -333,6 +351,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
333#ifdef CONFIG_CIFS_XATTR 351#ifdef CONFIG_CIFS_XATTR
334 int xid; 352 int xid;
335 struct cifs_sb_info *cifs_sb; 353 struct cifs_sb_info *cifs_sb;
354 struct tcon_link *tlink;
336 struct cifsTconInfo *pTcon; 355 struct cifsTconInfo *pTcon;
337 struct super_block *sb; 356 struct super_block *sb;
338 char *full_path; 357 char *full_path;
@@ -346,18 +365,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
346 return -EIO; 365 return -EIO;
347 366
348 cifs_sb = CIFS_SB(sb); 367 cifs_sb = CIFS_SB(sb);
349 pTcon = cifs_sb->tcon;
350
351 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 368 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
352 return -EOPNOTSUPP; 369 return -EOPNOTSUPP;
353 370
371 tlink = cifs_sb_tlink(cifs_sb);
372 if (IS_ERR(tlink))
373 return PTR_ERR(tlink);
374 pTcon = tlink_tcon(tlink);
375
354 xid = GetXid(); 376 xid = GetXid();
355 377
356 full_path = build_path_from_dentry(direntry); 378 full_path = build_path_from_dentry(direntry);
357 if (full_path == NULL) { 379 if (full_path == NULL) {
358 rc = -ENOMEM; 380 rc = -ENOMEM;
359 FreeXid(xid); 381 goto list_ea_exit;
360 return rc;
361 } 382 }
362 /* return dos attributes as pseudo xattr */ 383 /* return dos attributes as pseudo xattr */
363 /* return alt name if available as pseudo attr */ 384 /* return alt name if available as pseudo attr */
@@ -370,8 +391,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
370 cifs_sb->mnt_cifs_flags & 391 cifs_sb->mnt_cifs_flags &
371 CIFS_MOUNT_MAP_SPECIAL_CHR); 392 CIFS_MOUNT_MAP_SPECIAL_CHR);
372 393
394list_ea_exit:
373 kfree(full_path); 395 kfree(full_path);
374 FreeXid(xid); 396 FreeXid(xid);
397 cifs_put_tlink(tlink);
375#endif 398#endif
376 return rc; 399 return rc;
377} 400}
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..690157876184 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,12 +17,12 @@
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h>
20 21
21#include <linux/coda.h> 22#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_psdev.h> 23#include <linux/coda_psdev.h>
24#include <linux/coda_fs_i.h> 24#include "coda_linux.h"
25#include <linux/coda_cache.h> 25#include "coda_cache.h"
26 26
27static atomic_t permission_epoch = ATOMIC_INIT(0); 27static atomic_t permission_epoch = ATOMIC_INIT(0);
28 28
@@ -31,19 +31,23 @@ void coda_cache_enter(struct inode *inode, int mask)
31{ 31{
32 struct coda_inode_info *cii = ITOC(inode); 32 struct coda_inode_info *cii = ITOC(inode);
33 33
34 spin_lock(&cii->c_lock);
34 cii->c_cached_epoch = atomic_read(&permission_epoch); 35 cii->c_cached_epoch = atomic_read(&permission_epoch);
35 if (cii->c_uid != current_fsuid()) { 36 if (cii->c_uid != current_fsuid()) {
36 cii->c_uid = current_fsuid(); 37 cii->c_uid = current_fsuid();
37 cii->c_cached_perm = mask; 38 cii->c_cached_perm = mask;
38 } else 39 } else
39 cii->c_cached_perm |= mask; 40 cii->c_cached_perm |= mask;
41 spin_unlock(&cii->c_lock);
40} 42}
41 43
42/* remove cached acl from an inode */ 44/* remove cached acl from an inode */
43void coda_cache_clear_inode(struct inode *inode) 45void coda_cache_clear_inode(struct inode *inode)
44{ 46{
45 struct coda_inode_info *cii = ITOC(inode); 47 struct coda_inode_info *cii = ITOC(inode);
48 spin_lock(&cii->c_lock);
46 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; 49 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
50 spin_unlock(&cii->c_lock);
47} 51}
48 52
49/* remove all acl caches */ 53/* remove all acl caches */
@@ -57,13 +61,15 @@ void coda_cache_clear_all(struct super_block *sb)
57int coda_cache_check(struct inode *inode, int mask) 61int coda_cache_check(struct inode *inode, int mask)
58{ 62{
59 struct coda_inode_info *cii = ITOC(inode); 63 struct coda_inode_info *cii = ITOC(inode);
60 int hit; 64 int hit;
61 65
62 hit = (mask & cii->c_cached_perm) == mask && 66 spin_lock(&cii->c_lock);
63 cii->c_uid == current_fsuid() && 67 hit = (mask & cii->c_cached_perm) == mask &&
64 cii->c_cached_epoch == atomic_read(&permission_epoch); 68 cii->c_uid == current_fsuid() &&
69 cii->c_cached_epoch == atomic_read(&permission_epoch);
70 spin_unlock(&cii->c_lock);
65 71
66 return hit; 72 return hit;
67} 73}
68 74
69 75
@@ -86,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
86 struct list_head *child; 92 struct list_head *child;
87 struct dentry *de; 93 struct dentry *de;
88 94
89 spin_lock(&dcache_lock); 95 spin_lock(&parent->d_lock);
90 list_for_each(child, &parent->d_subdirs) 96 list_for_each(child, &parent->d_subdirs)
91 { 97 {
92 de = list_entry(child, struct dentry, d_u.d_child); 98 de = list_entry(child, struct dentry, d_u.d_child);
@@ -95,7 +101,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
95 continue; 101 continue;
96 coda_flag_inode(de->d_inode, flag); 102 coda_flag_inode(de->d_inode, flag);
97 } 103 }
98 spin_unlock(&dcache_lock); 104 spin_unlock(&parent->d_lock);
99 return; 105 return;
100} 106}
101 107
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..6475877b0763 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
7#include <linux/time.h> 7#include <linux/time.h>
8 8
9#include <linux/coda.h> 9#include <linux/coda.h>
10#include <linux/coda_linux.h>
11#include <linux/coda_fs_i.h>
12#include <linux/coda_psdev.h> 10#include <linux/coda_psdev.h>
11#include "coda_linux.h"
13 12
14static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) 13static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
15{ 14{
@@ -45,13 +44,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
45static int coda_test_inode(struct inode *inode, void *data) 44static int coda_test_inode(struct inode *inode, void *data)
46{ 45{
47 struct CodaFid *fid = (struct CodaFid *)data; 46 struct CodaFid *fid = (struct CodaFid *)data;
48 return coda_fideq(&(ITOC(inode)->c_fid), fid); 47 struct coda_inode_info *cii = ITOC(inode);
48 return coda_fideq(&cii->c_fid, fid);
49} 49}
50 50
51static int coda_set_inode(struct inode *inode, void *data) 51static int coda_set_inode(struct inode *inode, void *data)
52{ 52{
53 struct CodaFid *fid = (struct CodaFid *)data; 53 struct CodaFid *fid = (struct CodaFid *)data;
54 ITOC(inode)->c_fid = *fid; 54 struct coda_inode_info *cii = ITOC(inode);
55 cii->c_fid = *fid;
55 return 0; 56 return 0;
56} 57}
57 58
@@ -71,6 +72,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
71 cii = ITOC(inode); 72 cii = ITOC(inode);
72 /* we still need to set i_ino for things like stat(2) */ 73 /* we still need to set i_ino for things like stat(2) */
73 inode->i_ino = hash; 74 inode->i_ino = hash;
75 /* inode is locked and unique, no need to grab cii->c_lock */
74 cii->c_mapcount = 0; 76 cii->c_mapcount = 0;
75 unlock_new_inode(inode); 77 unlock_new_inode(inode);
76 } 78 }
@@ -107,14 +109,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
107} 109}
108 110
109 111
112/* Although we treat Coda file identifiers as immutable, there is one
113 * special case for files created during a disconnection where they may
114 * not be globally unique. When an identifier collision is detected we
115 * first try to flush the cached inode from the kernel and finally
116 * resort to renaming/rehashing in-place. Userspace remembers both old
117 * and new values of the identifier to handle any in-flight upcalls.
118 * The real solution is to use globally unique UUIDs as identifiers, but
119 * retrofitting the existing userspace code for this is non-trivial. */
110void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 120void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid,
111 struct CodaFid *newfid) 121 struct CodaFid *newfid)
112{ 122{
113 struct coda_inode_info *cii; 123 struct coda_inode_info *cii = ITOC(inode);
114 unsigned long hash = coda_f2i(newfid); 124 unsigned long hash = coda_f2i(newfid);
115 125
116 cii = ITOC(inode);
117
118 BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); 126 BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
119 127
120 /* replace fid and rehash inode */ 128 /* replace fid and rehash inode */
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 000000000000..c910b5eb1ceb
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
1/* Coda filesystem -- Linux Minicache
2 *
3 * Copyright (C) 1989 - 1997 Carnegie Mellon University
4 *
5 * Carnegie Mellon University encourages users of this software to
6 * contribute improvements to the Coda project. Contact Peter Braam
7 * <coda@cs.cmu.edu>
8 */
9
10#ifndef _CFSNC_HEADER_
11#define _CFSNC_HEADER_
12
13/* credential cache */
14void coda_cache_enter(struct inode *inode, int mask);
15void coda_cache_clear_inode(struct inode *);
16void coda_cache_clear_all(struct super_block *sb);
17int coda_cache_check(struct inode *inode, int mask);
18
19/* for downcalls and attributes and lookups */
20void coda_flag_inode_children(struct inode *inode, int flag);
21
22#endif /* _CFSNC_HEADER_ */
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 000000000000..e35071b1de0e
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
1/*
2 * coda_fs_i.h
3 *
4 * Copyright (C) 1998 Carnegie Mellon University
5 *
6 */
7
8#ifndef _LINUX_CODA_FS_I
9#define _LINUX_CODA_FS_I
10
11#include <linux/types.h>
12#include <linux/list.h>
13#include <linux/spinlock.h>
14#include <linux/coda.h>
15
16/*
17 * coda fs inode data
18 * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
19 * c_cached_perm.
20 * vfs_inode is set only when the inode is created and never changes.
21 * c_fid is set when the inode is created and should be considered immutable.
22 */
23struct coda_inode_info {
24 struct CodaFid c_fid; /* Coda identifier */
25 u_short c_flags; /* flags (see below) */
26 unsigned int c_mapcount; /* nr of times this inode is mapped */
27 unsigned int c_cached_epoch; /* epoch for cached permissions */
28 vuid_t c_uid; /* fsuid for cached permissions */
29 unsigned int c_cached_perm; /* cached access permissions */
30 spinlock_t c_lock;
31 struct inode vfs_inode;
32};
33
34/*
35 * coda fs file private data
36 */
37#define CODA_MAGIC 0xC0DAC0DA
38struct coda_file_info {
39 int cfi_magic; /* magic number */
40 struct file *cfi_container; /* container file for this cnode */
41 unsigned int cfi_mapcount; /* nr of times this file is mapped */
42};
43
44#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
45
46/* flags */
47#define C_VATTR 0x1 /* Validity of vattr in inode */
48#define C_FLUSH 0x2 /* used after a flush */
49#define C_DYING 0x4 /* from venus (which died) */
50#define C_PURGE 0x8
51
52int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
53struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
54int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
55struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
56void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
57
58#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e3..2bdbcc11b373 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 20#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 21#include "coda_linux.h"
23 22
24/* initialize the debugging variables */ 23/* initialize the debugging variables */
25int coda_fake_statfs; 24int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 000000000000..9b0c5323890b
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
1/*
2 * Coda File System, Linux Kernel module
3 *
4 * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
5 * Linux modifications (C) 1996, Peter J. Braam
6 * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
7 *
8 * Carnegie Mellon University encourages users of this software to
9 * contribute improvements to the Coda project.
10 */
11
12#ifndef _LINUX_CODA_FS
13#define _LINUX_CODA_FS
14
15#include <linux/kernel.h>
16#include <linux/param.h>
17#include <linux/mm.h>
18#include <linux/vmalloc.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21#include <linux/types.h>
22#include <linux/fs.h>
23#include "coda_fs_i.h"
24
25/* operations */
26extern const struct inode_operations coda_dir_inode_operations;
27extern const struct inode_operations coda_file_inode_operations;
28extern const struct inode_operations coda_ioctl_inode_operations;
29
30extern const struct dentry_operations coda_dentry_operations;
31
32extern const struct address_space_operations coda_file_aops;
33extern const struct address_space_operations coda_symlink_aops;
34
35extern const struct file_operations coda_dir_operations;
36extern const struct file_operations coda_file_operations;
37extern const struct file_operations coda_ioctl_operations;
38
39/* operations shared over more than one file */
40int coda_open(struct inode *i, struct file *f);
41int coda_release(struct inode *i, struct file *f);
42int coda_permission(struct inode *inode, int mask, unsigned int flags);
43int coda_revalidate_inode(struct dentry *);
44int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
45int coda_setattr(struct dentry *, struct iattr *);
46
47/* this file: helpers */
48char *coda_f2s(struct CodaFid *f);
49int coda_isroot(struct inode *i);
50int coda_iscontrol(const char *name, size_t length);
51
52void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
53void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
54unsigned short coda_flags_to_cflags(unsigned short);
55
56/* sysctl.h */
57void coda_sysctl_init(void);
58void coda_sysctl_clean(void);
59
60#define CODA_ALLOC(ptr, cast, size) do { \
61 if (size < PAGE_SIZE) \
62 ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
63 else \
64 ptr = (cast)vmalloc((unsigned long) size); \
65 if (!ptr) \
66 printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
67 else memset( ptr, 0, size ); \
68} while (0)
69
70
71#define CODA_FREE(ptr,size) \
72 do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
73
74/* inode to cnode access functions */
75
76static inline struct coda_inode_info *ITOC(struct inode *inode)
77{
78 return list_entry(inode, struct coda_inode_info, vfs_inode);
79}
80
81static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
82{
83 return &(ITOC(inode)->c_fid);
84}
85
86static __inline__ char *coda_i2s(struct inode *inode)
87{
88 return coda_f2s(&(ITOC(inode)->c_fid));
89}
90
91/* this will not zap the inode away */
92static __inline__ void coda_flag_inode(struct inode *inode, int flag)
93{
94 struct coda_inode_info *cii = ITOC(inode);
95
96 spin_lock(&cii->c_lock);
97 cii->c_flags |= flag;
98 spin_unlock(&cii->c_lock);
99}
100
101#endif
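
The new ITOC() helper above recovers the Coda-private inode info from the embedded VFS inode via list_entry(), which is just the container_of() idiom. A minimal userspace sketch of the same pointer arithmetic (the types here are stand-ins, not the kernel structures):

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel types: struct inode is embedded inside the
 * filesystem's private info struct, exactly as vfs_inode is above. */
struct inode { unsigned long i_ino; };

struct coda_inode_info_sketch {
        unsigned int c_flags;
        struct inode vfs_inode;
};

/* Same arithmetic as ITOC(): step back from the member to its container. */
static struct coda_inode_info_sketch *itoc(struct inode *inode)
{
        return (struct coda_inode_info_sketch *)
                ((char *)inode - offsetof(struct coda_inode_info_sketch, vfs_inode));
}

int main(void)
{
        struct coda_inode_info_sketch cii = { .c_flags = 0x5 };

        printf("%#x\n", itoc(&cii.vfs_inode)->c_flags); /* prints 0x5 */
        return 0;
}
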
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,15 +17,15 @@
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/smp_lock.h> 20#include <linux/spinlock.h>
21#include <linux/namei.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include <linux/coda.h> 25#include <linux/coda.h>
25#include <linux/coda_linux.h>
26#include <linux/coda_psdev.h> 26#include <linux/coda_psdev.h>
27#include <linux/coda_fs_i.h> 27#include "coda_linux.h"
28#include <linux/coda_cache.h> 28#include "coda_cache.h"
29 29
30#include "coda_int.h" 30#include "coda_int.h"
31 31
@@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
47 47
48/* dentry ops */ 48/* dentry ops */
49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); 49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
50static int coda_dentry_delete(struct dentry *); 50static int coda_dentry_delete(const struct dentry *);
51 51
52/* support routines */ 52/* support routines */
53static int coda_venus_readdir(struct file *coda_file, void *buf, 53static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -60,7 +60,7 @@ static int coda_return_EIO(void)
60} 60}
61#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 61#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
62 62
63static const struct dentry_operations coda_dentry_operations = 63const struct dentry_operations coda_dentry_operations =
64{ 64{
65 .d_revalidate = coda_dentry_revalidate, 65 .d_revalidate = coda_dentry_revalidate,
66 .d_delete = coda_dentry_delete, 66 .d_delete = coda_dentry_delete,
@@ -116,21 +116,15 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
116 goto exit; 116 goto exit;
117 } 117 }
118 118
119 lock_kernel();
120
121 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, 119 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
122 &type, &resfid); 120 &type, &resfid);
123 if (!error) 121 if (!error)
124 error = coda_cnode_make(&inode, &resfid, dir->i_sb); 122 error = coda_cnode_make(&inode, &resfid, dir->i_sb);
125 123
126 unlock_kernel();
127
128 if (error && error != -ENOENT) 124 if (error && error != -ENOENT)
129 return ERR_PTR(error); 125 return ERR_PTR(error);
130 126
131exit: 127exit:
132 entry->d_op = &coda_dentry_operations;
133
134 if (inode && (type & CODA_NOCACHE)) 128 if (inode && (type & CODA_NOCACHE))
135 coda_flag_inode(inode, C_VATTR | C_PURGE); 129 coda_flag_inode(inode, C_VATTR | C_PURGE);
136 130
@@ -138,30 +132,29 @@ exit:
138} 132}
139 133
140 134
141int coda_permission(struct inode *inode, int mask) 135int coda_permission(struct inode *inode, int mask, unsigned int flags)
142{ 136{
143 int error = 0; 137 int error;
138
139 if (flags & IPERM_FLAG_RCU)
140 return -ECHILD;
144 141
145 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 142 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
146 143
147 if (!mask) 144 if (!mask)
148 return 0; 145 return 0;
149 146
150 if ((mask & MAY_EXEC) && !execute_ok(inode)) 147 if ((mask & MAY_EXEC) && !execute_ok(inode))
151 return -EACCES; 148 return -EACCES;
152 149
153 lock_kernel();
154
155 if (coda_cache_check(inode, mask)) 150 if (coda_cache_check(inode, mask))
156 goto out; 151 return 0;
157 152
158 error = venus_access(inode->i_sb, coda_i2f(inode), mask); 153 error = venus_access(inode->i_sb, coda_i2f(inode), mask);
159 154
160 if (!error) 155 if (!error)
161 coda_cache_enter(inode, mask); 156 coda_cache_enter(inode, mask);
162 157
163 out:
164 unlock_kernel();
165 return error; 158 return error;
166} 159}
167 160
@@ -200,41 +193,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
200/* creation routines: create, mknod, mkdir, link, symlink */ 193/* creation routines: create, mknod, mkdir, link, symlink */
201static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) 194static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
202{ 195{
203 int error=0; 196 int error;
204 const char *name=de->d_name.name; 197 const char *name=de->d_name.name;
205 int length=de->d_name.len; 198 int length=de->d_name.len;
206 struct inode *inode; 199 struct inode *inode;
207 struct CodaFid newfid; 200 struct CodaFid newfid;
208 struct coda_vattr attrs; 201 struct coda_vattr attrs;
209 202
210 lock_kernel(); 203 if (coda_isroot(dir) && coda_iscontrol(name, length))
211
212 if (coda_isroot(dir) && coda_iscontrol(name, length)) {
213 unlock_kernel();
214 return -EPERM; 204 return -EPERM;
215 }
216 205
217 error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 206 error = venus_create(dir->i_sb, coda_i2f(dir), name, length,
218 0, mode, &newfid, &attrs); 207 0, mode, &newfid, &attrs);
219 208 if (error)
220 if ( error ) { 209 goto err_out;
221 unlock_kernel();
222 d_drop(de);
223 return error;
224 }
225 210
226 inode = coda_iget(dir->i_sb, &newfid, &attrs); 211 inode = coda_iget(dir->i_sb, &newfid, &attrs);
227 if ( IS_ERR(inode) ) { 212 if (IS_ERR(inode)) {
228 unlock_kernel(); 213 error = PTR_ERR(inode);
229 d_drop(de); 214 goto err_out;
230 return PTR_ERR(inode);
231 } 215 }
232 216
233 /* invalidate the directory cnode's attributes */ 217 /* invalidate the directory cnode's attributes */
234 coda_dir_update_mtime(dir); 218 coda_dir_update_mtime(dir);
235 unlock_kernel();
236 d_instantiate(de, inode); 219 d_instantiate(de, inode);
237 return 0; 220 return 0;
221err_out:
222 d_drop(de);
223 return error;
238} 224}
239 225
240static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) 226static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +232,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
246 int error; 232 int error;
247 struct CodaFid newfid; 233 struct CodaFid newfid;
248 234
249 lock_kernel(); 235 if (coda_isroot(dir) && coda_iscontrol(name, len))
250
251 if (coda_isroot(dir) && coda_iscontrol(name, len)) {
252 unlock_kernel();
253 return -EPERM; 236 return -EPERM;
254 }
255 237
256 attrs.va_mode = mode; 238 attrs.va_mode = mode;
257 error = venus_mkdir(dir->i_sb, coda_i2f(dir), 239 error = venus_mkdir(dir->i_sb, coda_i2f(dir),
258 name, len, &newfid, &attrs); 240 name, len, &newfid, &attrs);
259 241 if (error)
260 if ( error ) { 242 goto err_out;
261 unlock_kernel();
262 d_drop(de);
263 return error;
264 }
265 243
266 inode = coda_iget(dir->i_sb, &newfid, &attrs); 244 inode = coda_iget(dir->i_sb, &newfid, &attrs);
267 if ( IS_ERR(inode) ) { 245 if (IS_ERR(inode)) {
268 unlock_kernel(); 246 error = PTR_ERR(inode);
269 d_drop(de); 247 goto err_out;
270 return PTR_ERR(inode);
271 } 248 }
272 249
273 /* invalidate the directory cnode's attributes */ 250 /* invalidate the directory cnode's attributes */
274 coda_dir_inc_nlink(dir); 251 coda_dir_inc_nlink(dir);
275 coda_dir_update_mtime(dir); 252 coda_dir_update_mtime(dir);
276 unlock_kernel();
277 d_instantiate(de, inode); 253 d_instantiate(de, inode);
278 return 0; 254 return 0;
255err_out:
256 d_drop(de);
257 return error;
279} 258}
280 259
281/* try to make de an entry in dir_inode linked to source_de */ 260/* try to make de an entry in dir_inode linked to source_de */
@@ -287,52 +266,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
287 int len = de->d_name.len; 266 int len = de->d_name.len;
288 int error; 267 int error;
289 268
290 lock_kernel(); 269 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
291
292 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
293 unlock_kernel();
294 return -EPERM; 270 return -EPERM;
295 }
296 271
297 error = venus_link(dir_inode->i_sb, coda_i2f(inode), 272 error = venus_link(dir_inode->i_sb, coda_i2f(inode),
298 coda_i2f(dir_inode), (const char *)name, len); 273 coda_i2f(dir_inode), (const char *)name, len);
299
300 if (error) { 274 if (error) {
301 d_drop(de); 275 d_drop(de);
302 goto out; 276 return error;
303 } 277 }
304 278
305 coda_dir_update_mtime(dir_inode); 279 coda_dir_update_mtime(dir_inode);
306 atomic_inc(&inode->i_count); 280 ihold(inode);
307 d_instantiate(de, inode); 281 d_instantiate(de, inode);
308 inc_nlink(inode); 282 inc_nlink(inode);
309 283 return 0;
310out:
311 unlock_kernel();
312 return(error);
313} 284}
314 285
315 286
316static int coda_symlink(struct inode *dir_inode, struct dentry *de, 287static int coda_symlink(struct inode *dir_inode, struct dentry *de,
317 const char *symname) 288 const char *symname)
318{ 289{
319 const char *name = de->d_name.name; 290 const char *name = de->d_name.name;
320 int len = de->d_name.len; 291 int len = de->d_name.len;
321 int symlen; 292 int symlen;
322 int error = 0; 293 int error;
323
324 lock_kernel();
325 294
326 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { 295 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
327 unlock_kernel();
328 return -EPERM; 296 return -EPERM;
329 }
330 297
331 symlen = strlen(symname); 298 symlen = strlen(symname);
332 if ( symlen > CODA_MAXPATHLEN ) { 299 if (symlen > CODA_MAXPATHLEN)
333 unlock_kernel();
334 return -ENAMETOOLONG; 300 return -ENAMETOOLONG;
335 }
336 301
337 /* 302 /*
338 * This entry is now negative. Since we do not create 303 * This entry is now negative. Since we do not create
@@ -343,10 +308,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
343 symname, symlen); 308 symname, symlen);
344 309
345 /* mtime is no good anymore */ 310 /* mtime is no good anymore */
346 if ( !error ) 311 if (!error)
347 coda_dir_update_mtime(dir_inode); 312 coda_dir_update_mtime(dir_inode);
348 313
349 unlock_kernel();
350 return error; 314 return error;
351} 315}
352 316
@@ -357,17 +321,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
357 const char *name = de->d_name.name; 321 const char *name = de->d_name.name;
358 int len = de->d_name.len; 322 int len = de->d_name.len;
359 323
360 lock_kernel();
361
362 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); 324 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
363 if ( error ) { 325 if (error)
364 unlock_kernel();
365 return error; 326 return error;
366 }
367 327
368 coda_dir_update_mtime(dir); 328 coda_dir_update_mtime(dir);
369 drop_nlink(de->d_inode); 329 drop_nlink(de->d_inode);
370 unlock_kernel();
371 return 0; 330 return 0;
372} 331}
373 332
@@ -377,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
377 int len = de->d_name.len; 336 int len = de->d_name.len;
378 int error; 337 int error;
379 338
380 lock_kernel();
381
382 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 339 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
383 if (!error) { 340 if (!error) {
384 /* VFS may delete the child */ 341 /* VFS may delete the child */
@@ -389,7 +346,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
389 coda_dir_drop_nlink(dir); 346 coda_dir_drop_nlink(dir);
390 coda_dir_update_mtime(dir); 347 coda_dir_update_mtime(dir);
391 } 348 }
392 unlock_kernel();
393 return error; 349 return error;
394} 350}
395 351
@@ -403,15 +359,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
403 int new_length = new_dentry->d_name.len; 359 int new_length = new_dentry->d_name.len;
404 int error; 360 int error;
405 361
406 lock_kernel();
407
408 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 362 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
409 coda_i2f(new_dir), old_length, new_length, 363 coda_i2f(new_dir), old_length, new_length,
410 (const char *) old_name, (const char *)new_name); 364 (const char *) old_name, (const char *)new_name);
411 365 if (!error) {
412 if ( !error ) { 366 if (new_dentry->d_inode) {
413 if ( new_dentry->d_inode ) { 367 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
414 if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
415 coda_dir_drop_nlink(old_dir); 368 coda_dir_drop_nlink(old_dir);
416 coda_dir_inc_nlink(new_dir); 369 coda_dir_inc_nlink(new_dir);
417 } 370 }
@@ -423,8 +376,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
423 coda_flag_inode(new_dir, C_VATTR); 376 coda_flag_inode(new_dir, C_VATTR);
424 } 377 }
425 } 378 }
426 unlock_kernel();
427
428 return error; 379 return error;
429} 380}
430 381
@@ -591,13 +542,14 @@ out:
591/* called when a cache lookup succeeds */ 542/* called when a cache lookup succeeds */
592static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) 543static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
593{ 544{
594 struct inode *inode = de->d_inode; 545 struct inode *inode;
595 struct coda_inode_info *cii; 546 struct coda_inode_info *cii;
596 547
597 if (!inode) 548 if (nd->flags & LOOKUP_RCU)
598 return 1; 549 return -ECHILD;
599 lock_kernel(); 550
600 if (coda_isroot(inode)) 551 inode = de->d_inode;
552 if (!inode || coda_isroot(inode))
601 goto out; 553 goto out;
602 if (is_bad_inode(inode)) 554 if (is_bad_inode(inode))
603 goto bad; 555 goto bad;
@@ -612,18 +564,17 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
612 if (cii->c_flags & C_FLUSH) 564 if (cii->c_flags & C_FLUSH)
613 coda_flag_inode_children(inode, C_FLUSH); 565 coda_flag_inode_children(inode, C_FLUSH);
614 566
615 if (atomic_read(&de->d_count) > 1) 567 if (de->d_count > 1)
616 /* pretend it's valid, but don't change the flags */ 568 /* pretend it's valid, but don't change the flags */
617 goto out; 569 goto out;
618 570
619 /* clear the flags. */ 571 /* clear the flags. */
572 spin_lock(&cii->c_lock);
620 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 573 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
621 574 spin_unlock(&cii->c_lock);
622bad: 575bad:
623 unlock_kernel();
624 return 0; 576 return 0;
625out: 577out:
626 unlock_kernel();
627 return 1; 578 return 1;
628} 579}
629 580
@@ -631,7 +582,7 @@ out:
631 * This is the callback from dput() when d_count is going to 0. 582 * This is the callback from dput() when d_count is going to 0.
632 * We use this to unhash dentries with bad inodes. 583 * We use this to unhash dentries with bad inodes.
633 */ 584 */
634static int coda_dentry_delete(struct dentry * dentry) 585static int coda_dentry_delete(const struct dentry * dentry)
635{ 586{
636 int flags; 587 int flags;
637 588
@@ -656,20 +607,19 @@ static int coda_dentry_delete(struct dentry * dentry)
656int coda_revalidate_inode(struct dentry *dentry) 607int coda_revalidate_inode(struct dentry *dentry)
657{ 608{
658 struct coda_vattr attr; 609 struct coda_vattr attr;
659 int error = 0; 610 int error;
660 int old_mode; 611 int old_mode;
661 ino_t old_ino; 612 ino_t old_ino;
662 struct inode *inode = dentry->d_inode; 613 struct inode *inode = dentry->d_inode;
663 struct coda_inode_info *cii = ITOC(inode); 614 struct coda_inode_info *cii = ITOC(inode);
664 615
665 lock_kernel(); 616 if (!cii->c_flags)
666 if ( !cii->c_flags ) 617 return 0;
667 goto ok;
668 618
669 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { 619 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
670 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); 620 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
671 if ( error ) 621 if (error)
672 goto return_bad; 622 return -EIO;
673 623
674 /* this inode may be lost if: 624 /* this inode may be lost if:
675 - its ino changed 625 - its ino changed
@@ -688,17 +638,13 @@ int coda_revalidate_inode(struct dentry *dentry)
688 /* the following can happen when a local fid is replaced 638 /* the following can happen when a local fid is replaced
689 with a global one, here we lose and declare the inode bad */ 639 with a global one, here we lose and declare the inode bad */
690 if (inode->i_ino != old_ino) 640 if (inode->i_ino != old_ino)
691 goto return_bad; 641 return -EIO;
692 642
693 coda_flag_inode_children(inode, C_FLUSH); 643 coda_flag_inode_children(inode, C_FLUSH);
644
645 spin_lock(&cii->c_lock);
694 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 646 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
647 spin_unlock(&cii->c_lock);
695 } 648 }
696
697ok:
698 unlock_kernel();
699 return 0; 649 return 0;
700
701return_bad:
702 unlock_kernel();
703 return -EIO;
704} 650}
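
Both coda_permission() and coda_dentry_revalidate() now begin by refusing RCU-walk mode. Returning -ECHILD makes the VFS retry the lookup in ref-walk mode, where sleeping operations such as the Venus upcalls are legal. A schematic of the pattern, with stand-in types and an illustrative LOOKUP_RCU value:

#include <errno.h>

#define LOOKUP_RCU 0x40                 /* illustrative value only */

struct nameidata_sketch { unsigned int flags; };

/* RCU-walk holds no references and may not sleep; any operation that
 * needs a mutex or an upcall must punt back to ref-walk. */
static int revalidate_sketch(struct nameidata_sketch *nd)
{
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;         /* VFS retries in ref-walk mode */

        /* slow path: may block, take locks, talk to Venus, ... */
        return 1;                       /* dentry is valid */
}
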
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..0433057be330 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,16 +15,15 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/cred.h> 16#include <linux/cred.h>
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_linux.h>
25#include <linux/coda_fs_i.h>
26#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
27 25
26#include "coda_linux.h"
28#include "coda_int.h" 27#include "coda_int.h"
29 28
30static ssize_t 29static ssize_t
@@ -109,19 +108,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
109 108
110 coda_inode = coda_file->f_path.dentry->d_inode; 109 coda_inode = coda_file->f_path.dentry->d_inode;
111 host_inode = host_file->f_path.dentry->d_inode; 110 host_inode = host_file->f_path.dentry->d_inode;
111
112 cii = ITOC(coda_inode);
113 spin_lock(&cii->c_lock);
112 coda_file->f_mapping = host_file->f_mapping; 114 coda_file->f_mapping = host_file->f_mapping;
113 if (coda_inode->i_mapping == &coda_inode->i_data) 115 if (coda_inode->i_mapping == &coda_inode->i_data)
114 coda_inode->i_mapping = host_inode->i_mapping; 116 coda_inode->i_mapping = host_inode->i_mapping;
115 117
116 /* only allow additional mmaps as long as userspace isn't changing 118 /* only allow additional mmaps as long as userspace isn't changing
117 * the container file on us! */ 119 * the container file on us! */
118 else if (coda_inode->i_mapping != host_inode->i_mapping) 120 else if (coda_inode->i_mapping != host_inode->i_mapping) {
121 spin_unlock(&cii->c_lock);
119 return -EBUSY; 122 return -EBUSY;
123 }
120 124
121 /* keep track of how often the coda_inode/host_file has been mmapped */ 125 /* keep track of how often the coda_inode/host_file has been mmapped */
122 cii = ITOC(coda_inode);
123 cii->c_mapcount++; 126 cii->c_mapcount++;
124 cfi->cfi_mapcount++; 127 cfi->cfi_mapcount++;
128 spin_unlock(&cii->c_lock);
125 129
126 return host_file->f_op->mmap(host_file, vma); 130 return host_file->f_op->mmap(host_file, vma);
127} 131}
@@ -138,8 +142,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
138 if (!cfi) 142 if (!cfi)
139 return -ENOMEM; 143 return -ENOMEM;
140 144
141 lock_kernel();
142
143 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, 145 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
144 &host_file); 146 &host_file);
145 if (!host_file) 147 if (!host_file)
@@ -147,7 +149,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
147 149
148 if (error) { 150 if (error) {
149 kfree(cfi); 151 kfree(cfi);
150 unlock_kernel();
151 return error; 152 return error;
152 } 153 }
153 154
@@ -159,8 +160,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
159 160
160 BUG_ON(coda_file->private_data != NULL); 161 BUG_ON(coda_file->private_data != NULL);
161 coda_file->private_data = cfi; 162 coda_file->private_data = cfi;
162
163 unlock_kernel();
164 return 0; 163 return 0;
165} 164}
166 165
@@ -171,9 +170,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
171 struct coda_file_info *cfi; 170 struct coda_file_info *cfi;
172 struct coda_inode_info *cii; 171 struct coda_inode_info *cii;
173 struct inode *host_inode; 172 struct inode *host_inode;
174 int err = 0; 173 int err;
175
176 lock_kernel();
177 174
178 cfi = CODA_FTOC(coda_file); 175 cfi = CODA_FTOC(coda_file);
179 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 176 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +182,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
185 cii = ITOC(coda_inode); 182 cii = ITOC(coda_inode);
186 183
187 /* did we mmap this file? */ 184 /* did we mmap this file? */
185 spin_lock(&cii->c_lock);
188 if (coda_inode->i_mapping == &host_inode->i_data) { 186 if (coda_inode->i_mapping == &host_inode->i_data) {
189 cii->c_mapcount -= cfi->cfi_mapcount; 187 cii->c_mapcount -= cfi->cfi_mapcount;
190 if (!cii->c_mapcount) 188 if (!cii->c_mapcount)
191 coda_inode->i_mapping = &coda_inode->i_data; 189 coda_inode->i_mapping = &coda_inode->i_data;
192 } 190 }
191 spin_unlock(&cii->c_lock);
193 192
194 fput(cfi->cfi_container); 193 fput(cfi->cfi_container);
195 kfree(coda_file->private_data); 194 kfree(coda_file->private_data);
196 coda_file->private_data = NULL; 195 coda_file->private_data = NULL;
197 196
198 unlock_kernel();
199
200 /* VFS fput ignores the return value from file_operations->release, so 197 /* VFS fput ignores the return value from file_operations->release, so
201 * there is no use returning an error here */ 198 * there is no use returning an error here */
202 return 0; 199 return 0;
@@ -207,7 +204,7 @@ int coda_fsync(struct file *coda_file, int datasync)
207 struct file *host_file; 204 struct file *host_file;
208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode; 205 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 206 struct coda_file_info *cfi;
210 int err = 0; 207 int err;
211 208
212 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || 209 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
213 S_ISLNK(coda_inode->i_mode))) 210 S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +215,8 @@ int coda_fsync(struct file *coda_file, int datasync)
218 host_file = cfi->cfi_container; 215 host_file = cfi->cfi_container;
219 216
220 err = vfs_fsync(host_file, datasync); 217 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 218 if (!err && !datasync)
222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 219 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
224 unlock_kernel();
225 }
226 220
227 return err; 221 return err;
228} 222}
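
In coda_file_mmap() the mapping comparison and both map counters now sit inside a single c_lock section, so a racing coda_release() cannot observe the check succeeding while the counts are not yet bumped. A compressed pthread rendering of that critical section (a mutex stands in for the spinlock; the names mirror the diff but the types are invented here):

#include <errno.h>
#include <pthread.h>

struct cii_sketch {
        pthread_mutex_t c_lock;         /* spin_lock in the kernel */
        int c_mapcount;
};

/* Check-and-count must be one atomic step; splitting them re-opens the
 * race the patch closes. */
static int mmap_account(struct cii_sketch *cii, int *cfi_mapcount,
                        int mapping_matches)
{
        pthread_mutex_lock(&cii->c_lock);
        if (!mapping_matches) {
                pthread_mutex_unlock(&cii->c_lock);
                return -EBUSY;
        }
        cii->c_mapcount++;
        (*cfi_mapcount)++;
        pthread_mutex_unlock(&cii->c_lock);
        return 0;
}
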
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6526e6f21ecf..871b27715465 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/unistd.h> 17#include <linux/unistd.h>
18#include <linux/smp_lock.h> 18#include <linux/mutex.h>
19#include <linux/spinlock.h>
19#include <linux/file.h> 20#include <linux/file.h>
20#include <linux/vfs.h> 21#include <linux/vfs.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
@@ -27,10 +28,9 @@
27#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
28 29
29#include <linux/coda.h> 30#include <linux/coda.h>
30#include <linux/coda_linux.h>
31#include <linux/coda_psdev.h> 31#include <linux/coda_psdev.h>
32#include <linux/coda_fs_i.h> 32#include "coda_linux.h"
33#include <linux/coda_cache.h> 33#include "coda_cache.h"
34 34
35#include "coda_int.h" 35#include "coda_int.h"
36 36
@@ -44,21 +44,29 @@ static struct kmem_cache * coda_inode_cachep;
44static struct inode *coda_alloc_inode(struct super_block *sb) 44static struct inode *coda_alloc_inode(struct super_block *sb)
45{ 45{
46 struct coda_inode_info *ei; 46 struct coda_inode_info *ei;
47 ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); 47 ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
48 if (!ei) 48 if (!ei)
49 return NULL; 49 return NULL;
50 memset(&ei->c_fid, 0, sizeof(struct CodaFid)); 50 memset(&ei->c_fid, 0, sizeof(struct CodaFid));
51 ei->c_flags = 0; 51 ei->c_flags = 0;
52 ei->c_uid = 0; 52 ei->c_uid = 0;
53 ei->c_cached_perm = 0; 53 ei->c_cached_perm = 0;
54 spin_lock_init(&ei->c_lock);
54 return &ei->vfs_inode; 55 return &ei->vfs_inode;
55} 56}
56 57
57static void coda_destroy_inode(struct inode *inode) 58static void coda_i_callback(struct rcu_head *head)
58{ 59{
60 struct inode *inode = container_of(head, struct inode, i_rcu);
61 INIT_LIST_HEAD(&inode->i_dentry);
59 kmem_cache_free(coda_inode_cachep, ITOC(inode)); 62 kmem_cache_free(coda_inode_cachep, ITOC(inode));
60} 63}
61 64
65static void coda_destroy_inode(struct inode *inode)
66{
67 call_rcu(&inode->i_rcu, coda_i_callback);
68}
69
62static void init_once(void *foo) 70static void init_once(void *foo)
63{ 71{
64 struct coda_inode_info *ei = (struct coda_inode_info *) foo; 72 struct coda_inode_info *ei = (struct coda_inode_info *) foo;
@@ -143,7 +151,7 @@ static int get_device_index(struct coda_mount_data *data)
143static int coda_fill_super(struct super_block *sb, void *data, int silent) 151static int coda_fill_super(struct super_block *sb, void *data, int silent)
144{ 152{
145 struct inode *root = NULL; 153 struct inode *root = NULL;
146 struct venus_comm *vc = NULL; 154 struct venus_comm *vc;
147 struct CodaFid fid; 155 struct CodaFid fid;
148 int error; 156 int error;
149 int idx; 157 int idx;
@@ -157,21 +165,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
157 printk(KERN_INFO "coda_read_super: device index: %i\n", idx); 165 printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
158 166
159 vc = &coda_comms[idx]; 167 vc = &coda_comms[idx];
168 mutex_lock(&vc->vc_mutex);
169
160 if (!vc->vc_inuse) { 170 if (!vc->vc_inuse) {
161 printk("coda_read_super: No pseudo device\n"); 171 printk("coda_read_super: No pseudo device\n");
162 return -EINVAL; 172 error = -EINVAL;
173 goto unlock_out;
163 } 174 }
164 175
165 if ( vc->vc_sb ) { 176 if (vc->vc_sb) {
166 printk("coda_read_super: Device already mounted\n"); 177 printk("coda_read_super: Device already mounted\n");
167 return -EBUSY; 178 error = -EBUSY;
179 goto unlock_out;
168 } 180 }
169 181
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 182 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error) 183 if (error)
172 goto bdi_err; 184 goto unlock_out;
173 185
174 vc->vc_sb = sb; 186 vc->vc_sb = sb;
187 mutex_unlock(&vc->vc_mutex);
175 188
176 sb->s_fs_info = vc; 189 sb->s_fs_info = vc;
177 sb->s_flags |= MS_NOATIME; 190 sb->s_flags |= MS_NOATIME;
@@ -179,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
179 sb->s_blocksize_bits = 12; 192 sb->s_blocksize_bits = 12;
180 sb->s_magic = CODA_SUPER_MAGIC; 193 sb->s_magic = CODA_SUPER_MAGIC;
181 sb->s_op = &coda_super_operations; 194 sb->s_op = &coda_super_operations;
195 sb->s_d_op = &coda_dentry_operations;
182 sb->s_bdi = &vc->bdi; 196 sb->s_bdi = &vc->bdi;
183 197
184 /* get root fid from Venus: this needs the root inode */ 198 /* get root fid from Venus: this needs the root inode */
@@ -200,26 +214,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
200 printk("coda_read_super: rootinode is %ld dev %s\n", 214 printk("coda_read_super: rootinode is %ld dev %s\n",
201 root->i_ino, root->i_sb->s_id); 215 root->i_ino, root->i_sb->s_id);
202 sb->s_root = d_alloc_root(root); 216 sb->s_root = d_alloc_root(root);
203 if (!sb->s_root) 217 if (!sb->s_root) {
218 error = -EINVAL;
204 goto error; 219 goto error;
205 return 0; 220 }
221 return 0;
206 222
207 error: 223error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
210 if (root) 224 if (root)
211 iput(root); 225 iput(root);
212 if (vc)
213 vc->vc_sb = NULL;
214 226
215 return -EINVAL; 227 mutex_lock(&vc->vc_mutex);
228 bdi_destroy(&vc->bdi);
229 vc->vc_sb = NULL;
230 sb->s_fs_info = NULL;
231unlock_out:
232 mutex_unlock(&vc->vc_mutex);
233 return error;
216} 234}
217 235
218static void coda_put_super(struct super_block *sb) 236static void coda_put_super(struct super_block *sb)
219{ 237{
220 bdi_destroy(&coda_vcp(sb)->bdi); 238 struct venus_comm *vcp = coda_vcp(sb);
221 coda_vcp(sb)->vc_sb = NULL; 239 mutex_lock(&vcp->vc_mutex);
240 bdi_destroy(&vcp->bdi);
241 vcp->vc_sb = NULL;
222 sb->s_fs_info = NULL; 242 sb->s_fs_info = NULL;
243 mutex_unlock(&vcp->vc_mutex);
223 244
224 printk("Coda: Bye bye.\n"); 245 printk("Coda: Bye bye.\n");
225} 246}
@@ -245,8 +266,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
245 struct coda_vattr vattr; 266 struct coda_vattr vattr;
246 int error; 267 int error;
247 268
248 lock_kernel();
249
250 memset(&vattr, 0, sizeof(vattr)); 269 memset(&vattr, 0, sizeof(vattr));
251 270
252 inode->i_ctime = CURRENT_TIME_SEC; 271 inode->i_ctime = CURRENT_TIME_SEC;
@@ -256,13 +275,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
256 /* Venus is responsible for truncating the container-file!!! */ 275 /* Venus is responsible for truncating the container-file!!! */
257 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); 276 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
258 277
259 if ( !error ) { 278 if (!error) {
260 coda_vattr_to_iattr(inode, &vattr); 279 coda_vattr_to_iattr(inode, &vattr);
261 coda_cache_clear_inode(inode); 280 coda_cache_clear_inode(inode);
262 } 281 }
263
264 unlock_kernel();
265
266 return error; 282 return error;
267} 283}
268 284
@@ -276,12 +292,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
276{ 292{
277 int error; 293 int error;
278 294
279 lock_kernel();
280
281 error = venus_statfs(dentry, buf); 295 error = venus_statfs(dentry, buf);
282 296
283 unlock_kernel();
284
285 if (error) { 297 if (error) {
286 /* fake something like AFS does */ 298 /* fake something like AFS does */
287 buf->f_blocks = 9000000; 299 buf->f_blocks = 9000000;
@@ -301,16 +313,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
301 313
302/* init_coda: used by filesystems.c to register coda */ 314/* init_coda: used by filesystems.c to register coda */
303 315
304static int coda_get_sb(struct file_system_type *fs_type, 316static struct dentry *coda_mount(struct file_system_type *fs_type,
305 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 317 int flags, const char *dev_name, void *data)
306{ 318{
307 return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); 319 return mount_nodev(fs_type, flags, data, coda_fill_super);
308} 320}
309 321
310struct file_system_type coda_fs_type = { 322struct file_system_type coda_fs_type = {
311 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
312 .name = "coda", 324 .name = "coda",
313 .get_sb = coda_get_sb, 325 .mount = coda_mount,
314 .kill_sb = kill_anon_super, 326 .kill_sb = kill_anon_super,
315 .fs_flags = FS_BINARY_MOUNTDATA, 327 .fs_flags = FS_BINARY_MOUNTDATA,
316}; 328};
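
coda_fill_super() and coda_put_super() now serialize on vc_mutex, turning "is the device in use / already mounted / claim it" into one atomic decision instead of three checks that used to rely on the BKL. The shape of that claim as a pthread sketch (field names follow the diff; the types are stand-ins):

#include <errno.h>
#include <pthread.h>

struct venus_comm_sketch {
        pthread_mutex_t vc_mutex;
        int vc_inuse;
        void *vc_sb;                    /* non-NULL while mounted */
};

static int claim_device(struct venus_comm_sketch *vc, void *sb)
{
        int error = 0;

        pthread_mutex_lock(&vc->vc_mutex);
        if (!vc->vc_inuse)
                error = -EINVAL;        /* no pseudo device */
        else if (vc->vc_sb)
                error = -EBUSY;         /* already mounted */
        else
                vc->vc_sb = sb;         /* claim it while still locked */
        pthread_mutex_unlock(&vc->vc_mutex);
        return error;
}
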
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c9..6cbb3afb36dc 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,14 +19,12 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
25 23
26#include <linux/smp_lock.h> 24#include "coda_linux.h"
27 25
28/* pioctl ops */ 26/* pioctl ops */
29static int coda_ioctl_permission(struct inode *inode, int mask); 27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
30static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
31 unsigned long user_data); 29 unsigned long user_data);
32 30
@@ -39,11 +37,14 @@ const struct inode_operations coda_ioctl_inode_operations = {
39const struct file_operations coda_ioctl_operations = { 37const struct file_operations coda_ioctl_operations = {
40 .owner = THIS_MODULE, 38 .owner = THIS_MODULE,
41 .unlocked_ioctl = coda_pioctl, 39 .unlocked_ioctl = coda_pioctl,
40 .llseek = noop_llseek,
42}; 41};
43 42
44/* the coda pioctl inode ops */ 43/* the coda pioctl inode ops */
45static int coda_ioctl_permission(struct inode *inode, int mask) 44static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
46{ 45{
46 if (flags & IPERM_FLAG_RCU)
47 return -ECHILD;
47 return (mask & MAY_EXEC) ? -EACCES : 0; 48 return (mask & MAY_EXEC) ? -EACCES : 0;
48} 49}
49 50
@@ -57,13 +58,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
57 struct inode *target_inode = NULL; 58 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp; 59 struct coda_inode_info *cnp;
59 60
60 lock_kernel();
61
62 /* get the Pioctl data arguments from user space */ 61 /* get the Pioctl data arguments from user space */
63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 62 if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
64 error = -EINVAL; 63 return -EINVAL;
65 goto out;
66 }
67 64
68 /* 65 /*
69 * Look up the pathname. Note that the pathname is in 66 * Look up the pathname. Note that the pathname is in
@@ -75,13 +72,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
75 error = user_lpath(data.path, &path); 72 error = user_lpath(data.path, &path);
76 73
77 if (error) 74 if (error)
78 goto out; 75 return error;
79 else 76
80 target_inode = path.dentry->d_inode; 77 target_inode = path.dentry->d_inode;
81 78
82 /* return if it is not a Coda inode */ 79 /* return if it is not a Coda inode */
83 if (target_inode->i_sb != inode->i_sb) { 80 if (target_inode->i_sb != inode->i_sb) {
84 path_put(&path);
85 error = -EINVAL; 81 error = -EINVAL;
86 goto out; 82 goto out;
87 } 83 }
@@ -90,10 +86,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
90 cnp = ITOC(target_inode); 86 cnp = ITOC(target_inode);
91 87
92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
93
94 path_put(&path);
95
96out: 89out:
97 unlock_kernel(); 90 path_put(&path);
98 return error; 91 return error;
99} 92}
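
The pioctl rewrite funnels every post-lookup failure through the single "out:" label so path_put() runs exactly once on all paths, instead of being repeated before each early return. The same single-exit shape in plain C (free() stands in for path_put()):

#include <errno.h>
#include <stdlib.h>

static int pioctl_shape(int bad_inode)
{
        void *ref = malloc(1);          /* the looked-up path reference */
        int error = 0;

        if (!ref)
                return -ENOMEM;         /* nothing acquired yet */

        if (bad_inode) {
                error = -EINVAL;
                goto out;               /* cleanup is shared below */
        }
        /* ... the actual venus_pioctl() work would go here ... */
out:
        free(ref);                      /* path_put() in the kernel code */
        return error;
}
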
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 116af7546cf0..8f616e0e252c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
35#include <linux/poll.h> 35#include <linux/poll.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/smp_lock.h> 38#include <linux/mutex.h>
39#include <linux/device.h> 39#include <linux/device.h>
40#include <asm/io.h> 40#include <asm/io.h>
41#include <asm/system.h> 41#include <asm/system.h>
@@ -43,10 +43,10 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_linux.h>
47#include <linux/coda_fs_i.h>
48#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
49 47
48#include "coda_linux.h"
49
50#include "coda_int.h" 50#include "coda_int.h"
51 51
52/* statistics */ 52/* statistics */
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
67 unsigned int mask = POLLOUT | POLLWRNORM; 67 unsigned int mask = POLLOUT | POLLWRNORM;
68 68
69 poll_wait(file, &vcp->vc_waitq, wait); 69 poll_wait(file, &vcp->vc_waitq, wait);
70 mutex_lock(&vcp->vc_mutex);
70 if (!list_empty(&vcp->vc_pending)) 71 if (!list_empty(&vcp->vc_pending))
71 mask |= POLLIN | POLLRDNORM; 72 mask |= POLLIN | POLLRDNORM;
73 mutex_unlock(&vcp->vc_mutex);
72 74
73 return mask; 75 return mask;
74} 76}
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
108 return -EFAULT; 110 return -EFAULT;
109 111
110 if (DOWNCALL(hdr.opcode)) { 112 if (DOWNCALL(hdr.opcode)) {
111 struct super_block *sb = NULL; 113 union outputArgs *dcbuf;
112 union outputArgs *dcbuf;
113 int size = sizeof(*dcbuf); 114 int size = sizeof(*dcbuf);
114 115
115 sb = vcp->vc_sb;
116 if ( !sb ) {
117 count = nbytes;
118 goto out;
119 }
120
121 if ( nbytes < sizeof(struct coda_out_hdr) ) { 116 if ( nbytes < sizeof(struct coda_out_hdr) ) {
122 printk("coda_downcall opc %d uniq %d, not enough!\n", 117 printk("coda_downcall opc %d uniq %d, not enough!\n",
123 hdr.opcode, hdr.unique); 118 hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
137 } 132 }
138 133
139 /* what downcall errors does Venus handle ? */ 134 /* what downcall errors does Venus handle ? */
140 lock_kernel(); 135 error = coda_downcall(vcp, hdr.opcode, dcbuf);
141 error = coda_downcall(hdr.opcode, dcbuf, sb);
142 unlock_kernel();
143 136
144 CODA_FREE(dcbuf, nbytes); 137 CODA_FREE(dcbuf, nbytes);
145 if (error) { 138 if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
152 } 145 }
153 146
154 /* Look for the message on the processing queue. */ 147 /* Look for the message on the processing queue. */
155 lock_kernel(); 148 mutex_lock(&vcp->vc_mutex);
156 list_for_each(lh, &vcp->vc_processing) { 149 list_for_each(lh, &vcp->vc_processing) {
157 tmp = list_entry(lh, struct upc_req , uc_chain); 150 tmp = list_entry(lh, struct upc_req , uc_chain);
158 if (tmp->uc_unique == hdr.unique) { 151 if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
161 break; 154 break;
162 } 155 }
163 } 156 }
164 unlock_kernel(); 157 mutex_unlock(&vcp->vc_mutex);
165 158
166 if (!req) { 159 if (!req) {
167 printk("psdev_write: msg (%d, %d) not found\n", 160 printk("psdev_write: msg (%d, %d) not found\n",
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
216 if (nbytes == 0) 209 if (nbytes == 0)
217 return 0; 210 return 0;
218 211
219 lock_kernel(); 212 mutex_lock(&vcp->vc_mutex);
220 213
221 add_wait_queue(&vcp->vc_waitq, &wait); 214 add_wait_queue(&vcp->vc_waitq, &wait);
222 set_current_state(TASK_INTERRUPTIBLE); 215 set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
230 retval = -ERESTARTSYS; 223 retval = -ERESTARTSYS;
231 break; 224 break;
232 } 225 }
226 mutex_unlock(&vcp->vc_mutex);
233 schedule(); 227 schedule();
228 mutex_lock(&vcp->vc_mutex);
234 } 229 }
235 230
236 set_current_state(TASK_RUNNING); 231 set_current_state(TASK_RUNNING);
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
263 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); 258 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
264 kfree(req); 259 kfree(req);
265out: 260out:
266 unlock_kernel(); 261 mutex_unlock(&vcp->vc_mutex);
267 return (count ? count : retval); 262 return (count ? count : retval);
268} 263}
269 264
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
276 if (idx < 0 || idx >= MAX_CODADEVS) 271 if (idx < 0 || idx >= MAX_CODADEVS)
277 return -ENODEV; 272 return -ENODEV;
278 273
279 lock_kernel();
280
281 err = -EBUSY; 274 err = -EBUSY;
282 vcp = &coda_comms[idx]; 275 vcp = &coda_comms[idx];
276 mutex_lock(&vcp->vc_mutex);
277
283 if (!vcp->vc_inuse) { 278 if (!vcp->vc_inuse) {
284 vcp->vc_inuse++; 279 vcp->vc_inuse++;
285 280
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
293 err = 0; 288 err = 0;
294 } 289 }
295 290
296 unlock_kernel(); 291 mutex_unlock(&vcp->vc_mutex);
297 return err; 292 return err;
298} 293}
299 294
@@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
308 return -1; 303 return -1;
309 } 304 }
310 305
311 lock_kernel(); 306 mutex_lock(&vcp->vc_mutex);
312 307
313 /* Wakeup clients so they can return. */ 308 /* Wakeup clients so they can return. */
314 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { 309 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
333 328
334 file->private_data = NULL; 329 file->private_data = NULL;
335 vcp->vc_inuse--; 330 vcp->vc_inuse--;
336 unlock_kernel(); 331 mutex_unlock(&vcp->vc_mutex);
337 return 0; 332 return 0;
338} 333}
339 334
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
346 .unlocked_ioctl = coda_psdev_ioctl, 341 .unlocked_ioctl = coda_psdev_ioctl,
347 .open = coda_psdev_open, 342 .open = coda_psdev_open,
348 .release = coda_psdev_release, 343 .release = coda_psdev_release,
344 .llseek = noop_llseek,
349}; 345};
350 346
351static int init_coda_psdev(void) 347static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
361 err = PTR_ERR(coda_psdev_class); 357 err = PTR_ERR(coda_psdev_class);
362 goto out_chrdev; 358 goto out_chrdev;
363 } 359 }
364 for (i = 0; i < MAX_CODADEVS; i++) 360 for (i = 0; i < MAX_CODADEVS; i++) {
361 mutex_init(&(&coda_comms[i])->vc_mutex);
365 device_create(coda_psdev_class, NULL, 362 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); 363 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
364 }
367 coda_sysctl_init(); 365 coda_sysctl_init();
368 goto out; 366 goto out;
369 367
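
coda_psdev_read() now holds vc_mutex across the wait loop but drops it around schedule(), since sleeping with the mutex held would deadlock the writer that is supposed to wake the reader. That unlock/sleep/relock dance is exactly what a condition-variable wait packages into one call; a minimal pthread analogue:

#include <pthread.h>

struct psdev_sketch {
        pthread_mutex_t lock;           /* plays the role of vc_mutex */
        pthread_cond_t nonempty;        /* plays the role of vc_waitq */
        int pending;                    /* queued upcall requests */
};

static void wait_for_request(struct psdev_sketch *q)
{
        pthread_mutex_lock(&q->lock);
        while (!q->pending)
                pthread_cond_wait(&q->nonempty, &q->lock); /* drops lock while asleep */
        q->pending--;                   /* consume one request, still locked */
        pthread_mutex_unlock(&q->lock);
}
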
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..ab94ef63caef 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,12 +14,11 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18 17
19#include <linux/coda.h> 18#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 19#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 20
21#include "coda_linux.h"
23 22
24static int coda_symlink_filler(struct file *file, struct page *page) 23static int coda_symlink_filler(struct file *file, struct page *page)
25{ 24{
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
29 unsigned int len = PAGE_SIZE; 28 unsigned int len = PAGE_SIZE;
30 char *p = kmap(page); 29 char *p = kmap(page);
31 30
32 lock_kernel();
33 cii = ITOC(inode); 31 cii = ITOC(inode);
34 32
35 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); 33 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
36 unlock_kernel();
37 if (error) 34 if (error)
38 goto fail; 35 goto fail;
39 SetPageUptodate(page); 36 SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..9727e0c52579 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,15 +27,15 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33 34
34#include <linux/coda.h> 35#include <linux/coda.h>
35#include <linux/coda_linux.h>
36#include <linux/coda_psdev.h> 36#include <linux/coda_psdev.h>
37#include <linux/coda_fs_i.h> 37#include "coda_linux.h"
38#include <linux/coda_cache.h> 38#include "coda_cache.h"
39 39
40#include "coda_int.h" 40#include "coda_int.h"
41 41
@@ -606,7 +606,8 @@ static void coda_unblock_signals(sigset_t *old)
606 (r)->uc_opcode != CODA_RELEASE) || \ 606 (r)->uc_opcode != CODA_RELEASE) || \
607 (r)->uc_flags & CODA_REQ_READ)) 607 (r)->uc_flags & CODA_REQ_READ))
608 608
609static inline void coda_waitfor_upcall(struct upc_req *req) 609static inline void coda_waitfor_upcall(struct venus_comm *vcp,
610 struct upc_req *req)
610{ 611{
611 DECLARE_WAITQUEUE(wait, current); 612 DECLARE_WAITQUEUE(wait, current);
612 unsigned long timeout = jiffies + coda_timeout * HZ; 613 unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -639,10 +640,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
639 break; 640 break;
640 } 641 }
641 642
643 mutex_unlock(&vcp->vc_mutex);
642 if (blocked) 644 if (blocked)
643 schedule_timeout(HZ); 645 schedule_timeout(HZ);
644 else 646 else
645 schedule(); 647 schedule();
648 mutex_lock(&vcp->vc_mutex);
646 } 649 }
647 if (blocked) 650 if (blocked)
648 coda_unblock_signals(&old); 651 coda_unblock_signals(&old);
@@ -667,18 +670,23 @@ static int coda_upcall(struct venus_comm *vcp,
667{ 670{
668 union outputArgs *out; 671 union outputArgs *out;
669 union inputArgs *sig_inputArgs; 672 union inputArgs *sig_inputArgs;
670 struct upc_req *req, *sig_req; 673 struct upc_req *req = NULL, *sig_req;
671 int error = 0; 674 int error;
675
676 mutex_lock(&vcp->vc_mutex);
672 677
673 if (!vcp->vc_inuse) { 678 if (!vcp->vc_inuse) {
674 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); 679 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
675 return -ENXIO; 680 error = -ENXIO;
681 goto exit;
676 } 682 }
677 683
678 /* Format the request message. */ 684 /* Format the request message. */
679 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); 685 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
680 if (!req) 686 if (!req) {
681 return -ENOMEM; 687 error = -ENOMEM;
688 goto exit;
689 }
682 690
683 req->uc_data = (void *)buffer; 691 req->uc_data = (void *)buffer;
684 req->uc_flags = 0; 692 req->uc_flags = 0;
@@ -705,7 +713,7 @@ static int coda_upcall(struct venus_comm *vcp,
705 * ENODEV. */ 713 * ENODEV. */
706 714
707 /* Go to sleep. Wake up on signals only after the timeout. */ 715 /* Go to sleep. Wake up on signals only after the timeout. */
708 coda_waitfor_upcall(req); 716 coda_waitfor_upcall(vcp, req);
709 717
710 /* Op went through, interrupt or not... */ 718 /* Op went through, interrupt or not... */
711 if (req->uc_flags & CODA_REQ_WRITE) { 719 if (req->uc_flags & CODA_REQ_WRITE) {
@@ -759,6 +767,7 @@ static int coda_upcall(struct venus_comm *vcp,
759 767
760exit: 768exit:
761 kfree(req); 769 kfree(req);
770 mutex_unlock(&vcp->vc_mutex);
762 return error; 771 return error;
763} 772}
764 773
@@ -796,21 +805,24 @@ exit:
796 * 805 *
797 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ 806 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
798 807
799int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb) 808int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
800{ 809{
801 struct inode *inode = NULL; 810 struct inode *inode = NULL;
802 struct CodaFid *fid, *newfid; 811 struct CodaFid *fid = NULL, *newfid;
812 struct super_block *sb;
803 813
804 /* Handle invalidation requests. */ 814 /* Handle invalidation requests. */
805 if ( !sb || !sb->s_root) 815 mutex_lock(&vcp->vc_mutex);
806 return 0; 816 sb = vcp->vc_sb;
817 if (!sb || !sb->s_root)
818 goto unlock_out;
807 819
808 switch (opcode) { 820 switch (opcode) {
809 case CODA_FLUSH: 821 case CODA_FLUSH:
810 coda_cache_clear_all(sb); 822 coda_cache_clear_all(sb);
811 shrink_dcache_sb(sb); 823 shrink_dcache_sb(sb);
812 if (sb->s_root->d_inode) 824 if (sb->s_root->d_inode)
813 coda_flag_inode(sb->s_root->d_inode, C_FLUSH); 825 coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
814 break; 826 break;
815 827
816 case CODA_PURGEUSER: 828 case CODA_PURGEUSER:
@@ -819,45 +831,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
819 831
820 case CODA_ZAPDIR: 832 case CODA_ZAPDIR:
821 fid = &out->coda_zapdir.CodaFid; 833 fid = &out->coda_zapdir.CodaFid;
822 inode = coda_fid_to_inode(fid, sb);
823 if (inode) {
824 coda_flag_inode_children(inode, C_PURGE);
825 coda_flag_inode(inode, C_VATTR);
826 }
827 break; 834 break;
828 835
829 case CODA_ZAPFILE: 836 case CODA_ZAPFILE:
830 fid = &out->coda_zapfile.CodaFid; 837 fid = &out->coda_zapfile.CodaFid;
831 inode = coda_fid_to_inode(fid, sb);
832 if (inode)
833 coda_flag_inode(inode, C_VATTR);
834 break; 838 break;
835 839
836 case CODA_PURGEFID: 840 case CODA_PURGEFID:
837 fid = &out->coda_purgefid.CodaFid; 841 fid = &out->coda_purgefid.CodaFid;
842 break;
843
844 case CODA_REPLACE:
845 fid = &out->coda_replace.OldFid;
846 break;
847 }
848 if (fid)
838 inode = coda_fid_to_inode(fid, sb); 849 inode = coda_fid_to_inode(fid, sb);
839 if (inode) {
840 coda_flag_inode_children(inode, C_PURGE);
841 850
842 /* catch the dentries later if some are still busy */ 851unlock_out:
843 coda_flag_inode(inode, C_PURGE); 852 mutex_unlock(&vcp->vc_mutex);
844 d_prune_aliases(inode);
845 853
846 } 854 if (!inode)
855 return 0;
856
857 switch (opcode) {
858 case CODA_ZAPDIR:
859 coda_flag_inode_children(inode, C_PURGE);
860 coda_flag_inode(inode, C_VATTR);
861 break;
862
863 case CODA_ZAPFILE:
864 coda_flag_inode(inode, C_VATTR);
865 break;
866
867 case CODA_PURGEFID:
868 coda_flag_inode_children(inode, C_PURGE);
869
870 /* catch the dentries later if some are still busy */
871 coda_flag_inode(inode, C_PURGE);
872 d_prune_aliases(inode);
847 break; 873 break;
848 874
849 case CODA_REPLACE: 875 case CODA_REPLACE:
850 fid = &out->coda_replace.OldFid;
851 newfid = &out->coda_replace.NewFid; 876 newfid = &out->coda_replace.NewFid;
852 inode = coda_fid_to_inode(fid, sb); 877 coda_replace_fid(inode, fid, newfid);
853 if (inode)
854 coda_replace_fid(inode, fid, newfid);
855 break; 878 break;
856 } 879 }
857 880 iput(inode);
858 if (inode)
859 iput(inode);
860
861 return 0; 881 return 0;
862} 882}
863 883
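
The reworked coda_downcall() is two-phase: the fid-to-inode translation happens while vc_mutex pins vc_sb, and the flagging, d_prune_aliases() and iput() (which may sleep or take other locks) happen only after the mutex is dropped. Condensed control flow, as a kernel-style sketch (compilable only in-tree; names as in the diff):

/* Condensed from the new coda_downcall() above. */
static int downcall_shape(struct venus_comm *vcp, struct CodaFid *fid)
{
        struct inode *inode = NULL;
        struct super_block *sb;

        mutex_lock(&vcp->vc_mutex);
        sb = vcp->vc_sb;
        if (sb && sb->s_root && fid)
                inode = coda_fid_to_inode(fid, sb);     /* needs sb pinned */
        mutex_unlock(&vcp->vc_mutex);

        if (!inode)
                return 0;
        coda_flag_inode(inode, C_PURGE);        /* takes c_lock internally */
        d_prune_aliases(inode);                 /* no vc_mutex held: safe */
        iput(inode);                            /* may sleep */
        return 0;
}
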
diff --git a/fs/compat.c b/fs/compat.c
index 0644a154672b..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -29,8 +29,6 @@
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/ioctl.h> 30#include <linux/ioctl.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/smb.h>
33#include <linux/smb_mount.h>
34#include <linux/ncp_mount.h> 32#include <linux/ncp_mount.h>
35#include <linux/nfs4_mount.h> 33#include <linux/nfs4_mount.h>
36#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -51,6 +49,7 @@
51#include <linux/eventpoll.h> 49#include <linux/eventpoll.h>
52#include <linux/fs_struct.h> 50#include <linux/fs_struct.h>
53#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/pagemap.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -258,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
258} 257}
259 258
260/* 259/*
261 * The following statfs calls are copies of code from fs/open.c and 260 * The following statfs calls are copies of code from fs/statfs.c and
262 * should be checked against those from time to time 261 * should be checked against those from time to time
263 */ 262 */
264asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -321,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
321 __put_user(kbuf->f_namelen, &ubuf->f_namelen) || 320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
322 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || 321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
323 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || 322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
324 __put_user(kbuf->f_frsize, &ubuf->f_frsize)) 323 __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
324 __put_user(kbuf->f_flags, &ubuf->f_flags) ||
325 __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
325 return -EFAULT; 326 return -EFAULT;
326 return 0; 327 return 0;
327} 328}
@@ -598,24 +599,22 @@ ssize_t compat_rw_copy_check_uvector(int type,
598 if (nr_segs > fast_segs) { 599 if (nr_segs > fast_segs) {
599 ret = -ENOMEM; 600 ret = -ENOMEM;
600 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 601 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
601 if (iov == NULL) { 602 if (iov == NULL)
602 *ret_pointer = fast_pointer;
603 goto out; 603 goto out;
604 }
605 } 604 }
606 *ret_pointer = iov; 605 *ret_pointer = iov;
607 606
608 /* 607 /*
609 * Single unix specification: 608 * Single unix specification:
610 * We should -EINVAL if an element length is not >= 0 and fitting an 609 * We should -EINVAL if an element length is not >= 0 and fitting an
611 * ssize_t. The total length is fitting an ssize_t 610 * ssize_t.
612 * 611 *
613 * Be careful here because iov_len is a size_t not an ssize_t 612 * In Linux, the total length is limited to MAX_RW_COUNT, there is
613 * no overflow possibility.
614 */ 614 */
615 tot_len = 0; 615 tot_len = 0;
616 ret = -EINVAL; 616 ret = -EINVAL;
617 for (seg = 0; seg < nr_segs; seg++) { 617 for (seg = 0; seg < nr_segs; seg++) {
618 compat_ssize_t tmp = tot_len;
619 compat_uptr_t buf; 618 compat_uptr_t buf;
620 compat_ssize_t len; 619 compat_ssize_t len;
621 620
@@ -626,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
626 } 625 }
627 if (len < 0) /* size_t not fitting in compat_ssize_t .. */ 626 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
628 goto out; 627 goto out;
629 tot_len += len;
630 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
631 goto out;
632 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 628 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
633 ret = -EFAULT; 629 ret = -EFAULT;
634 goto out; 630 goto out;
635 } 631 }
632 if (len > MAX_RW_COUNT - tot_len)
633 len = MAX_RW_COUNT - tot_len;
634 tot_len += len;
636 iov->iov_base = compat_ptr(buf); 635 iov->iov_base = compat_ptr(buf);
637 iov->iov_len = (compat_size_t) len; 636 iov->iov_len = (compat_size_t) len;
638 uvector++; 637 uvector++;
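
The compat_rw_copy_check_uvector() change replaces the old "add, then test for signed wraparound" overflow check with a clamp: each segment is truncated so the running total can never pass MAX_RW_COUNT, matching the native rw_copy_check_uvector(). A runnable rendering of just that rule (the cap value here is illustrative, not the kernel's INT_MAX & PAGE_MASK):

#include <stdio.h>

#define MAX_RW_COUNT (1L << 30)         /* illustrative cap */

/* Clamp-then-add: tot_len can reach the cap but never pass it, so no
 * overflow test is needed afterwards. */
static long add_segment(long tot_len, long len)
{
        if (len > MAX_RW_COUNT - tot_len)
                len = MAX_RW_COUNT - tot_len;
        return tot_len + len;
}

int main(void)
{
        long tot = add_segment(0, MAX_RW_COUNT - 10);

        tot = add_segment(tot, 100);            /* clamped to the remaining 10 */
        printf("%d\n", tot == MAX_RW_COUNT);    /* prints 1 */
        return 0;
}
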
@@ -745,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
745 return raw_data; 744 return raw_data;
746} 745}
747 746
748struct compat_smb_mount_data {
749 compat_int_t version;
750 __compat_uid_t mounted_uid;
751 __compat_uid_t uid;
752 __compat_gid_t gid;
753 compat_mode_t file_mode;
754 compat_mode_t dir_mode;
755};
756
757static void *do_smb_super_data_conv(void *raw_data)
758{
759 struct smb_mount_data *s = raw_data;
760 struct compat_smb_mount_data *c_s = raw_data;
761
762 if (c_s->version != SMB_MOUNT_OLDVERSION)
763 goto out;
764 s->dir_mode = c_s->dir_mode;
765 s->file_mode = c_s->file_mode;
766 s->gid = c_s->gid;
767 s->uid = c_s->uid;
768 s->mounted_uid = c_s->mounted_uid;
769 out:
770 return raw_data;
771}
772 747
773struct compat_nfs_string { 748struct compat_nfs_string {
774 compat_uint_t len; 749 compat_uint_t len;
@@ -835,7 +810,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
835 return 0; 810 return 0;
836} 811}
837 812
838#define SMBFS_NAME "smbfs"
839#define NCPFS_NAME "ncpfs" 813#define NCPFS_NAME "ncpfs"
840#define NFS4_NAME "nfs4" 814#define NFS4_NAME "nfs4"
841 815
@@ -870,9 +844,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
870 retval = -EINVAL; 844 retval = -EINVAL;
871 845
872 if (kernel_type && data_page) { 846 if (kernel_type && data_page) {
873 if (!strcmp(kernel_type, SMBFS_NAME)) { 847 if (!strcmp(kernel_type, NCPFS_NAME)) {
874 do_smb_super_data_conv((void *)data_page);
875 } else if (!strcmp(kernel_type, NCPFS_NAME)) {
876 do_ncp_super_data_conv((void *)data_page); 848 do_ncp_super_data_conv((void *)data_page);
877 } else if (!strcmp(kernel_type, NFS4_NAME)) { 849 } else if (!strcmp(kernel_type, NFS4_NAME)) {
878 if (do_nfs4_super_data_conv((void *) data_page)) 850 if (do_nfs4_super_data_conv((void *) data_page))
@@ -1378,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
1378 argv++; 1350 argv++;
1379 if (i++ >= max) 1351 if (i++ >= max)
1380 return -E2BIG; 1352 return -E2BIG;
1353
1354 if (fatal_signal_pending(current))
1355 return -ERESTARTNOHAND;
1356 cond_resched();
1381 } 1357 }
1382 } 1358 }
1383 return i; 1359 return i;
@@ -1419,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1419 while (len > 0) { 1395 while (len > 0) {
1420 int offset, bytes_to_copy; 1396 int offset, bytes_to_copy;
1421 1397
1398 if (fatal_signal_pending(current)) {
1399 ret = -ERESTARTNOHAND;
1400 goto out;
1401 }
1402 cond_resched();
1403
1422 offset = pos % PAGE_SIZE; 1404 offset = pos % PAGE_SIZE;
1423 if (offset == 0) 1405 if (offset == 0)
1424 offset = PAGE_SIZE; 1406 offset = PAGE_SIZE;
@@ -1435,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1435 if (!kmapped_page || kpos != (pos & PAGE_MASK)) { 1417 if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
1436 struct page *page; 1418 struct page *page;
1437 1419
1438#ifdef CONFIG_STACK_GROWSUP 1420 page = get_arg_page(bprm, pos, 1);
1439 ret = expand_stack_downwards(bprm->vma, pos); 1421 if (!page) {
1440 if (ret < 0) {
1441 /* We've exceed the stack rlimit. */
1442 ret = -E2BIG;
1443 goto out;
1444 }
1445#endif
1446 ret = get_user_pages(current, bprm->mm, pos,
1447 1, 1, 1, &page, NULL);
1448 if (ret <= 0) {
1449 /* We've exceed the stack rlimit. */
1450 ret = -E2BIG; 1422 ret = -E2BIG;
1451 goto out; 1423 goto out;
1452 } 1424 }
@@ -1567,8 +1539,10 @@ int compat_do_execve(char * filename,
1567 return retval; 1539 return retval;
1568 1540
1569out: 1541out:
1570 if (bprm->mm) 1542 if (bprm->mm) {
1543 acct_arg_size(bprm, 0);
1571 mmput(bprm->mm); 1544 mmput(bprm->mm);
1545 }
1572 1546
1573out_file: 1547out_file:
1574 if (bprm->file) { 1548 if (bprm->file) {
@@ -1963,7 +1937,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1963} 1937}
1964#endif /* HAVE_SET_RESTORE_SIGMASK */ 1938#endif /* HAVE_SET_RESTORE_SIGMASK */
1965 1939
1966#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) 1940#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
1967/* Stuff for NFS server syscalls... */ 1941/* Stuff for NFS server syscalls... */
1968struct compat_nfsctl_svc { 1942struct compat_nfsctl_svc {
1969 u16 svc32_port; 1943 u16 svc32_port;
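
The iovec-validation hunk at the top of this file's changes replaces a hard failure with a clamp: once the running total reaches MAX_RW_COUNT, later segments are shortened so the request still proceeds. A minimal sketch of that loop, reusing the variable names from the hunk (an illustration, not the exact kernel function):

/*
 * Clamp the total size of a compat iovec to MAX_RW_COUNT by
 * shortening later segments instead of rejecting the request.
 * The access_ok() check from the real code is omitted for brevity.
 */
static ssize_t clamp_compat_iovec(struct iovec *iov,
				  const struct compat_iovec __user *uvector,
				  unsigned long nr_segs)
{
	ssize_t tot_len = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++, uvector++, iov++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		if (__get_user(len, &uvector->iov_len) ||
		    __get_user(buf, &uvector->iov_base))
			return -EFAULT;
		if (len < 0)
			return -EINVAL;
		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;	/* the new clamp */
		tot_len += len;
		iov->iov_base = compat_ptr(buf);
		iov->iov_len = (compat_size_t) len;
	}
	return tot_len;
}
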
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..61abb638b4bf 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
23#include <linux/ioctl.h> 22#include <linux/ioctl.h>
24#include <linux/if.h> 23#include <linux/if.h>
25#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
@@ -43,10 +42,9 @@
43#include <linux/tty.h> 42#include <linux/tty.h>
44#include <linux/vt_kern.h> 43#include <linux/vt_kern.h>
45#include <linux/fb.h> 44#include <linux/fb.h>
46#include <linux/videodev.h> 45#include <linux/videodev2.h>
47#include <linux/netdevice.h> 46#include <linux/netdevice.h>
48#include <linux/raw.h> 47#include <linux/raw.h>
49#include <linux/smb_fs.h>
50#include <linux/blkdev.h> 48#include <linux/blkdev.h>
51#include <linux/elevator.h> 49#include <linux/elevator.h>
52#include <linux/rtc.h> 50#include <linux/rtc.h>
@@ -558,25 +556,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
558 556
559#endif /* CONFIG_BLOCK */ 557#endif /* CONFIG_BLOCK */
560 558
561static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
562 compat_uid_t __user *argp)
563{
564 mm_segment_t old_fs = get_fs();
565 __kernel_uid_t kuid;
566 int err;
567
568 cmd = SMB_IOC_GETMOUNTUID;
569
570 set_fs(KERNEL_DS);
571 err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
572 set_fs(old_fs);
573
574 if (err >= 0)
575 err = put_user(kuid, argp);
576
577 return err;
578}
579
580/* Bluetooth ioctls */ 559/* Bluetooth ioctls */
581#define HCIUARTSETPROTO _IOW('U', 200, int) 560#define HCIUARTSETPROTO _IOW('U', 200, int)
582#define HCIUARTGETPROTO _IOR('U', 201, int) 561#define HCIUARTGETPROTO _IOR('U', 201, int)
@@ -599,69 +578,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
599#define HIDPGETCONNLIST _IOR('H', 210, int) 578#define HIDPGETCONNLIST _IOR('H', 210, int)
600#define HIDPGETCONNINFO _IOR('H', 211, int) 579#define HIDPGETCONNINFO _IOR('H', 211, int)
601 580
602#ifdef CONFIG_BLOCK
603struct raw32_config_request
604{
605 compat_int_t raw_minor;
606 __u64 block_major;
607 __u64 block_minor;
608} __attribute__((packed));
609
610static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
611{
612 int ret;
613
614 if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
615 return -EFAULT;
616
617 ret = __get_user(req->raw_minor, &user_req->raw_minor);
618 ret |= __get_user(req->block_major, &user_req->block_major);
619 ret |= __get_user(req->block_minor, &user_req->block_minor);
620
621 return ret ? -EFAULT : 0;
622}
623
624static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
625{
626 int ret;
627
628 if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
629 return -EFAULT;
630
631 ret = __put_user(req->raw_minor, &user_req->raw_minor);
632 ret |= __put_user(req->block_major, &user_req->block_major);
633 ret |= __put_user(req->block_minor, &user_req->block_minor);
634
635 return ret ? -EFAULT : 0;
636}
637
638static int raw_ioctl(unsigned fd, unsigned cmd,
639 struct raw32_config_request __user *user_req)
640{
641 int ret;
642
643 switch (cmd) {
644 case RAW_SETBIND:
645 default: { /* RAW_GETBIND */
646 struct raw_config_request req;
647 mm_segment_t oldfs = get_fs();
648
649 if ((ret = get_raw32_request(&req, user_req)))
650 return ret;
651
652 set_fs(KERNEL_DS);
653 ret = sys_ioctl(fd,cmd,(unsigned long)&req);
654 set_fs(oldfs);
655
656 if ((!ret) && (cmd == RAW_GETBIND)) {
657 ret = set_raw32_request(&req, user_req);
658 }
659 break;
660 }
661 }
662 return ret;
663}
664#endif /* CONFIG_BLOCK */
665 581
666struct serial_struct32 { 582struct serial_struct32 {
667 compat_int_t type; 583 compat_int_t type;
@@ -920,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW)
920COMPATIBLE_IOCTL(TCSETSF) 836COMPATIBLE_IOCTL(TCSETSF)
921COMPATIBLE_IOCTL(TIOCLINUX) 837COMPATIBLE_IOCTL(TIOCLINUX)
922COMPATIBLE_IOCTL(TIOCSBRK) 838COMPATIBLE_IOCTL(TIOCSBRK)
839COMPATIBLE_IOCTL(TIOCGDEV)
923COMPATIBLE_IOCTL(TIOCCBRK) 840COMPATIBLE_IOCTL(TIOCCBRK)
924COMPATIBLE_IOCTL(TIOCGSID) 841COMPATIBLE_IOCTL(TIOCGSID)
925COMPATIBLE_IOCTL(TIOCGICOUNT) 842COMPATIBLE_IOCTL(TIOCGICOUNT)
@@ -1265,8 +1182,6 @@ COMPATIBLE_IOCTL(OSS_GETVERSION)
1265/* Raw devices */ 1182/* Raw devices */
1266COMPATIBLE_IOCTL(RAW_SETBIND) 1183COMPATIBLE_IOCTL(RAW_SETBIND)
1267COMPATIBLE_IOCTL(RAW_GETBIND) 1184COMPATIBLE_IOCTL(RAW_GETBIND)
1268/* SMB ioctls which do not need any translations */
1269COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
1270/* Watchdog */ 1185/* Watchdog */
1271COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) 1186COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
1272COMPATIBLE_IOCTL(WDIOC_GETSTATUS) 1187COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1523,15 +1438,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1523 case MTIOCGET32: 1438 case MTIOCGET32:
1524 case MTIOCPOS32: 1439 case MTIOCPOS32:
1525 return mt_ioctl_trans(fd, cmd, argp); 1440 return mt_ioctl_trans(fd, cmd, argp);
1526 /* Raw devices */
1527 case RAW_SETBIND:
1528 case RAW_GETBIND:
1529 return raw_ioctl(fd, cmd, argp);
1530#endif 1441#endif
1531 /* One SMB ioctl needs translations. */
1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1533 case SMB_IOC_GETMOUNTUID_32:
1534 return do_smb_getmountuid(fd, cmd, argp);
1535 /* Serial */ 1442 /* Serial */
1536 case TIOCGSERIAL: 1443 case TIOCGSERIAL:
1537 case TIOCSSERIAL: 1444 case TIOCSSERIAL:
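
With smbfs support removed and the raw-device translation dropped, this file keeps shrinking toward two idioms. When an ioctl's argument layout is identical on 32- and 64-bit ABIs it gets a one-line COMPATIBLE_IOCTL() entry (as RAW_SETBIND/RAW_GETBIND now do); only a genuinely different layout still warrants a translation handler. A hedged sketch of both shapes, with all EXAMPLE_* names hypothetical:

/* Layout identical on both ABIs: whitelist and pass through. */
COMPATIBLE_IOCTL(EXAMPLE_IOC_GET)

/* Layout differs: copy in the 32-bit form, repack, forward. */
struct example32 {
	compat_int_t	minor;
	compat_uptr_t	buf;
};

struct example {
	int		minor;
	void __user	*buf;
};

static int example_ioctl_trans(unsigned int fd, unsigned int cmd,
			       struct example32 __user *uarg)
{
	struct example karg;
	compat_uptr_t buf;

	if (get_user(karg.minor, &uarg->minor) ||
	    get_user(buf, &uarg->buf))
		return -EFAULT;
	karg.buf = compat_ptr(buf);
	/* do_example_ioctl() is a hypothetical native handler */
	return do_example_ioctl(fd, EXAMPLE_IOC_SET, &karg);
}
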
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
1config CONFIGFS_FS 1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem" 2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS 3 select SYSFS
4 help 4 help
5 configfs is a ram-based filesystem that provides the converse 5 configfs is a RAM-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based 6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager 7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items. 8 of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df40..82bda8fdfc1c 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
90extern const struct file_operations bin_fops; 90extern const struct file_operations bin_fops;
91extern const struct inode_operations configfs_dir_inode_operations; 91extern const struct inode_operations configfs_dir_inode_operations;
92extern const struct inode_operations configfs_symlink_inode_operations; 92extern const struct inode_operations configfs_symlink_inode_operations;
93extern const struct dentry_operations configfs_dentry_ops;
93 94
94extern int configfs_symlink(struct inode *dir, struct dentry *dentry, 95extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
95 const char *symname); 96 const char *symname);
@@ -120,7 +121,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
120{ 121{
121 struct config_item * item = NULL; 122 struct config_item * item = NULL;
122 123
123 spin_lock(&dcache_lock); 124 spin_lock(&dentry->d_lock);
124 if (!d_unhashed(dentry)) { 125 if (!d_unhashed(dentry)) {
125 struct configfs_dirent * sd = dentry->d_fsdata; 126 struct configfs_dirent * sd = dentry->d_fsdata;
126 if (sd->s_type & CONFIGFS_ITEM_LINK) { 127 if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,7 +130,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
129 } else 130 } else
130 item = config_item_get(sd->s_element); 131 item = config_item_get(sd->s_element);
131 } 132 }
132 spin_unlock(&dcache_lock); 133 spin_unlock(&dentry->d_lock);
133 134
134 return item; 135 return item;
135} 136}
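
This small hunk is the recurring shape of the dcache_lock removal running through the whole series: state that used to be protected by the global lock (here, whether the dentry is hashed, plus its d_fsdata) is now guarded by dentry->d_lock alone. The idiom, reduced to a sketch:

/* Inspect per-dentry state and take a reference under d_lock,
 * with no global lock involved. */
struct config_item *item = NULL;

spin_lock(&dentry->d_lock);
if (!d_unhashed(dentry)) {
	struct configfs_dirent *sd = dentry->d_fsdata;
	item = config_item_get(sd->s_element);	/* ref taken under d_lock */
}
spin_unlock(&dentry->d_lock);
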
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c691..90ff3cb10de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,12 +67,12 @@ static void configfs_d_iput(struct dentry * dentry,
67 * We _must_ delete our dentries on last dput, as the chain-to-parent 67 * We _must_ delete our dentries on last dput, as the chain-to-parent
68 * behavior is required to clear the parents of default_groups. 68 * behavior is required to clear the parents of default_groups.
69 */ 69 */
70static int configfs_d_delete(struct dentry *dentry) 70static int configfs_d_delete(const struct dentry *dentry)
71{ 71{
72 return 1; 72 return 1;
73} 73}
74 74
75static const struct dentry_operations configfs_dentry_ops = { 75const struct dentry_operations configfs_dentry_ops = {
76 .d_iput = configfs_d_iput, 76 .d_iput = configfs_d_iput,
77 /* simple_delete_dentry() isn't exported */ 77 /* simple_delete_dentry() isn't exported */
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
232 232
233 sd->s_mode = mode; 233 sd->s_mode = mode;
234 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
235 if (dentry) { 235 if (dentry)
236 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
237 dentry->d_op = &configfs_dentry_ops;
238 }
239 237
240 return 0; 238 return 0;
241} 239}
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
278 error = configfs_create(d, mode, init_dir); 276 error = configfs_create(d, mode, init_dir);
279 if (!error) { 277 if (!error) {
280 inc_nlink(p->d_inode); 278 inc_nlink(p->d_inode);
281 (d)->d_op = &configfs_dentry_ops;
282 } else { 279 } else {
283 struct configfs_dirent *sd = d->d_fsdata; 280 struct configfs_dirent *sd = d->d_fsdata;
284 if (sd) { 281 if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
371 CONFIGFS_ITEM_LINK); 368 CONFIGFS_ITEM_LINK);
372 if (!err) { 369 if (!err) {
373 err = configfs_create(dentry, mode, init_symlink); 370 err = configfs_create(dentry, mode, init_symlink);
374 if (!err) 371 if (err) {
375 dentry->d_op = &configfs_dentry_ops;
376 else {
377 struct configfs_dirent *sd = dentry->d_fsdata; 372 struct configfs_dirent *sd = dentry->d_fsdata;
378 if (sd) { 373 if (sd) {
379 spin_lock(&configfs_dirent_lock); 374 spin_lock(&configfs_dirent_lock);
@@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d)
399 if (d->d_inode) 394 if (d->d_inode)
400 simple_rmdir(parent->d_inode,d); 395 simple_rmdir(parent->d_inode,d);
401 396
402 pr_debug(" o %s removing done (%d)\n",d->d_name.name, 397 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
403 atomic_read(&d->d_count));
404 398
405 dput(parent); 399 dput(parent);
406} 400}
@@ -448,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
448 return error; 442 return error;
449 } 443 }
450 444
451 dentry->d_op = &configfs_dentry_ops;
452 d_rehash(dentry); 445 d_rehash(dentry);
453 446
454 return 0; 447 return 0;
@@ -493,7 +486,10 @@ static struct dentry * configfs_lookup(struct inode *dir,
493 * If it doesn't exist and it isn't a NOT_PINNED item, 486 * If it doesn't exist and it isn't a NOT_PINNED item,
494 * it must be negative. 487 * it must be negative.
495 */ 488 */
496 return simple_lookup(dir, dentry, nd); 489 if (dentry->d_name.len > NAME_MAX)
490 return ERR_PTR(-ENAMETOOLONG);
491 d_add(dentry, NULL);
492 return NULL;
497 } 493 }
498 494
499out: 495out:
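
configfs_lookup() stops delegating to simple_lookup(), presumably because that helper installs its own dentry_operations, which would collide with the superblock-wide configfs_dentry_ops set in mount.c below. The open-coded replacement just instantiates a cached negative dentry:

/* Sketch: hash a NULL-inode dentry so repeated lookups of a missing
 * name are answered from the dcache; returning NULL tells the VFS
 * that the passed-in dentry was consumed. */
if (dentry->d_name.len > NAME_MAX)
	return ERR_PTR(-ENAMETOOLONG);
d_add(dentry, NULL);		/* negative dentry */
return NULL;
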
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..c83f4768eeaa 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
135{ 135{
136 struct inode * inode = new_inode(configfs_sb); 136 struct inode * inode = new_inode(configfs_sb);
137 if (inode) { 137 if (inode) {
138 inode->i_ino = get_next_ino();
138 inode->i_mapping->a_ops = &configfs_aops; 139 inode->i_mapping->a_ops = &configfs_aops;
139 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
140 inode->i_op = &configfs_inode_operations; 141 inode->i_op = &configfs_inode_operations;
@@ -249,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
249 struct dentry * dentry = sd->s_dentry; 250 struct dentry * dentry = sd->s_dentry;
250 251
251 if (dentry) { 252 if (dentry) {
252 spin_lock(&dcache_lock);
253 spin_lock(&dentry->d_lock); 253 spin_lock(&dentry->d_lock);
254 if (!(d_unhashed(dentry) && dentry->d_inode)) { 254 if (!(d_unhashed(dentry) && dentry->d_inode)) {
255 dget_locked(dentry); 255 dget_dlock(dentry);
256 __d_drop(dentry); 256 __d_drop(dentry);
257 spin_unlock(&dentry->d_lock); 257 spin_unlock(&dentry->d_lock);
258 spin_unlock(&dcache_lock);
259 simple_unlink(parent->d_inode, dentry); 258 simple_unlink(parent->d_inode, dentry);
260 } else { 259 } else
261 spin_unlock(&dentry->d_lock); 260 spin_unlock(&dentry->d_lock);
262 spin_unlock(&dcache_lock);
263 }
264 } 261 }
265} 262}
266 263
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2d..ecc62178beda 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,19 +101,20 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
101 configfs_root_group.cg_item.ci_dentry = root; 101 configfs_root_group.cg_item.ci_dentry = root;
102 root->d_fsdata = &configfs_root; 102 root->d_fsdata = &configfs_root;
103 sb->s_root = root; 103 sb->s_root = root;
104 sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
104 return 0; 105 return 0;
105} 106}
106 107
107static int configfs_get_sb(struct file_system_type *fs_type, 108static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 109 int flags, const char *dev_name, void *data)
109{ 110{
110 return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); 111 return mount_single(fs_type, flags, data, configfs_fill_super);
111} 112}
112 113
113static struct file_system_type configfs_fs_type = { 114static struct file_system_type configfs_fs_type = {
114 .owner = THIS_MODULE, 115 .owner = THIS_MODULE,
115 .name = "configfs", 116 .name = "configfs",
116 .get_sb = configfs_get_sb, 117 .mount = configfs_do_mount,
117 .kill_sb = kill_litter_super, 118 .kill_sb = kill_litter_super,
118}; 119};
119 120
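
Two API migrations land together in this file: sb->s_d_op makes d_alloc() install the dentry_operations automatically (which is why the per-dentry `d_op = &configfs_dentry_ops` assignments could be deleted from dir.c above), and .mount replaces .get_sb, returning the root dentry instead of filling in a vfsmount. A minimal conversion sketch for a hypothetical single-instance filesystem:

static const struct dentry_operations examplefs_dentry_ops = { };

static int examplefs_fill_super(struct super_block *sb, void *data,
				int silent)
{
	/* ... allocate the root inode and sb->s_root here ... */
	sb->s_d_op = &examplefs_dentry_ops;	/* inherited by d_alloc() */
	return 0;
}

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return mount_single(fs_type, flags, data, examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner	 = THIS_MODULE,
	.name	 = "examplefs",
	.mount	 = examplefs_mount,
	.kill_sb = kill_litter_super,
};
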
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 1e7a33028d33..e141939080f0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
34static DEFINE_MUTEX(read_mutex); 34static DEFINE_MUTEX(read_mutex);
35 35
36 36
37/* These two macros may change in future, to provide better st_ino 37/* These macros may change in future, to provide better st_ino semantics. */
38 semantics. */
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 38#define OFFSET(x) ((x)->i_ino)
41 39
42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) 40static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
43{ 41{
42 if (!cino->offset)
43 return offset + 1;
44 if (!cino->size)
45 return offset + 1;
46
47 /*
48 * The file mode test fixes buggy mkcramfs implementations where
49 * cramfs_inode->offset is set to a non-zero value for entries
50 * that did not contain data, like device nodes and fifos.
51 */
52 switch (cino->mode & S_IFMT) {
53 case S_IFREG:
54 case S_IFDIR:
55 case S_IFLNK:
56 return cino->offset << 2;
57 default:
58 break;
59 }
60 return offset + 1;
61}
62
63static struct inode *get_cramfs_inode(struct super_block *sb,
64 struct cramfs_inode *cramfs_inode, unsigned int offset)
65{
66 struct inode *inode;
44 static struct timespec zerotime; 67 static struct timespec zerotime;
68
69 inode = iget_locked(sb, cramino(cramfs_inode, offset));
70 if (!inode)
71 return ERR_PTR(-ENOMEM);
72 if (!(inode->i_state & I_NEW))
73 return inode;
74
75 switch (cramfs_inode->mode & S_IFMT) {
76 case S_IFREG:
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 break;
80 case S_IFDIR:
81 inode->i_op = &cramfs_dir_inode_operations;
82 inode->i_fop = &cramfs_directory_operations;
83 break;
84 case S_IFLNK:
85 inode->i_op = &page_symlink_inode_operations;
86 inode->i_data.a_ops = &cramfs_aops;
87 break;
88 default:
89 init_special_inode(inode, cramfs_inode->mode,
90 old_decode_dev(cramfs_inode->size));
91 }
92
45 inode->i_mode = cramfs_inode->mode; 93 inode->i_mode = cramfs_inode->mode;
46 inode->i_uid = cramfs_inode->uid; 94 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid; 95 inode->i_gid = cramfs_inode->gid;
96
97 /* if the lower 2 bits are zero, the inode contains data */
98 if (!(inode->i_ino & 3)) {
99 inode->i_size = cramfs_inode->size;
100 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
101 }
102
50 /* Struct copy intentional */ 103 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; 104 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories, 105 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory 106 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even 107 contents. 1 yields the right result in GNU find, even
55 without -noleaf option. */ 108 without -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
69}
70 109
71static struct inode *get_cramfs_inode(struct super_block *sb, 110 unlock_new_inode(inode);
72 struct cramfs_inode * cramfs_inode) 111
73{
74 struct inode *inode;
75 if (CRAMINO(cramfs_inode) == 1) {
76 inode = new_inode(sb);
77 if (inode) {
78 inode->i_ino = 1;
79 setup_inode(inode, cramfs_inode);
80 }
81 } else {
82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
83 if (inode && (inode->i_state & I_NEW)) {
84 setup_inode(inode, cramfs_inode);
85 unlock_new_inode(inode);
86 }
87 }
88 return inode; 112 return inode;
89} 113}
90 114
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
265 printk(KERN_ERR "cramfs: root is not a directory\n"); 289 printk(KERN_ERR "cramfs: root is not a directory\n");
266 goto out; 290 goto out;
267 } 291 }
292 /* correct strange, hard-coded permissions of mkcramfs */
293 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
294
268 root_offset = super.root.offset << 2; 295 root_offset = super.root.offset << 2;
269 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 296 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
270 sbi->size=super.size; 297 sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
289 316
290 /* Set it all up.. */ 317 /* Set it all up.. */
291 sb->s_op = &cramfs_ops; 318 sb->s_op = &cramfs_ops;
292 root = get_cramfs_inode(sb, &super.root); 319 root = get_cramfs_inode(sb, &super.root, 0);
293 if (!root) 320 if (!root)
294 goto out; 321 goto out;
295 sb->s_root = d_alloc_root(root); 322 sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
365 */ 392 */
366 namelen = de->namelen << 2; 393 namelen = de->namelen << 2;
367 memcpy(buf, name, namelen); 394 memcpy(buf, name, namelen);
368 ino = CRAMINO(de); 395 ino = cramino(de, OFFSET(inode) + offset);
369 mode = de->mode; 396 mode = de->mode;
370 mutex_unlock(&read_mutex); 397 mutex_unlock(&read_mutex);
371 nextoffset = offset + sizeof(*de) + namelen; 398 nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
404 struct cramfs_inode *de; 431 struct cramfs_inode *de;
405 char *name; 432 char *name;
406 int namelen, retval; 433 int namelen, retval;
434 int dir_off = OFFSET(dir) + offset;
407 435
408 de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 436 de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
409 name = (char *)(de+1); 437 name = (char *)(de+1);
410 438
411 /* Try to take advantage of sorted directories */ 439 /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
436 if (!retval) { 464 if (!retval) {
437 struct cramfs_inode entry = *de; 465 struct cramfs_inode entry = *de;
438 mutex_unlock(&read_mutex); 466 mutex_unlock(&read_mutex);
439 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); 467 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
440 return NULL; 468 return NULL;
441 } 469 }
442 /* else (retval < 0) */ 470 /* else (retval < 0) */
@@ -533,17 +561,16 @@ static const struct super_operations cramfs_ops = {
533 .statfs = cramfs_statfs, 561 .statfs = cramfs_statfs,
534}; 562};
535 563
536static int cramfs_get_sb(struct file_system_type *fs_type, 564static struct dentry *cramfs_mount(struct file_system_type *fs_type,
537 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 565 int flags, const char *dev_name, void *data)
538{ 566{
539 return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, 567 return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
540 mnt);
541} 568}
542 569
543static struct file_system_type cramfs_fs_type = { 570static struct file_system_type cramfs_fs_type = {
544 .owner = THIS_MODULE, 571 .owner = THIS_MODULE,
545 .name = "cramfs", 572 .name = "cramfs",
546 .get_sb = cramfs_get_sb, 573 .mount = cramfs_mount,
547 .kill_sb = kill_block_super, 574 .kill_sb = kill_block_super,
548 .fs_flags = FS_REQUIRES_DEV, 575 .fs_flags = FS_REQUIRES_DEV,
549}; 576};
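
The rewrite above folds cramfs's two inode paths (a special-cased new_inode() for the root, iget_locked() for everything else) into one iget_locked() flow, initializing an inode only when it comes back with I_NEW set. That pattern in general form, for a hypothetical read-only filesystem:

struct inode *examplefs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);		/* find in cache or allocate */
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))		/* cache hit: already set up */
		return inode;

	/* first use: fill in mode, ops and sizes from the on-disk inode;
	 * examplefs_read_inode() is a hypothetical helper */
	examplefs_read_inode(inode);

	unlock_new_inode(inode);		/* clears I_NEW, wakes waiters */
	return inode;
}
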
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..2a6bd9a4ae97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,20 +33,58 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h> 35#include <linux/hardirq.h>
36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h>
36#include "internal.h" 38#include "internal.h"
37 39
40/*
41 * Usage:
42 * dcache->d_inode->i_lock protects:
43 * - i_dentry, d_alias, d_inode of aliases
44 * dcache_hash_bucket lock protects:
45 * - the dcache hash table
46 * s_anon bl list spinlock protects:
47 * - the s_anon list (see __d_drop)
48 * dcache_lru_lock protects:
49 * - the dcache lru lists and counters
50 * d_lock protects:
51 * - d_flags
52 * - d_name
53 * - d_lru
54 * - d_count
55 * - d_unhashed()
56 * - d_parent and d_subdirs
57 * - children's d_child and d_parent
58 * - d_alias, d_inode
59 *
60 * Ordering:
61 * dentry->d_inode->i_lock
62 * dentry->d_lock
63 * dcache_lru_lock
64 * dcache_hash_bucket lock
65 * s_anon lock
66 *
67 * If there is an ancestor relationship:
68 * dentry->d_parent->...->d_parent->d_lock
69 * ...
70 * dentry->d_parent->d_lock
71 * dentry->d_lock
72 *
73 * If no ancestor relationship:
74 * if (dentry1 < dentry2)
75 * dentry1->d_lock
76 * dentry2->d_lock
77 */
38int sysctl_vfs_cache_pressure __read_mostly = 100; 78int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 79EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 80
41 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); 81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
42__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); 82__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
43 83
44EXPORT_SYMBOL(dcache_lock); 84EXPORT_SYMBOL(rename_lock);
45 85
46static struct kmem_cache *dentry_cache __read_mostly; 86static struct kmem_cache *dentry_cache __read_mostly;
47 87
48#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
49
50/* 88/*
51 * This is the single most critical data structure when it comes 89 * This is the single most critical data structure when it comes
52 * to the dcache: the hashtable for lookups. Somebody should try 90 * to the dcache: the hashtable for lookups. Somebody should try
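
The ordering comment above ends with the rule for dentries that share no ancestor relationship: take the locks in address order. A sketch of a helper enforcing that rule (the identical-dentry case included; this is an illustration, not the kernel's actual helper):

static void lock_two_dentries(struct dentry *d1, struct dentry *d2)
{
	if (d1 == d2) {
		spin_lock(&d1->d_lock);
		return;
	}
	/* address order makes the lock order globally consistent,
	 * so two tasks locking the same pair cannot deadlock */
	if (d1 < d2) {
		spin_lock(&d1->d_lock);
		spin_lock_nested(&d2->d_lock, DENTRY_D_LOCK_NESTED);
	} else {
		spin_lock(&d2->d_lock);
		spin_lock_nested(&d1->d_lock, DENTRY_D_LOCK_NESTED);
	}
}
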
@@ -60,56 +98,111 @@ static struct kmem_cache *dentry_cache __read_mostly;
60 98
61static unsigned int d_hash_mask __read_mostly; 99static unsigned int d_hash_mask __read_mostly;
62static unsigned int d_hash_shift __read_mostly; 100static unsigned int d_hash_shift __read_mostly;
63static struct hlist_head *dentry_hashtable __read_mostly; 101
102struct dcache_hash_bucket {
103 struct hlist_bl_head head;
104};
105static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
106
107static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
108 unsigned long hash)
109{
110 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
111 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
112 return dentry_hashtable + (hash & D_HASHMASK);
113}
114
115static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
116{
117 bit_spin_lock(0, (unsigned long *)&b->head.first);
118}
119
120static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
121{
122 __bit_spin_unlock(0, (unsigned long *)&b->head.first);
123}
64 124
65/* Statistics gathering. */ 125/* Statistics gathering. */
66struct dentry_stat_t dentry_stat = { 126struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 127 .age_limit = 45,
68}; 128};
69 129
70static void __d_free(struct dentry *dentry) 130static DEFINE_PER_CPU(unsigned int, nr_dentry);
131
132#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
133static int get_nr_dentry(void)
134{
135 int i;
136 int sum = 0;
137 for_each_possible_cpu(i)
138 sum += per_cpu(nr_dentry, i);
139 return sum < 0 ? 0 : sum;
140}
141
142int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
143 size_t *lenp, loff_t *ppos)
71{ 144{
145 dentry_stat.nr_dentry = get_nr_dentry();
146 return proc_dointvec(table, write, buffer, lenp, ppos);
147}
148#endif
149
150static void __d_free(struct rcu_head *head)
151{
152 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
153
72 WARN_ON(!list_empty(&dentry->d_alias)); 154 WARN_ON(!list_empty(&dentry->d_alias));
73 if (dname_external(dentry)) 155 if (dname_external(dentry))
74 kfree(dentry->d_name.name); 156 kfree(dentry->d_name.name);
75 kmem_cache_free(dentry_cache, dentry); 157 kmem_cache_free(dentry_cache, dentry);
76} 158}
77 159
78static void d_callback(struct rcu_head *head)
79{
80 struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
81 __d_free(dentry);
82}
83
84/* 160/*
85 * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry 161 * no locks, please.
86 * inside dcache_lock.
87 */ 162 */
88static void d_free(struct dentry *dentry) 163static void d_free(struct dentry *dentry)
89{ 164{
165 BUG_ON(dentry->d_count);
166 this_cpu_dec(nr_dentry);
90 if (dentry->d_op && dentry->d_op->d_release) 167 if (dentry->d_op && dentry->d_op->d_release)
91 dentry->d_op->d_release(dentry); 168 dentry->d_op->d_release(dentry);
169
92 /* if dentry was never inserted into hash, immediate free is OK */ 170 /* if dentry was never inserted into hash, immediate free is OK */
93 if (hlist_unhashed(&dentry->d_hash)) 171 if (hlist_bl_unhashed(&dentry->d_hash))
94 __d_free(dentry); 172 __d_free(&dentry->d_u.d_rcu);
95 else 173 else
96 call_rcu(&dentry->d_u.d_rcu, d_callback); 174 call_rcu(&dentry->d_u.d_rcu, __d_free);
175}
176
177/**
178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
179 * @dentry: the target dentry
180 * After this call, in-progress rcu-walk path lookup will fail. This
181 * should be called after unhashing, and after changing d_inode (if
182 * the dentry has not already been unhashed).
183 */
184static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
185{
186 assert_spin_locked(&dentry->d_lock);
187 /* Go through a barrier */
188 write_seqcount_barrier(&dentry->d_seq);
97} 189}
98 190
99/* 191/*
100 * Release the dentry's inode, using the filesystem 192 * Release the dentry's inode, using the filesystem
101 * d_iput() operation if defined. 193 * d_iput() operation if defined. Dentry has no refcount
194 * and is unhashed.
102 */ 195 */
103static void dentry_iput(struct dentry * dentry) 196static void dentry_iput(struct dentry * dentry)
104 __releases(dentry->d_lock) 197 __releases(dentry->d_lock)
105 __releases(dcache_lock) 198 __releases(dentry->d_inode->i_lock)
106{ 199{
107 struct inode *inode = dentry->d_inode; 200 struct inode *inode = dentry->d_inode;
108 if (inode) { 201 if (inode) {
109 dentry->d_inode = NULL; 202 dentry->d_inode = NULL;
110 list_del_init(&dentry->d_alias); 203 list_del_init(&dentry->d_alias);
111 spin_unlock(&dentry->d_lock); 204 spin_unlock(&dentry->d_lock);
112 spin_unlock(&dcache_lock); 205 spin_unlock(&inode->i_lock);
113 if (!inode->i_nlink) 206 if (!inode->i_nlink)
114 fsnotify_inoderemove(inode); 207 fsnotify_inoderemove(inode);
115 if (dentry->d_op && dentry->d_op->d_iput) 208 if (dentry->d_op && dentry->d_op->d_iput)
@@ -118,69 +211,191 @@ static void dentry_iput(struct dentry * dentry)
118 iput(inode); 211 iput(inode);
119 } else { 212 } else {
120 spin_unlock(&dentry->d_lock); 213 spin_unlock(&dentry->d_lock);
121 spin_unlock(&dcache_lock);
122 } 214 }
123} 215}
124 216
125/* 217/*
126 * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. 218 * Release the dentry's inode, using the filesystem
219 * d_iput() operation if defined. dentry remains in-use.
220 */
221static void dentry_unlink_inode(struct dentry * dentry)
222 __releases(dentry->d_lock)
223 __releases(dentry->d_inode->i_lock)
224{
225 struct inode *inode = dentry->d_inode;
226 dentry->d_inode = NULL;
227 list_del_init(&dentry->d_alias);
228 dentry_rcuwalk_barrier(dentry);
229 spin_unlock(&dentry->d_lock);
230 spin_unlock(&inode->i_lock);
231 if (!inode->i_nlink)
232 fsnotify_inoderemove(inode);
233 if (dentry->d_op && dentry->d_op->d_iput)
234 dentry->d_op->d_iput(dentry, inode);
235 else
236 iput(inode);
237}
238
239/*
240 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
127 */ 241 */
128static void dentry_lru_add(struct dentry *dentry) 242static void dentry_lru_add(struct dentry *dentry)
129{ 243{
130 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 244 if (list_empty(&dentry->d_lru)) {
131 dentry->d_sb->s_nr_dentry_unused++; 245 spin_lock(&dcache_lru_lock);
132 dentry_stat.nr_unused++; 246 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
247 dentry->d_sb->s_nr_dentry_unused++;
248 dentry_stat.nr_unused++;
249 spin_unlock(&dcache_lru_lock);
250 }
133} 251}
134 252
135static void dentry_lru_add_tail(struct dentry *dentry) 253static void __dentry_lru_del(struct dentry *dentry)
136{ 254{
137 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 255 list_del_init(&dentry->d_lru);
138 dentry->d_sb->s_nr_dentry_unused++; 256 dentry->d_sb->s_nr_dentry_unused--;
139 dentry_stat.nr_unused++; 257 dentry_stat.nr_unused--;
140} 258}
141 259
142static void dentry_lru_del(struct dentry *dentry) 260static void dentry_lru_del(struct dentry *dentry)
143{ 261{
144 if (!list_empty(&dentry->d_lru)) { 262 if (!list_empty(&dentry->d_lru)) {
145 list_del(&dentry->d_lru); 263 spin_lock(&dcache_lru_lock);
146 dentry->d_sb->s_nr_dentry_unused--; 264 __dentry_lru_del(dentry);
147 dentry_stat.nr_unused--; 265 spin_unlock(&dcache_lru_lock);
148 } 266 }
149} 267}
150 268
151static void dentry_lru_del_init(struct dentry *dentry) 269static void dentry_lru_move_tail(struct dentry *dentry)
152{ 270{
153 if (likely(!list_empty(&dentry->d_lru))) { 271 spin_lock(&dcache_lru_lock);
154 list_del_init(&dentry->d_lru); 272 if (list_empty(&dentry->d_lru)) {
155 dentry->d_sb->s_nr_dentry_unused--; 273 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
156 dentry_stat.nr_unused--; 274 dentry->d_sb->s_nr_dentry_unused++;
275 dentry_stat.nr_unused++;
276 } else {
277 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
157 } 278 }
279 spin_unlock(&dcache_lru_lock);
158} 280}
159 281
160/** 282/**
161 * d_kill - kill dentry and return parent 283 * d_kill - kill dentry and return parent
162 * @dentry: dentry to kill 284 * @dentry: dentry to kill
285 * @parent: parent dentry
163 * 286 *
164 * The dentry must already be unhashed and removed from the LRU. 287 * The dentry must already be unhashed and removed from the LRU.
165 * 288 *
166 * If this is the root of the dentry tree, return NULL. 289 * If this is the root of the dentry tree, return NULL.
290 *
291 * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
292 * d_kill.
167 */ 293 */
168static struct dentry *d_kill(struct dentry *dentry) 294static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
169 __releases(dentry->d_lock) 295 __releases(dentry->d_lock)
170 __releases(dcache_lock) 296 __releases(parent->d_lock)
297 __releases(dentry->d_inode->i_lock)
171{ 298{
172 struct dentry *parent; 299 dentry->d_parent = NULL;
173
174 list_del(&dentry->d_u.d_child); 300 list_del(&dentry->d_u.d_child);
175 dentry_stat.nr_dentry--; /* For d_free, below */ 301 if (parent)
176 /*drops the locks, at that point nobody can reach this dentry */ 302 spin_unlock(&parent->d_lock);
177 dentry_iput(dentry); 303 dentry_iput(dentry);
304 /*
305 * dentry_iput drops the locks, at which point nobody (except
306 * transient RCU lookups) can reach this dentry.
307 */
308 d_free(dentry);
309 return parent;
310}
311
312/**
313 * d_drop - drop a dentry
314 * @dentry: dentry to drop
315 *
316 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
317 * be found through a VFS lookup any more. Note that this is different from
318 * deleting the dentry - d_delete will try to mark the dentry negative if
319 * possible, giving a successful _negative_ lookup, while d_drop will
320 * just make the cache lookup fail.
321 *
322 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
323 * reason (NFS timeouts or autofs deletes).
324 *
325 * __d_drop requires dentry->d_lock.
326 */
327void __d_drop(struct dentry *dentry)
328{
329 if (!(dentry->d_flags & DCACHE_UNHASHED)) {
330 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
331 bit_spin_lock(0,
332 (unsigned long *)&dentry->d_sb->s_anon.first);
333 dentry->d_flags |= DCACHE_UNHASHED;
334 hlist_bl_del_init(&dentry->d_hash);
335 __bit_spin_unlock(0,
336 (unsigned long *)&dentry->d_sb->s_anon.first);
337 } else {
338 struct dcache_hash_bucket *b;
339 b = d_hash(dentry->d_parent, dentry->d_name.hash);
340 spin_lock_bucket(b);
341 /*
342 * We may not actually need to put DCACHE_UNHASHED
343 * manipulations under the hash lock, but follow
344 * the principle of least surprise.
345 */
346 dentry->d_flags |= DCACHE_UNHASHED;
347 hlist_bl_del_rcu(&dentry->d_hash);
348 spin_unlock_bucket(b);
349 dentry_rcuwalk_barrier(dentry);
350 }
351 }
352}
353EXPORT_SYMBOL(__d_drop);
354
355void d_drop(struct dentry *dentry)
356{
357 spin_lock(&dentry->d_lock);
358 __d_drop(dentry);
359 spin_unlock(&dentry->d_lock);
360}
361EXPORT_SYMBOL(d_drop);
362
363/*
364 * Finish off a dentry we've decided to kill.
365 * dentry->d_lock must be held, returns with it unlocked.
366 * If ref is non-zero, then decrement the refcount too.
367 * Returns dentry requiring refcount drop, or NULL if we're done.
368 */
369static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
370 __releases(dentry->d_lock)
371{
372 struct inode *inode;
373 struct dentry *parent;
374
375 inode = dentry->d_inode;
376 if (inode && !spin_trylock(&inode->i_lock)) {
377relock:
378 spin_unlock(&dentry->d_lock);
379 cpu_relax();
380 return dentry; /* try again with same dentry */
381 }
178 if (IS_ROOT(dentry)) 382 if (IS_ROOT(dentry))
179 parent = NULL; 383 parent = NULL;
180 else 384 else
181 parent = dentry->d_parent; 385 parent = dentry->d_parent;
182 d_free(dentry); 386 if (parent && !spin_trylock(&parent->d_lock)) {
183 return parent; 387 if (inode)
388 spin_unlock(&inode->i_lock);
389 goto relock;
390 }
391
392 if (ref)
393 dentry->d_count--;
394 /* if dentry was on the d_lru list delete it from there */
395 dentry_lru_del(dentry);
396 /* if it was on the hash then remove it */
397 __d_drop(dentry);
398 return d_kill(dentry, parent);
184} 399}
185 400
186/* 401/*
@@ -208,52 +423,42 @@ static struct dentry *d_kill(struct dentry *dentry)
208 * call the dentry unlink method as well as removing it from the queues and 423 * call the dentry unlink method as well as removing it from the queues and
209 * releasing its resources. If the parent dentries were scheduled for release 424 * releasing its resources. If the parent dentries were scheduled for release
210 * they too may now get deleted. 425 * they too may now get deleted.
211 *
212 * no dcache lock, please.
213 */ 426 */
214
215void dput(struct dentry *dentry) 427void dput(struct dentry *dentry)
216{ 428{
217 if (!dentry) 429 if (!dentry)
218 return; 430 return;
219 431
220repeat: 432repeat:
221 if (atomic_read(&dentry->d_count) == 1) 433 if (dentry->d_count == 1)
222 might_sleep(); 434 might_sleep();
223 if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
224 return;
225
226 spin_lock(&dentry->d_lock); 435 spin_lock(&dentry->d_lock);
227 if (atomic_read(&dentry->d_count)) { 436 BUG_ON(!dentry->d_count);
437 if (dentry->d_count > 1) {
438 dentry->d_count--;
228 spin_unlock(&dentry->d_lock); 439 spin_unlock(&dentry->d_lock);
229 spin_unlock(&dcache_lock);
230 return; 440 return;
231 } 441 }
232 442
233 /* 443 if (dentry->d_flags & DCACHE_OP_DELETE) {
234 * AV: ->d_delete() is _NOT_ allowed to block now.
235 */
236 if (dentry->d_op && dentry->d_op->d_delete) {
237 if (dentry->d_op->d_delete(dentry)) 444 if (dentry->d_op->d_delete(dentry))
238 goto unhash_it; 445 goto kill_it;
239 } 446 }
447
240 /* Unreachable? Get rid of it */ 448 /* Unreachable? Get rid of it */
241 if (d_unhashed(dentry)) 449 if (d_unhashed(dentry))
242 goto kill_it; 450 goto kill_it;
243 if (list_empty(&dentry->d_lru)) { 451
244 dentry->d_flags |= DCACHE_REFERENCED; 452 /* Otherwise leave it cached and ensure it's on the LRU */
245 dentry_lru_add(dentry); 453 dentry->d_flags |= DCACHE_REFERENCED;
246 } 454 dentry_lru_add(dentry);
247 spin_unlock(&dentry->d_lock); 455
248 spin_unlock(&dcache_lock); 456 dentry->d_count--;
457 spin_unlock(&dentry->d_lock);
249 return; 458 return;
250 459
251unhash_it:
252 __d_drop(dentry);
253kill_it: 460kill_it:
254 /* if dentry was on the d_lru list delete it from there */ 461 dentry = dentry_kill(dentry, 1);
255 dentry_lru_del(dentry);
256 dentry = d_kill(dentry);
257 if (dentry) 462 if (dentry)
258 goto repeat; 463 goto repeat;
259} 464}
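
Note the DCACHE_OP_DELETE test in the new dput(): instead of two dependent pointer loads (d_op, then d_op->d_delete) on a hot path, the presence of the hook is cached as a flag bit when the operations are installed. A sketch of how such a flag would be maintained (the helper name is hypothetical):

static void example_set_d_op(struct dentry *dentry,
			     const struct dentry_operations *op)
{
	dentry->d_op = op;
	if (op && op->d_delete)
		dentry->d_flags |= DCACHE_OP_DELETE;
	else
		dentry->d_flags &= ~DCACHE_OP_DELETE;
}
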
@@ -276,9 +481,9 @@ int d_invalidate(struct dentry * dentry)
276 /* 481 /*
277 * If it's already been dropped, return OK. 482 * If it's already been dropped, return OK.
278 */ 483 */
279 spin_lock(&dcache_lock); 484 spin_lock(&dentry->d_lock);
280 if (d_unhashed(dentry)) { 485 if (d_unhashed(dentry)) {
281 spin_unlock(&dcache_lock); 486 spin_unlock(&dentry->d_lock);
282 return 0; 487 return 0;
283 } 488 }
284 /* 489 /*
@@ -286,9 +491,9 @@ int d_invalidate(struct dentry * dentry)
286 * to get rid of unused child entries. 491 * to get rid of unused child entries.
287 */ 492 */
288 if (!list_empty(&dentry->d_subdirs)) { 493 if (!list_empty(&dentry->d_subdirs)) {
289 spin_unlock(&dcache_lock); 494 spin_unlock(&dentry->d_lock);
290 shrink_dcache_parent(dentry); 495 shrink_dcache_parent(dentry);
291 spin_lock(&dcache_lock); 496 spin_lock(&dentry->d_lock);
292 } 497 }
293 498
294 /* 499 /*
@@ -301,36 +506,61 @@ int d_invalidate(struct dentry * dentry)
301 * we might still populate it if it was a 506 * we might still populate it if it was a
302 * working directory or similar). 507 * working directory or similar).
303 */ 508 */
304 spin_lock(&dentry->d_lock); 509 if (dentry->d_count > 1) {
305 if (atomic_read(&dentry->d_count) > 1) {
306 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 510 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
307 spin_unlock(&dentry->d_lock); 511 spin_unlock(&dentry->d_lock);
308 spin_unlock(&dcache_lock);
309 return -EBUSY; 512 return -EBUSY;
310 } 513 }
311 } 514 }
312 515
313 __d_drop(dentry); 516 __d_drop(dentry);
314 spin_unlock(&dentry->d_lock); 517 spin_unlock(&dentry->d_lock);
315 spin_unlock(&dcache_lock);
316 return 0; 518 return 0;
317} 519}
318EXPORT_SYMBOL(d_invalidate); 520EXPORT_SYMBOL(d_invalidate);
319 521
320/* This should be called _only_ with dcache_lock held */ 522/* This must be called with d_lock held */
523static inline void __dget_dlock(struct dentry *dentry)
524{
525 dentry->d_count++;
526}
321 527
322static inline struct dentry * __dget_locked(struct dentry *dentry) 528static inline void __dget(struct dentry *dentry)
323{ 529{
324 atomic_inc(&dentry->d_count); 530 spin_lock(&dentry->d_lock);
325 dentry_lru_del_init(dentry); 531 __dget_dlock(dentry);
326 return dentry; 532 spin_unlock(&dentry->d_lock);
327} 533}
328 534
329struct dentry * dget_locked(struct dentry *dentry) 535struct dentry *dget_parent(struct dentry *dentry)
330{ 536{
331 return __dget_locked(dentry); 537 struct dentry *ret;
538
539repeat:
540 /*
541 * Don't need rcu_dereference because we re-check it was correct under
542 * the lock.
543 */
544 rcu_read_lock();
545 ret = dentry->d_parent;
546 if (!ret) {
547 rcu_read_unlock();
548 goto out;
549 }
550 spin_lock(&ret->d_lock);
551 if (unlikely(ret != dentry->d_parent)) {
552 spin_unlock(&ret->d_lock);
553 rcu_read_unlock();
554 goto repeat;
555 }
556 rcu_read_unlock();
557 BUG_ON(!ret->d_count);
558 ret->d_count++;
559 spin_unlock(&ret->d_lock);
560out:
561 return ret;
332} 562}
333EXPORT_SYMBOL(dget_locked); 563EXPORT_SYMBOL(dget_parent);
334 564
335/** 565/**
336 * d_find_alias - grab a hashed alias of inode 566 * d_find_alias - grab a hashed alias of inode
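
dget_parent() above is a textbook optimistic-read loop: sample d_parent under rcu_read_lock(), lock the candidate, and re-check that d_parent still points at it, retrying if a concurrent rename moved the dentry (RCU keeps the candidate's memory valid while it is being locked). The pattern generalized to a hypothetical RCU-freed node type:

struct node {
	struct node __rcu *parent;
	spinlock_t lock;
	int refcount;
};

static struct node *get_stable_parent(struct node *n)
{
	struct node *parent;

repeat:
	rcu_read_lock();
	parent = rcu_dereference(n->parent);	/* may change under us */
	if (!parent) {
		rcu_read_unlock();
		return NULL;
	}
	spin_lock(&parent->lock);	/* safe: RCU keeps parent alive */
	if (parent != n->parent) {	/* re-check under the lock */
		spin_unlock(&parent->lock);
		rcu_read_unlock();
		goto repeat;
	}
	rcu_read_unlock();
	parent->refcount++;		/* pin while locked */
	spin_unlock(&parent->lock);
	return parent;
}
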
@@ -348,42 +578,51 @@ EXPORT_SYMBOL(dget_locked);
348 * any other hashed alias over that one unless @want_discon is set, 578 * any other hashed alias over that one unless @want_discon is set,
349 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 579 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
350 */ 580 */
351 581static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
352static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
353{ 582{
354 struct list_head *head, *next, *tmp; 583 struct dentry *alias, *discon_alias;
355 struct dentry *alias, *discon_alias=NULL;
356 584
357 head = &inode->i_dentry; 585again:
358 next = inode->i_dentry.next; 586 discon_alias = NULL;
359 while (next != head) { 587 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
360 tmp = next; 588 spin_lock(&alias->d_lock);
361 next = tmp->next;
362 prefetch(next);
363 alias = list_entry(tmp, struct dentry, d_alias);
364 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 589 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
365 if (IS_ROOT(alias) && 590 if (IS_ROOT(alias) &&
366 (alias->d_flags & DCACHE_DISCONNECTED)) 591 (alias->d_flags & DCACHE_DISCONNECTED)) {
367 discon_alias = alias; 592 discon_alias = alias;
368 else if (!want_discon) { 593 } else if (!want_discon) {
369 __dget_locked(alias); 594 __dget_dlock(alias);
595 spin_unlock(&alias->d_lock);
596 return alias;
597 }
598 }
599 spin_unlock(&alias->d_lock);
600 }
601 if (discon_alias) {
602 alias = discon_alias;
603 spin_lock(&alias->d_lock);
604 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
605 if (IS_ROOT(alias) &&
606 (alias->d_flags & DCACHE_DISCONNECTED)) {
607 __dget_dlock(alias);
608 spin_unlock(&alias->d_lock);
370 return alias; 609 return alias;
371 } 610 }
372 } 611 }
612 spin_unlock(&alias->d_lock);
613 goto again;
373 } 614 }
374 if (discon_alias) 615 return NULL;
375 __dget_locked(discon_alias);
376 return discon_alias;
377} 616}
378 617
379struct dentry * d_find_alias(struct inode *inode) 618struct dentry *d_find_alias(struct inode *inode)
380{ 619{
381 struct dentry *de = NULL; 620 struct dentry *de = NULL;
382 621
383 if (!list_empty(&inode->i_dentry)) { 622 if (!list_empty(&inode->i_dentry)) {
384 spin_lock(&dcache_lock); 623 spin_lock(&inode->i_lock);
385 de = __d_find_alias(inode, 0); 624 de = __d_find_alias(inode, 0);
386 spin_unlock(&dcache_lock); 625 spin_unlock(&inode->i_lock);
387 } 626 }
388 return de; 627 return de;
389} 628}
@@ -397,132 +636,153 @@ void d_prune_aliases(struct inode *inode)
397{ 636{
398 struct dentry *dentry; 637 struct dentry *dentry;
399restart: 638restart:
400 spin_lock(&dcache_lock); 639 spin_lock(&inode->i_lock);
401 list_for_each_entry(dentry, &inode->i_dentry, d_alias) { 640 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
402 spin_lock(&dentry->d_lock); 641 spin_lock(&dentry->d_lock);
403 if (!atomic_read(&dentry->d_count)) { 642 if (!dentry->d_count) {
404 __dget_locked(dentry); 643 __dget_dlock(dentry);
405 __d_drop(dentry); 644 __d_drop(dentry);
406 spin_unlock(&dentry->d_lock); 645 spin_unlock(&dentry->d_lock);
407 spin_unlock(&dcache_lock); 646 spin_unlock(&inode->i_lock);
408 dput(dentry); 647 dput(dentry);
409 goto restart; 648 goto restart;
410 } 649 }
411 spin_unlock(&dentry->d_lock); 650 spin_unlock(&dentry->d_lock);
412 } 651 }
413 spin_unlock(&dcache_lock); 652 spin_unlock(&inode->i_lock);
414} 653}
415EXPORT_SYMBOL(d_prune_aliases); 654EXPORT_SYMBOL(d_prune_aliases);
416 655
417/* 656/*
418 * Throw away a dentry - free the inode, dput the parent. This requires that 657 * Try to throw away a dentry - free the inode, dput the parent.
419 * the LRU list has already been removed. 658 * Requires dentry->d_lock is held, and dentry->d_count == 0.
659 * Releases dentry->d_lock.
420 * 660 *
421 * Try to prune ancestors as well. This is necessary to prevent 661 * This may fail if locks cannot be acquired; no problem, just try again.
422 * quadratic behavior of shrink_dcache_parent(), but is also expected
423 * to be beneficial in reducing dentry cache fragmentation.
424 */ 662 */
425static void prune_one_dentry(struct dentry * dentry) 663static void try_prune_one_dentry(struct dentry *dentry)
426 __releases(dentry->d_lock) 664 __releases(dentry->d_lock)
427 __releases(dcache_lock)
428 __acquires(dcache_lock)
429{ 665{
430 __d_drop(dentry); 666 struct dentry *parent;
431 dentry = d_kill(dentry);
432 667
668 parent = dentry_kill(dentry, 0);
433 /* 669 /*
434 * Prune ancestors. Locking is simpler than in dput(), 670 * If dentry_kill returns NULL, we have nothing more to do.
435 * because dcache_lock needs to be taken anyway. 671 * If it returns the same dentry, trylocks failed. In either
672 * case, just loop again.
673 *
674 * Otherwise, we need to prune ancestors too. This is necessary
675 * to prevent quadratic behavior of shrink_dcache_parent(), but
676 * is also expected to be beneficial in reducing dentry cache
677 * fragmentation.
436 */ 678 */
437 spin_lock(&dcache_lock); 679 if (!parent)
680 return;
681 if (parent == dentry)
682 return;
683
684 /* Prune ancestors. */
685 dentry = parent;
438 while (dentry) { 686 while (dentry) {
439 if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) 687 spin_lock(&dentry->d_lock);
688 if (dentry->d_count > 1) {
689 dentry->d_count--;
690 spin_unlock(&dentry->d_lock);
440 return; 691 return;
441 692 }
442 if (dentry->d_op && dentry->d_op->d_delete) 693 dentry = dentry_kill(dentry, 1);
443 dentry->d_op->d_delete(dentry);
444 dentry_lru_del_init(dentry);
445 __d_drop(dentry);
446 dentry = d_kill(dentry);
447 spin_lock(&dcache_lock);
448 } 694 }
449} 695}
450 696
451/* 697static void shrink_dentry_list(struct list_head *list)
452 * Shrink the dentry LRU on a given superblock.
453 * @sb : superblock to shrink dentry LRU.
454 * @count: If count is NULL, we prune all dentries on superblock.
455 * @flags: If flags is non-zero, we need to do special processing based on
456 * which flags are set. This means we don't need to maintain multiple
457 * similar copies of this loop.
458 */
459static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
460{ 698{
461 LIST_HEAD(referenced);
462 LIST_HEAD(tmp);
463 struct dentry *dentry; 699 struct dentry *dentry;
464 int cnt = 0;
465
466 BUG_ON(!sb);
467 BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
468 spin_lock(&dcache_lock);
469 if (count != NULL)
470 /* called from prune_dcache() and shrink_dcache_parent() */
471 cnt = *count;
472restart:
473 if (count == NULL)
474 list_splice_init(&sb->s_dentry_lru, &tmp);
475 else {
476 while (!list_empty(&sb->s_dentry_lru)) {
477 dentry = list_entry(sb->s_dentry_lru.prev,
478 struct dentry, d_lru);
479 BUG_ON(dentry->d_sb != sb);
480 700
481 spin_lock(&dentry->d_lock); 701 rcu_read_lock();
482 /* 702 for (;;) {
483 * If we are honouring the DCACHE_REFERENCED flag and 703 dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
484 * the dentry has this flag set, don't free it. Clear 704 if (&dentry->d_lru == list)
485 * the flag and put it back on the LRU. 705 break; /* empty */
486 */
487 if ((flags & DCACHE_REFERENCED)
488 && (dentry->d_flags & DCACHE_REFERENCED)) {
489 dentry->d_flags &= ~DCACHE_REFERENCED;
490 list_move(&dentry->d_lru, &referenced);
491 spin_unlock(&dentry->d_lock);
492 } else {
493 list_move_tail(&dentry->d_lru, &tmp);
494 spin_unlock(&dentry->d_lock);
495 cnt--;
496 if (!cnt)
497 break;
498 }
499 cond_resched_lock(&dcache_lock);
500 }
501 }
502 while (!list_empty(&tmp)) {
503 dentry = list_entry(tmp.prev, struct dentry, d_lru);
504 dentry_lru_del_init(dentry);
505 spin_lock(&dentry->d_lock); 706 spin_lock(&dentry->d_lock);
707 if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
708 spin_unlock(&dentry->d_lock);
709 continue;
710 }
711
506 /* 712 /*
507 * We found an inuse dentry which was not removed from 713 * We found an inuse dentry which was not removed from
508 * the LRU because of laziness during lookup. Do not free 714 * the LRU because of laziness during lookup. Do not free
509 * it - just keep it off the LRU list. 715 * it - just keep it off the LRU list.
510 */ 716 */
511 if (atomic_read(&dentry->d_count)) { 717 if (dentry->d_count) {
718 dentry_lru_del(dentry);
512 spin_unlock(&dentry->d_lock); 719 spin_unlock(&dentry->d_lock);
513 continue; 720 continue;
514 } 721 }
515 prune_one_dentry(dentry); 722
516 /* dentry->d_lock was dropped in prune_one_dentry() */ 723 rcu_read_unlock();
517 cond_resched_lock(&dcache_lock); 724
518 } 725 try_prune_one_dentry(dentry);
519 if (count == NULL && !list_empty(&sb->s_dentry_lru)) 726
520 goto restart; 727 rcu_read_lock();
521 if (count != NULL) 728 }
522 *count = cnt; 729 rcu_read_unlock();
730}
731
732/**
733 * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
734 * @sb: superblock to shrink dentry LRU.
735 * @count: number of entries to prune
736 * @flags: flags to control the dentry processing
737 *
738 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
739 */
740static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
741{
742 /* called from prune_dcache() and shrink_dcache_parent() */
743 struct dentry *dentry;
744 LIST_HEAD(referenced);
745 LIST_HEAD(tmp);
746 int cnt = *count;
747
748relock:
749 spin_lock(&dcache_lru_lock);
750 while (!list_empty(&sb->s_dentry_lru)) {
751 dentry = list_entry(sb->s_dentry_lru.prev,
752 struct dentry, d_lru);
753 BUG_ON(dentry->d_sb != sb);
754
755 if (!spin_trylock(&dentry->d_lock)) {
756 spin_unlock(&dcache_lru_lock);
757 cpu_relax();
758 goto relock;
759 }
760
761 /*
762 * If we are honouring the DCACHE_REFERENCED flag and the
763 * dentry has this flag set, don't free it. Clear the flag
764 * and put it back on the LRU.
765 */
766 if (flags & DCACHE_REFERENCED &&
767 dentry->d_flags & DCACHE_REFERENCED) {
768 dentry->d_flags &= ~DCACHE_REFERENCED;
769 list_move(&dentry->d_lru, &referenced);
770 spin_unlock(&dentry->d_lock);
771 } else {
772 list_move_tail(&dentry->d_lru, &tmp);
773 spin_unlock(&dentry->d_lock);
774 if (!--cnt)
775 break;
776 }
777 cond_resched_lock(&dcache_lru_lock);
778 }
523 if (!list_empty(&referenced)) 779 if (!list_empty(&referenced))
524 list_splice(&referenced, &sb->s_dentry_lru); 780 list_splice(&referenced, &sb->s_dentry_lru);
525 spin_unlock(&dcache_lock); 781 spin_unlock(&dcache_lru_lock);
782
783 shrink_dentry_list(&tmp);
784
785 *count = cnt;
526} 786}
527 787
528/** 788/**
@@ -544,7 +804,6 @@ static void prune_dcache(int count)
544 804
545 if (unused == 0 || count == 0) 805 if (unused == 0 || count == 0)
546 return; 806 return;
547 spin_lock(&dcache_lock);
548 if (count >= unused) 807 if (count >= unused)
549 prune_ratio = 1; 808 prune_ratio = 1;
550 else 809 else
@@ -581,11 +840,9 @@ static void prune_dcache(int count)
581 if (down_read_trylock(&sb->s_umount)) { 840 if (down_read_trylock(&sb->s_umount)) {
582 if ((sb->s_root != NULL) && 841 if ((sb->s_root != NULL) &&
583 (!list_empty(&sb->s_dentry_lru))) { 842 (!list_empty(&sb->s_dentry_lru))) {
584 spin_unlock(&dcache_lock);
585 __shrink_dcache_sb(sb, &w_count, 843 __shrink_dcache_sb(sb, &w_count,
586 DCACHE_REFERENCED); 844 DCACHE_REFERENCED);
587 pruned -= w_count; 845 pruned -= w_count;
588 spin_lock(&dcache_lock);
589 } 846 }
590 up_read(&sb->s_umount); 847 up_read(&sb->s_umount);
591 } 848 }
@@ -601,20 +858,27 @@ static void prune_dcache(int count)
601 if (p) 858 if (p)
602 __put_super(p); 859 __put_super(p);
603 spin_unlock(&sb_lock); 860 spin_unlock(&sb_lock);
604 spin_unlock(&dcache_lock);
605} 861}
606 862
607/** 863/**
608 * shrink_dcache_sb - shrink dcache for a superblock 864 * shrink_dcache_sb - shrink dcache for a superblock
609 * @sb: superblock 865 * @sb: superblock
610 * 866 *
611 * Shrink the dcache for the specified super block. This 867 * Shrink the dcache for the specified super block. This is used to free
612 * is used to free the dcache before unmounting a file 868 * the dcache before unmounting a file system.
613 * system
614 */ 869 */
615void shrink_dcache_sb(struct super_block * sb) 870void shrink_dcache_sb(struct super_block *sb)
616{ 871{
617 __shrink_dcache_sb(sb, NULL, 0); 872 LIST_HEAD(tmp);
873
874 spin_lock(&dcache_lru_lock);
875 while (!list_empty(&sb->s_dentry_lru)) {
876 list_splice_init(&sb->s_dentry_lru, &tmp);
877 spin_unlock(&dcache_lru_lock);
878 shrink_dentry_list(&tmp);
879 spin_lock(&dcache_lru_lock);
880 }
881 spin_unlock(&dcache_lru_lock);
618} 882}
619EXPORT_SYMBOL(shrink_dcache_sb); 883EXPORT_SYMBOL(shrink_dcache_sb);
620 884
@@ -631,10 +895,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
631 BUG_ON(!IS_ROOT(dentry)); 895 BUG_ON(!IS_ROOT(dentry));
632 896
633 /* detach this root from the system */ 897 /* detach this root from the system */
634 spin_lock(&dcache_lock); 898 spin_lock(&dentry->d_lock);
635 dentry_lru_del_init(dentry); 899 dentry_lru_del(dentry);
636 __d_drop(dentry); 900 __d_drop(dentry);
637 spin_unlock(&dcache_lock); 901 spin_unlock(&dentry->d_lock);
638 902
639 for (;;) { 903 for (;;) {
640 /* descend to the first leaf in the current subtree */ 904 /* descend to the first leaf in the current subtree */
@@ -643,14 +907,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
643 907
644 /* this is a branch with children - detach all of them 908 /* this is a branch with children - detach all of them
645 * from the system in one go */ 909 * from the system in one go */
646 spin_lock(&dcache_lock); 910 spin_lock(&dentry->d_lock);
647 list_for_each_entry(loop, &dentry->d_subdirs, 911 list_for_each_entry(loop, &dentry->d_subdirs,
648 d_u.d_child) { 912 d_u.d_child) {
649 dentry_lru_del_init(loop); 913 spin_lock_nested(&loop->d_lock,
914 DENTRY_D_LOCK_NESTED);
915 dentry_lru_del(loop);
650 __d_drop(loop); 916 __d_drop(loop);
651 cond_resched_lock(&dcache_lock); 917 spin_unlock(&loop->d_lock);
652 } 918 }
653 spin_unlock(&dcache_lock); 919 spin_unlock(&dentry->d_lock);
654 920
655 /* move to the first child */ 921 /* move to the first child */
656 dentry = list_entry(dentry->d_subdirs.next, 922 dentry = list_entry(dentry->d_subdirs.next,
@@ -662,7 +928,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
662 do { 928 do {
663 struct inode *inode; 929 struct inode *inode;
664 930
665 if (atomic_read(&dentry->d_count) != 0) { 931 if (dentry->d_count != 0) {
666 printk(KERN_ERR 932 printk(KERN_ERR
667 "BUG: Dentry %p{i=%lx,n=%s}" 933 "BUG: Dentry %p{i=%lx,n=%s}"
668 " still in use (%d)" 934 " still in use (%d)"
@@ -671,20 +937,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
671 dentry->d_inode ? 937 dentry->d_inode ?
672 dentry->d_inode->i_ino : 0UL, 938 dentry->d_inode->i_ino : 0UL,
673 dentry->d_name.name, 939 dentry->d_name.name,
674 atomic_read(&dentry->d_count), 940 dentry->d_count,
675 dentry->d_sb->s_type->name, 941 dentry->d_sb->s_type->name,
676 dentry->d_sb->s_id); 942 dentry->d_sb->s_id);
677 BUG(); 943 BUG();
678 } 944 }
679 945
680 if (IS_ROOT(dentry)) 946 if (IS_ROOT(dentry)) {
681 parent = NULL; 947 parent = NULL;
682 else { 948 list_del(&dentry->d_u.d_child);
949 } else {
683 parent = dentry->d_parent; 950 parent = dentry->d_parent;
684 atomic_dec(&parent->d_count); 951 spin_lock(&parent->d_lock);
952 parent->d_count--;
953 list_del(&dentry->d_u.d_child);
954 spin_unlock(&parent->d_lock);
685 } 955 }
686 956
687 list_del(&dentry->d_u.d_child);
688 detached++; 957 detached++;
689 958
690 inode = dentry->d_inode; 959 inode = dentry->d_inode;
@@ -703,26 +972,18 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
703 * otherwise we ascend to the parent and move to the 972 * otherwise we ascend to the parent and move to the
704 * next sibling if there is one */ 973 * next sibling if there is one */
705 if (!parent) 974 if (!parent)
706 goto out; 975 return;
707
708 dentry = parent; 976 dentry = parent;
709
710 } while (list_empty(&dentry->d_subdirs)); 977 } while (list_empty(&dentry->d_subdirs));
711 978
712 dentry = list_entry(dentry->d_subdirs.next, 979 dentry = list_entry(dentry->d_subdirs.next,
713 struct dentry, d_u.d_child); 980 struct dentry, d_u.d_child);
714 } 981 }
715out:
716 /* several dentries were freed, need to correct nr_dentry */
717 spin_lock(&dcache_lock);
718 dentry_stat.nr_dentry -= detached;
719 spin_unlock(&dcache_lock);
720} 982}
721 983
722/* 984/*
723 * destroy the dentries attached to a superblock on unmounting 985 * destroy the dentries attached to a superblock on unmounting
724 * - we don't need to use dentry->d_lock, and only need dcache_lock when 986 * - we don't need dentry->d_lock for exclusion (nothing else can reach these dentries) because:
725 * removing the dentry from the system lists and hashes because:
726 * - the superblock is detached from all mountings and open files, so the 987 * - the superblock is detached from all mountings and open files, so the
727 * dentry trees will not be rearranged by the VFS 988 * dentry trees will not be rearranged by the VFS
728 * - s_umount is write-locked, so the memory pressure shrinker will ignore 989 * - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -739,11 +1000,13 @@ void shrink_dcache_for_umount(struct super_block *sb)
739 1000
740 dentry = sb->s_root; 1001 dentry = sb->s_root;
741 sb->s_root = NULL; 1002 sb->s_root = NULL;
742 atomic_dec(&dentry->d_count); 1003 spin_lock(&dentry->d_lock);
1004 dentry->d_count--;
1005 spin_unlock(&dentry->d_lock);
743 shrink_dcache_for_umount_subtree(dentry); 1006 shrink_dcache_for_umount_subtree(dentry);
744 1007
745 while (!hlist_empty(&sb->s_anon)) { 1008 while (!hlist_bl_empty(&sb->s_anon)) {
746 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash); 1009 dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
747 shrink_dcache_for_umount_subtree(dentry); 1010 shrink_dcache_for_umount_subtree(dentry);
748 } 1011 }
749} 1012}
@@ -761,15 +1024,20 @@ void shrink_dcache_for_umount(struct super_block *sb)
761 * Return true if the parent or its subdirectories contain 1024 * Return true if the parent or its subdirectories contain
762 * a mount point 1025 * a mount point
763 */ 1026 */
764
765int have_submounts(struct dentry *parent) 1027int have_submounts(struct dentry *parent)
766{ 1028{
767 struct dentry *this_parent = parent; 1029 struct dentry *this_parent;
768 struct list_head *next; 1030 struct list_head *next;
1031 unsigned seq;
1032 int locked = 0;
1033
1034 seq = read_seqbegin(&rename_lock);
1035again:
1036 this_parent = parent;
769 1037
770 spin_lock(&dcache_lock);
771 if (d_mountpoint(parent)) 1038 if (d_mountpoint(parent))
772 goto positive; 1039 goto positive;
1040 spin_lock(&this_parent->d_lock);
773repeat: 1041repeat:
774 next = this_parent->d_subdirs.next; 1042 next = this_parent->d_subdirs.next;
775resume: 1043resume:
@@ -777,27 +1045,65 @@ resume:
777 struct list_head *tmp = next; 1045 struct list_head *tmp = next;
778 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1046 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
779 next = tmp->next; 1047 next = tmp->next;
1048
1049 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
780 /* Have we found a mount point ? */ 1050 /* Have we found a mount point ? */
781 if (d_mountpoint(dentry)) 1051 if (d_mountpoint(dentry)) {
1052 spin_unlock(&dentry->d_lock);
1053 spin_unlock(&this_parent->d_lock);
782 goto positive; 1054 goto positive;
1055 }
783 if (!list_empty(&dentry->d_subdirs)) { 1056 if (!list_empty(&dentry->d_subdirs)) {
1057 spin_unlock(&this_parent->d_lock);
1058 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
784 this_parent = dentry; 1059 this_parent = dentry;
1060 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
785 goto repeat; 1061 goto repeat;
786 } 1062 }
1063 spin_unlock(&dentry->d_lock);
787 } 1064 }
788 /* 1065 /*
789 * All done at this level ... ascend and resume the search. 1066 * All done at this level ... ascend and resume the search.
790 */ 1067 */
791 if (this_parent != parent) { 1068 if (this_parent != parent) {
792 next = this_parent->d_u.d_child.next; 1069 struct dentry *tmp;
793 this_parent = this_parent->d_parent; 1070 struct dentry *child;
1071
1072 tmp = this_parent->d_parent;
1073 rcu_read_lock();
1074 spin_unlock(&this_parent->d_lock);
1075 child = this_parent;
1076 this_parent = tmp;
1077 spin_lock(&this_parent->d_lock);
1078 /* might go back up the wrong parent if we have had a rename
1079 * or deletion */
1080 if (this_parent != child->d_parent ||
1081 (!locked && read_seqretry(&rename_lock, seq))) {
1082 spin_unlock(&this_parent->d_lock);
1083 rcu_read_unlock();
1084 goto rename_retry;
1085 }
1086 rcu_read_unlock();
1087 next = child->d_u.d_child.next;
794 goto resume; 1088 goto resume;
795 } 1089 }
796 spin_unlock(&dcache_lock); 1090 spin_unlock(&this_parent->d_lock);
1091 if (!locked && read_seqretry(&rename_lock, seq))
1092 goto rename_retry;
1093 if (locked)
1094 write_sequnlock(&rename_lock);
797 return 0; /* No mount points found in tree */ 1095 return 0; /* No mount points found in tree */
798positive: 1096positive:
799 spin_unlock(&dcache_lock); 1097 if (!locked && read_seqretry(&rename_lock, seq))
1098 goto rename_retry;
1099 if (locked)
1100 write_sequnlock(&rename_lock);
800 return 1; 1101 return 1;
1102
1103rename_retry:
1104 locked = 1;
1105 write_seqlock(&rename_lock);
1106 goto again;
801} 1107}
802EXPORT_SYMBOL(have_submounts); 1108EXPORT_SYMBOL(have_submounts);
803 1109
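have_submounts() above (and select_parent() further down) guard their tree walks against concurrent renames with the same two-phase scheme: walk under a rename_lock read sequence first, and only if the sequence changed redo the walk holding rename_lock for write, which excludes renames outright. The skeleton, with walk_tree() as a hypothetical stand-in for the traversal body:

        #include <linux/seqlock.h>

        static int walk_with_rename_retry(struct dentry *parent)
        {
                unsigned seq;
                int locked = 0;
                int ret;

                seq = read_seqbegin(&rename_lock);
        again:
                ret = walk_tree(parent);        /* hypothetical traversal */
                if (!locked && read_seqretry(&rename_lock, seq)) {
                        /* a rename raced with the lockless walk: redo it
                         * under the write lock, which cannot be disturbed */
                        locked = 1;
                        write_seqlock(&rename_lock);
                        goto again;
                }
                if (locked)
                        write_sequnlock(&rename_lock);
                return ret;
        }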
@@ -817,11 +1123,16 @@ EXPORT_SYMBOL(have_submounts);
817 */ 1123 */
818static int select_parent(struct dentry * parent) 1124static int select_parent(struct dentry * parent)
819{ 1125{
820 struct dentry *this_parent = parent; 1126 struct dentry *this_parent;
821 struct list_head *next; 1127 struct list_head *next;
1128 unsigned seq;
822 int found = 0; 1129 int found = 0;
1130 int locked = 0;
823 1131
824 spin_lock(&dcache_lock); 1132 seq = read_seqbegin(&rename_lock);
1133again:
1134 this_parent = parent;
1135 spin_lock(&this_parent->d_lock);
825repeat: 1136repeat:
826 next = this_parent->d_subdirs.next; 1137 next = this_parent->d_subdirs.next;
827resume: 1138resume:
@@ -830,14 +1141,17 @@ resume:
830 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1141 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
831 next = tmp->next; 1142 next = tmp->next;
832 1143
833 dentry_lru_del_init(dentry); 1144 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1145
834 /* 1146 /*
835 * move only zero ref count dentries to the end 1147 * move only zero ref count dentries to the end
836 * of the unused list for prune_dcache 1148 * of the unused list for prune_dcache
837 */ 1149 */
838 if (!atomic_read(&dentry->d_count)) { 1150 if (!dentry->d_count) {
839 dentry_lru_add_tail(dentry); 1151 dentry_lru_move_tail(dentry);
840 found++; 1152 found++;
1153 } else {
1154 dentry_lru_del(dentry);
841 } 1155 }
842 1156
843 /* 1157 /*
@@ -845,28 +1159,63 @@ resume:
845 * ensures forward progress). We'll be coming back to find 1159 * ensures forward progress). We'll be coming back to find
846 * the rest. 1160 * the rest.
847 */ 1161 */
848 if (found && need_resched()) 1162 if (found && need_resched()) {
1163 spin_unlock(&dentry->d_lock);
849 goto out; 1164 goto out;
1165 }
850 1166
851 /* 1167 /*
852 * Descend a level if the d_subdirs list is non-empty. 1168 * Descend a level if the d_subdirs list is non-empty.
853 */ 1169 */
854 if (!list_empty(&dentry->d_subdirs)) { 1170 if (!list_empty(&dentry->d_subdirs)) {
1171 spin_unlock(&this_parent->d_lock);
1172 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
855 this_parent = dentry; 1173 this_parent = dentry;
1174 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
856 goto repeat; 1175 goto repeat;
857 } 1176 }
1177
1178 spin_unlock(&dentry->d_lock);
858 } 1179 }
859 /* 1180 /*
860 * All done at this level ... ascend and resume the search. 1181 * All done at this level ... ascend and resume the search.
861 */ 1182 */
862 if (this_parent != parent) { 1183 if (this_parent != parent) {
863 next = this_parent->d_u.d_child.next; 1184 struct dentry *tmp;
864 this_parent = this_parent->d_parent; 1185 struct dentry *child;
1186
1187 tmp = this_parent->d_parent;
1188 rcu_read_lock();
1189 spin_unlock(&this_parent->d_lock);
1190 child = this_parent;
1191 this_parent = tmp;
1192 spin_lock(&this_parent->d_lock);
1193 /* might go back up the wrong parent if we have had a rename
1194 * or deletion */
1195 if (this_parent != child->d_parent ||
1196 (!locked && read_seqretry(&rename_lock, seq))) {
1197 spin_unlock(&this_parent->d_lock);
1198 rcu_read_unlock();
1199 goto rename_retry;
1200 }
1201 rcu_read_unlock();
1202 next = child->d_u.d_child.next;
865 goto resume; 1203 goto resume;
866 } 1204 }
867out: 1205out:
868 spin_unlock(&dcache_lock); 1206 spin_unlock(&this_parent->d_lock);
1207 if (!locked && read_seqretry(&rename_lock, seq))
1208 goto rename_retry;
1209 if (locked)
1210 write_sequnlock(&rename_lock);
869 return found; 1211 return found;
1212
1213rename_retry:
1214 if (found)
1215 return found;
1216 locked = 1;
1217 write_seqlock(&rename_lock);
1218 goto again;
870} 1219}
871 1220
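Ascending out of a subtree in these walkers needs a lock handoff from child to parent without ever holding both in child-before-parent order, which would invert the lock hierarchy. RCU pins the parent across the unlocked window, and d_parent is re-checked afterwards. Roughly, assuming child->d_lock is held on entry (sketch only):

        #include <linux/dcache.h>
        #include <linux/rcupdate.h>

        static struct dentry *ascend_one_level(struct dentry *child)
        {
                struct dentry *parent = child->d_parent;

                rcu_read_lock();                /* keep parent's memory alive */
                spin_unlock(&child->d_lock);
                spin_lock(&parent->d_lock);
                if (parent != child->d_parent) {
                        /* raced with rename/deletion: caller restarts the walk */
                        spin_unlock(&parent->d_lock);
                        rcu_read_unlock();
                        return NULL;
                }
                rcu_read_unlock();
                return parent;
        }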
872/** 1221/**
@@ -905,6 +1254,7 @@ static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
905 return -1; 1254 return -1;
906 prune_dcache(nr); 1255 prune_dcache(nr);
907 } 1256 }
1257
908 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 1258 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
909} 1259}
910 1260
@@ -948,37 +1298,54 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
948 memcpy(dname, name->name, name->len); 1298 memcpy(dname, name->name, name->len);
949 dname[name->len] = 0; 1299 dname[name->len] = 0;
950 1300
951 atomic_set(&dentry->d_count, 1); 1301 dentry->d_count = 1;
952 dentry->d_flags = DCACHE_UNHASHED; 1302 dentry->d_flags = DCACHE_UNHASHED;
953 spin_lock_init(&dentry->d_lock); 1303 spin_lock_init(&dentry->d_lock);
1304 seqcount_init(&dentry->d_seq);
954 dentry->d_inode = NULL; 1305 dentry->d_inode = NULL;
955 dentry->d_parent = NULL; 1306 dentry->d_parent = NULL;
956 dentry->d_sb = NULL; 1307 dentry->d_sb = NULL;
957 dentry->d_op = NULL; 1308 dentry->d_op = NULL;
958 dentry->d_fsdata = NULL; 1309 dentry->d_fsdata = NULL;
959 dentry->d_mounted = 0; 1310 INIT_HLIST_BL_NODE(&dentry->d_hash);
960 INIT_HLIST_NODE(&dentry->d_hash);
961 INIT_LIST_HEAD(&dentry->d_lru); 1311 INIT_LIST_HEAD(&dentry->d_lru);
962 INIT_LIST_HEAD(&dentry->d_subdirs); 1312 INIT_LIST_HEAD(&dentry->d_subdirs);
963 INIT_LIST_HEAD(&dentry->d_alias); 1313 INIT_LIST_HEAD(&dentry->d_alias);
1314 INIT_LIST_HEAD(&dentry->d_u.d_child);
964 1315
965 if (parent) { 1316 if (parent) {
966 dentry->d_parent = dget(parent); 1317 spin_lock(&parent->d_lock);
1318 /*
1319 * don't need child lock because it is not subject
1320 * to concurrency here
1321 */
1322 __dget_dlock(parent);
1323 dentry->d_parent = parent;
967 dentry->d_sb = parent->d_sb; 1324 dentry->d_sb = parent->d_sb;
968 } else { 1325 d_set_d_op(dentry, dentry->d_sb->s_d_op);
969 INIT_LIST_HEAD(&dentry->d_u.d_child); 1326 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
1327 spin_unlock(&parent->d_lock);
970 } 1328 }
971 1329
972 spin_lock(&dcache_lock); 1330 this_cpu_inc(nr_dentry);
973 if (parent)
974 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
975 dentry_stat.nr_dentry++;
976 spin_unlock(&dcache_lock);
977 1331
978 return dentry; 1332 return dentry;
979} 1333}
980EXPORT_SYMBOL(d_alloc); 1334EXPORT_SYMBOL(d_alloc);
981 1335
1336struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
1337{
1338 struct dentry *dentry = d_alloc(NULL, name);
1339 if (dentry) {
1340 dentry->d_sb = sb;
1341 d_set_d_op(dentry, dentry->d_sb->s_d_op);
1342 dentry->d_parent = dentry;
1343 dentry->d_flags |= DCACHE_DISCONNECTED;
1344 }
1345 return dentry;
1346}
1347EXPORT_SYMBOL(d_alloc_pseudo);
1348
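Note the accounting change in d_alloc(): dentry_stat.nr_dentry under dcache_lock becomes this_cpu_inc(nr_dentry), a per-cpu counter with no shared cacheline. Readers sum the per-cpu values and tolerate the slight skew; the sum can transiently go negative when frees on one CPU outpace the allocations it observed elsewhere, hence the clamp. A sketch of the read side (this series adds an equivalent get_nr_dentry() helper earlier in the file):

        #include <linux/percpu.h>

        static DEFINE_PER_CPU(unsigned int, nr_dentry);

        static int get_nr_dentry(void)
        {
                int i;
                int sum = 0;

                for_each_possible_cpu(i)
                        sum += per_cpu(nr_dentry, i);
                return sum < 0 ? 0 : sum;
        }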
982struct dentry *d_alloc_name(struct dentry *parent, const char *name) 1349struct dentry *d_alloc_name(struct dentry *parent, const char *name)
983{ 1350{
984 struct qstr q; 1351 struct qstr q;
@@ -990,12 +1357,39 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
990} 1357}
991EXPORT_SYMBOL(d_alloc_name); 1358EXPORT_SYMBOL(d_alloc_name);
992 1359
993/* the caller must hold dcache_lock */ 1360void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
1361{
1362 WARN_ON_ONCE(dentry->d_op);
1363 WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
1364 DCACHE_OP_COMPARE |
1365 DCACHE_OP_REVALIDATE |
1366 DCACHE_OP_DELETE));
1367 dentry->d_op = op;
1368 if (!op)
1369 return;
1370 if (op->d_hash)
1371 dentry->d_flags |= DCACHE_OP_HASH;
1372 if (op->d_compare)
1373 dentry->d_flags |= DCACHE_OP_COMPARE;
1374 if (op->d_revalidate)
1375 dentry->d_flags |= DCACHE_OP_REVALIDATE;
1376 if (op->d_delete)
1377 dentry->d_flags |= DCACHE_OP_DELETE;
1378
1379}
1380EXPORT_SYMBOL(d_set_d_op);
1381
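d_set_d_op() mirrors the presence of each method into a DCACHE_OP_* bit so the hot paths (__d_lookup_rcu, __d_lookup, d_hash_and_lookup below) test d_flags, which they load anyway, instead of chasing d_op pointers. Filesystems set sb->s_d_op once at mount and d_alloc() propagates it. A hypothetical filesystem wires this up roughly as follows, using the d_revalidate signature of this kernel:

        #include <linux/dcache.h>
        #include <linux/errno.h>
        #include <linux/fs.h>
        #include <linux/namei.h>

        static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
        {
                if (nd && (nd->flags & LOOKUP_RCU))
                        return -ECHILD;         /* can't block in rcu-walk: drop out */
                return 1;                       /* sketch: dentry always valid */
        }

        static const struct dentry_operations example_dops = {
                .d_revalidate   = example_d_revalidate,
        };

        static int example_fill_super(struct super_block *sb)
        {
                sb->s_d_op = &example_dops;     /* inherited by every d_alloc() */
                /* ... */
                return 0;
        }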
994static void __d_instantiate(struct dentry *dentry, struct inode *inode) 1382static void __d_instantiate(struct dentry *dentry, struct inode *inode)
995{ 1383{
996 if (inode) 1384 spin_lock(&dentry->d_lock);
1385 if (inode) {
1386 if (unlikely(IS_AUTOMOUNT(inode)))
1387 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
997 list_add(&dentry->d_alias, &inode->i_dentry); 1388 list_add(&dentry->d_alias, &inode->i_dentry);
1389 }
998 dentry->d_inode = inode; 1390 dentry->d_inode = inode;
1391 dentry_rcuwalk_barrier(dentry);
1392 spin_unlock(&dentry->d_lock);
999 fsnotify_d_instantiate(dentry, inode); 1393 fsnotify_d_instantiate(dentry, inode);
1000} 1394}
1001 1395
@@ -1017,9 +1411,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1017void d_instantiate(struct dentry *entry, struct inode * inode) 1411void d_instantiate(struct dentry *entry, struct inode * inode)
1018{ 1412{
1019 BUG_ON(!list_empty(&entry->d_alias)); 1413 BUG_ON(!list_empty(&entry->d_alias));
1020 spin_lock(&dcache_lock); 1414 if (inode)
1415 spin_lock(&inode->i_lock);
1021 __d_instantiate(entry, inode); 1416 __d_instantiate(entry, inode);
1022 spin_unlock(&dcache_lock); 1417 if (inode)
1418 spin_unlock(&inode->i_lock);
1023 security_d_instantiate(entry, inode); 1419 security_d_instantiate(entry, inode);
1024} 1420}
1025EXPORT_SYMBOL(d_instantiate); 1421EXPORT_SYMBOL(d_instantiate);
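With dcache_lock gone, inode->i_lock is what protects the inode's alias list (i_dentry), which is why d_instantiate() and the other instantiation paths in this patch now bracket __d_instantiate() with it. In the new hierarchy i_lock nests outside d_lock (d_delete() below must trylock i_lock for exactly this reason), so taking a reference on an alias from under i_lock is safe. A hypothetical helper following that rule:

        static struct dentry *get_first_alias(struct inode *inode)
        {
                struct dentry *alias = NULL;

                spin_lock(&inode->i_lock);
                if (!list_empty(&inode->i_dentry)) {
                        alias = list_first_entry(&inode->i_dentry,
                                                 struct dentry, d_alias);
                        dget(alias);    /* takes d_lock, nested inside i_lock */
                }
                spin_unlock(&inode->i_lock);
                return alias;
        }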
@@ -1056,15 +1452,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
1056 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 1452 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
1057 struct qstr *qstr = &alias->d_name; 1453 struct qstr *qstr = &alias->d_name;
1058 1454
1455 /*
1456 * Don't need alias->d_lock here, because aliases with
1457 * d_parent == entry->d_parent are not subject to name or
1458 * parent changes, because the parent inode i_mutex is held.
1459 */
1059 if (qstr->hash != hash) 1460 if (qstr->hash != hash)
1060 continue; 1461 continue;
1061 if (alias->d_parent != entry->d_parent) 1462 if (alias->d_parent != entry->d_parent)
1062 continue; 1463 continue;
1063 if (qstr->len != len) 1464 if (dentry_cmp(qstr->name, qstr->len, name, len))
1064 continue; 1465 continue;
1065 if (memcmp(qstr->name, name, len)) 1466 __dget(alias);
1066 continue;
1067 dget_locked(alias);
1068 return alias; 1467 return alias;
1069 } 1468 }
1070 1469
@@ -1078,9 +1477,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
1078 1477
1079 BUG_ON(!list_empty(&entry->d_alias)); 1478 BUG_ON(!list_empty(&entry->d_alias));
1080 1479
1081 spin_lock(&dcache_lock); 1480 if (inode)
1481 spin_lock(&inode->i_lock);
1082 result = __d_instantiate_unique(entry, inode); 1482 result = __d_instantiate_unique(entry, inode);
1083 spin_unlock(&dcache_lock); 1483 if (inode)
1484 spin_unlock(&inode->i_lock);
1084 1485
1085 if (!result) { 1486 if (!result) {
1086 security_d_instantiate(entry, inode); 1487 security_d_instantiate(entry, inode);
@@ -1113,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1113 res = d_alloc(NULL, &name); 1514 res = d_alloc(NULL, &name);
1114 if (res) { 1515 if (res) {
1115 res->d_sb = root_inode->i_sb; 1516 res->d_sb = root_inode->i_sb;
1517 d_set_d_op(res, res->d_sb->s_d_op);
1116 res->d_parent = res; 1518 res->d_parent = res;
1117 d_instantiate(res, root_inode); 1519 d_instantiate(res, root_inode);
1118 } 1520 }
@@ -1121,14 +1523,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1121} 1523}
1122EXPORT_SYMBOL(d_alloc_root); 1524EXPORT_SYMBOL(d_alloc_root);
1123 1525
1124static inline struct hlist_head *d_hash(struct dentry *parent,
1125 unsigned long hash)
1126{
1127 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
1128 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
1129 return dentry_hashtable + (hash & D_HASHMASK);
1130}
1131
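The open-coded d_hash() deleted here is not gone: earlier in this series it is reintroduced returning a dcache_hash_bucket, an hlist_bl whose head pointer doubles as a per-bucket bit spinlock. This is the spin_lock_bucket()/spin_unlock_bucket() pair used by __d_rehash() below, and the b->head that __d_lookup and __d_lookup_rcu walk. Reconstructed from the rest of the series, the replacement is approximately:

        #include <linux/bit_spinlock.h>
        #include <linux/list_bl.h>

        struct dcache_hash_bucket {
                struct hlist_bl_head head;
        };

        static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
                                                        unsigned long hash)
        {
                hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
                hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
                return dentry_hashtable + (hash & D_HASHMASK);
        }

        static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
        {
                bit_spin_lock(0, (unsigned long *)&b->head.first);
        }

        static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
        {
                __bit_spin_unlock(0, (unsigned long *)&b->head.first);
        }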
1132/** 1526/**
1133 * d_obtain_alias - find or allocate a dentry for a given inode 1527 * d_obtain_alias - find or allocate a dentry for a given inode
1134 * @inode: inode to allocate the dentry for 1528 * @inode: inode to allocate the dentry for
@@ -1169,10 +1563,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
1169 } 1563 }
1170 tmp->d_parent = tmp; /* make sure dput doesn't croak */ 1564 tmp->d_parent = tmp; /* make sure dput doesn't croak */
1171 1565
1172 spin_lock(&dcache_lock); 1566
1567 spin_lock(&inode->i_lock);
1173 res = __d_find_alias(inode, 0); 1568 res = __d_find_alias(inode, 0);
1174 if (res) { 1569 if (res) {
1175 spin_unlock(&dcache_lock); 1570 spin_unlock(&inode->i_lock);
1176 dput(tmp); 1571 dput(tmp);
1177 goto out_iput; 1572 goto out_iput;
1178 } 1573 }
@@ -1180,14 +1575,17 @@ struct dentry *d_obtain_alias(struct inode *inode)
1180 /* attach a disconnected dentry */ 1575 /* attach a disconnected dentry */
1181 spin_lock(&tmp->d_lock); 1576 spin_lock(&tmp->d_lock);
1182 tmp->d_sb = inode->i_sb; 1577 tmp->d_sb = inode->i_sb;
1578 d_set_d_op(tmp, tmp->d_sb->s_d_op);
1183 tmp->d_inode = inode; 1579 tmp->d_inode = inode;
1184 tmp->d_flags |= DCACHE_DISCONNECTED; 1580 tmp->d_flags |= DCACHE_DISCONNECTED;
1185 tmp->d_flags &= ~DCACHE_UNHASHED;
1186 list_add(&tmp->d_alias, &inode->i_dentry); 1581 list_add(&tmp->d_alias, &inode->i_dentry);
1187 hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); 1582 bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1583 tmp->d_flags &= ~DCACHE_UNHASHED;
1584 hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
1585 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1188 spin_unlock(&tmp->d_lock); 1586 spin_unlock(&tmp->d_lock);
1587 spin_unlock(&inode->i_lock);
1189 1588
1190 spin_unlock(&dcache_lock);
1191 return tmp; 1589 return tmp;
1192 1590
1193 out_iput: 1591 out_iput:
@@ -1217,18 +1615,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1217 struct dentry *new = NULL; 1615 struct dentry *new = NULL;
1218 1616
1219 if (inode && S_ISDIR(inode->i_mode)) { 1617 if (inode && S_ISDIR(inode->i_mode)) {
1220 spin_lock(&dcache_lock); 1618 spin_lock(&inode->i_lock);
1221 new = __d_find_alias(inode, 1); 1619 new = __d_find_alias(inode, 1);
1222 if (new) { 1620 if (new) {
1223 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1621 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1224 spin_unlock(&dcache_lock); 1622 spin_unlock(&inode->i_lock);
1225 security_d_instantiate(new, inode); 1623 security_d_instantiate(new, inode);
1226 d_move(new, dentry); 1624 d_move(new, dentry);
1227 iput(inode); 1625 iput(inode);
1228 } else { 1626 } else {
1229 /* already taking dcache_lock, so d_add() by hand */ 1627 /* already taking inode->i_lock, so d_add() by hand */
1230 __d_instantiate(dentry, inode); 1628 __d_instantiate(dentry, inode);
1231 spin_unlock(&dcache_lock); 1629 spin_unlock(&inode->i_lock);
1232 security_d_instantiate(dentry, inode); 1630 security_d_instantiate(dentry, inode);
1233 d_rehash(dentry); 1631 d_rehash(dentry);
1234 } 1632 }
@@ -1301,10 +1699,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1301 * Negative dentry: instantiate it unless the inode is a directory and 1699 * Negative dentry: instantiate it unless the inode is a directory and
1302 * already has a dentry. 1700 * already has a dentry.
1303 */ 1701 */
1304 spin_lock(&dcache_lock); 1702 spin_lock(&inode->i_lock);
1305 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { 1703 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
1306 __d_instantiate(found, inode); 1704 __d_instantiate(found, inode);
1307 spin_unlock(&dcache_lock); 1705 spin_unlock(&inode->i_lock);
1308 security_d_instantiate(found, inode); 1706 security_d_instantiate(found, inode);
1309 return found; 1707 return found;
1310 } 1708 }
@@ -1314,8 +1712,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1314 * reference to it, move it in place and use it. 1712 * reference to it, move it in place and use it.
1315 */ 1713 */
1316 new = list_entry(inode->i_dentry.next, struct dentry, d_alias); 1714 new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1317 dget_locked(new); 1715 __dget(new);
1318 spin_unlock(&dcache_lock); 1716 spin_unlock(&inode->i_lock);
1319 security_d_instantiate(found, inode); 1717 security_d_instantiate(found, inode);
1320 d_move(new, found); 1718 d_move(new, found);
1321 iput(inode); 1719 iput(inode);
@@ -1329,6 +1727,112 @@ err_out:
1329EXPORT_SYMBOL(d_add_ci); 1727EXPORT_SYMBOL(d_add_ci);
1330 1728
1331/** 1729/**
1730 * __d_lookup_rcu - search for a dentry (racy, store-free)
1731 * @parent: parent dentry
1732 * @name: qstr of name we wish to find
1733 * @seq: returns d_seq value at the point where the dentry was found
1734 * @inode: returns dentry->d_inode when the inode was found valid.
1735 * Returns: dentry, or NULL
1736 *
1737 * __d_lookup_rcu is the dcache lookup function for the rcu-walk name
1738 * resolution (store-free path walking) design described in
1739 * Documentation/filesystems/path-lookup.txt.
1740 *
1741 * This is not to be used outside core vfs.
1742 *
1743 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1744 * held, and rcu_read_lock held. The returned dentry must not be stored
1745 * (e.g. into a struct path) without taking d_lock and checking the d_seq
1746 * sequence count against @seq returned here.
1747 *
1748 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1749 * function.
1750 *
1751 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1752 * the returned dentry, so long as its parent's seqlock is checked after the
1753 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1754 * is formed, giving integrity down the path walk.
1755 */
1756struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1757 unsigned *seq, struct inode **inode)
1758{
1759 unsigned int len = name->len;
1760 unsigned int hash = name->hash;
1761 const unsigned char *str = name->name;
1762 struct dcache_hash_bucket *b = d_hash(parent, hash);
1763 struct hlist_bl_node *node;
1764 struct dentry *dentry;
1765
1766 /*
1767 * Note: There is significant duplication with __d_lookup which is
1768 * required to prevent single threaded performance regressions
1769 * especially on architectures where smp_rmb (in seqcounts) are costly.
1770 * Keep the two functions in sync.
1771 */
1772
1773 /*
1774 * The hash list is protected using RCU.
1775 *
1776 * Carefully use d_seq when comparing a candidate dentry, to avoid
1777 * races with d_move().
1778 *
1779 * It is possible that concurrent renames can mess up our list
1780 * walk here and cause us to miss our dentry, giving a
1781 * false-negative result. d_lookup() protects against concurrent
1782 * renames using rename_lock seqlock.
1783 *
1784 * See Documentation/filesystems/path-lookup.txt for more details.
1785 */
1786 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1787 struct inode *i;
1788 const char *tname;
1789 int tlen;
1790
1791 if (dentry->d_name.hash != hash)
1792 continue;
1793
1794seqretry:
1795 *seq = read_seqcount_begin(&dentry->d_seq);
1796 if (dentry->d_parent != parent)
1797 continue;
1798 if (d_unhashed(dentry))
1799 continue;
1800 tlen = dentry->d_name.len;
1801 tname = dentry->d_name.name;
1802 i = dentry->d_inode;
1803 prefetch(tname);
1804 if (i)
1805 prefetch(i);
1806 /*
1807 * This seqcount check is required to ensure name and
1808 * len are loaded atomically, so as not to walk off the
1809 * edge of memory when walking. If we could load this
1810 * atomically some other way, we could drop this check.
1811 */
1812 if (read_seqcount_retry(&dentry->d_seq, *seq))
1813 goto seqretry;
1814 if (parent->d_flags & DCACHE_OP_COMPARE) {
1815 if (parent->d_op->d_compare(parent, *inode,
1816 dentry, i,
1817 tlen, tname, name))
1818 continue;
1819 } else {
1820 if (dentry_cmp(tname, tlen, str, len))
1821 continue;
1822 }
1823 /*
1824 * No extra seqcount check is required after the name
1825 * compare. The caller must perform a seqcount check in
1826 * order to do anything useful with the returned dentry
1827 * anyway.
1828 */
1829 *inode = i;
1830 return dentry;
1831 }
1832 return NULL;
1833}
1834
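The doc comment above points at __d_rcu_to_refcount() as the way to turn a lookup result into a real reference. Schematically, a caller validates d_seq under d_lock before trusting anything it read, and falls back to the locked d_lookup() path on failure. A sketch with the helper's logic inlined:

        static struct dentry *lookup_and_ref(struct dentry *parent, struct qstr *name)
        {
                struct inode *inode = NULL;
                struct dentry *dentry;
                unsigned seq;

                rcu_read_lock();
                dentry = __d_lookup_rcu(parent, name, &seq, &inode);
                if (dentry) {
                        spin_lock(&dentry->d_lock);
                        if (read_seqcount_retry(&dentry->d_seq, seq)) {
                                /* raced with rename: caller retries via d_lookup() */
                                spin_unlock(&dentry->d_lock);
                                dentry = NULL;
                        } else {
                                dentry->d_count++;      /* now a real reference */
                                spin_unlock(&dentry->d_lock);
                        }
                }
                rcu_read_unlock();
                return dentry;
        }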
1835/**
1332 * d_lookup - search for a dentry 1836 * d_lookup - search for a dentry
1333 * @parent: parent dentry 1837 * @parent: parent dentry
1334 * @name: qstr of name we wish to find 1838 * @name: qstr of name we wish to find
@@ -1339,10 +1843,10 @@ EXPORT_SYMBOL(d_add_ci);
1339 * dentry is returned. The caller must use dput to free the entry when it has 1843 * dentry is returned. The caller must use dput to free the entry when it has
1340 * finished using it. %NULL is returned if the dentry does not exist. 1844 * finished using it. %NULL is returned if the dentry does not exist.
1341 */ 1845 */
1342struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1846struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1343{ 1847{
1344 struct dentry * dentry = NULL; 1848 struct dentry *dentry;
1345 unsigned long seq; 1849 unsigned seq;
1346 1850
1347 do { 1851 do {
1348 seq = read_seqbegin(&rename_lock); 1852 seq = read_seqbegin(&rename_lock);
@@ -1354,7 +1858,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1354} 1858}
1355EXPORT_SYMBOL(d_lookup); 1859EXPORT_SYMBOL(d_lookup);
1356 1860
1357/* 1861/**
1358 * __d_lookup - search for a dentry (racy) 1862 * __d_lookup - search for a dentry (racy)
1359 * @parent: parent dentry 1863 * @parent: parent dentry
1360 * @name: qstr of name we wish to find 1864 * @name: qstr of name we wish to find
@@ -1369,17 +1873,24 @@ EXPORT_SYMBOL(d_lookup);
1369 * 1873 *
1370 * __d_lookup callers must be commented. 1874 * __d_lookup callers must be commented.
1371 */ 1875 */
1372struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1876struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1373{ 1877{
1374 unsigned int len = name->len; 1878 unsigned int len = name->len;
1375 unsigned int hash = name->hash; 1879 unsigned int hash = name->hash;
1376 const unsigned char *str = name->name; 1880 const unsigned char *str = name->name;
1377 struct hlist_head *head = d_hash(parent,hash); 1881 struct dcache_hash_bucket *b = d_hash(parent, hash);
1882 struct hlist_bl_node *node;
1378 struct dentry *found = NULL; 1883 struct dentry *found = NULL;
1379 struct hlist_node *node;
1380 struct dentry *dentry; 1884 struct dentry *dentry;
1381 1885
1382 /* 1886 /*
1887 * Note: There is significant duplication with __d_lookup_rcu which is
1888 * required to prevent single threaded performance regressions
1889 * especially on architectures where smp_rmb (in seqcounts) are costly.
1890 * Keep the two functions in sync.
1891 */
1892
1893 /*
1383 * The hash list is protected using RCU. 1894 * The hash list is protected using RCU.
1384 * 1895 *
1385 * Take d_lock when comparing a candidate dentry, to avoid races 1896 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1394,25 +1905,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1394 */ 1905 */
1395 rcu_read_lock(); 1906 rcu_read_lock();
1396 1907
1397 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1908 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1398 struct qstr *qstr; 1909 const char *tname;
1910 int tlen;
1399 1911
1400 if (dentry->d_name.hash != hash) 1912 if (dentry->d_name.hash != hash)
1401 continue; 1913 continue;
1402 if (dentry->d_parent != parent)
1403 continue;
1404 1914
1405 spin_lock(&dentry->d_lock); 1915 spin_lock(&dentry->d_lock);
1406
1407 /*
1408 * Recheck the dentry after taking the lock - d_move may have
1409 * changed things. Don't bother checking the hash because
1410 * we're about to compare the whole name anyway.
1411 */
1412 if (dentry->d_parent != parent) 1916 if (dentry->d_parent != parent)
1413 goto next; 1917 goto next;
1414
1415 /* non-existing due to RCU? */
1416 if (d_unhashed(dentry)) 1918 if (d_unhashed(dentry))
1417 goto next; 1919 goto next;
1418 1920
@@ -1420,18 +1922,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1420 * It is safe to compare names since d_move() cannot 1922 * It is safe to compare names since d_move() cannot
1421 * change the qstr (protected by d_lock). 1923 * change the qstr (protected by d_lock).
1422 */ 1924 */
1423 qstr = &dentry->d_name; 1925 tlen = dentry->d_name.len;
1424 if (parent->d_op && parent->d_op->d_compare) { 1926 tname = dentry->d_name.name;
1425 if (parent->d_op->d_compare(parent, qstr, name)) 1927 if (parent->d_flags & DCACHE_OP_COMPARE) {
1928 if (parent->d_op->d_compare(parent, parent->d_inode,
1929 dentry, dentry->d_inode,
1930 tlen, tname, name))
1426 goto next; 1931 goto next;
1427 } else { 1932 } else {
1428 if (qstr->len != len) 1933 if (dentry_cmp(tname, tlen, str, len))
1429 goto next;
1430 if (memcmp(qstr->name, str, len))
1431 goto next; 1934 goto next;
1432 } 1935 }
1433 1936
1434 atomic_inc(&dentry->d_count); 1937 dentry->d_count++;
1435 found = dentry; 1938 found = dentry;
1436 spin_unlock(&dentry->d_lock); 1939 spin_unlock(&dentry->d_lock);
1437 break; 1940 break;
@@ -1460,8 +1963,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
1460 * routine may choose to leave the hash value unchanged. 1963 * routine may choose to leave the hash value unchanged.
1461 */ 1964 */
1462 name->hash = full_name_hash(name->name, name->len); 1965 name->hash = full_name_hash(name->name, name->len);
1463 if (dir->d_op && dir->d_op->d_hash) { 1966 if (dir->d_flags & DCACHE_OP_HASH) {
1464 if (dir->d_op->d_hash(dir, name) < 0) 1967 if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
1465 goto out; 1968 goto out;
1466 } 1969 }
1467 dentry = d_lookup(dir, name); 1970 dentry = d_lookup(dir, name);
@@ -1470,41 +1973,32 @@ out:
1470} 1973}
1471 1974
1472/** 1975/**
1473 * d_validate - verify dentry provided from insecure source 1976 * d_validate - verify dentry provided from insecure source (deprecated)
1474 * @dentry: The dentry alleged to be valid child of @dparent 1977 * @dentry: The dentry alleged to be valid child of @dparent
1475 * @dparent: The parent dentry (known to be valid) 1978 * @dparent: The parent dentry (known to be valid)
1476 * 1979 *
1477 * An insecure source has sent us a dentry, here we verify it and dget() it. 1980 * An insecure source has sent us a dentry, here we verify it and dget() it.
1478 * This is used by ncpfs in its readdir implementation. 1981 * This is used by ncpfs in its readdir implementation.
1479 * Zero is returned if the dentry is invalid. 1982 * Zero is returned if the dentry is invalid.
1983 *
1984 * This function is slow for big directories, and deprecated, do not use it.
1480 */ 1985 */
1481
1482int d_validate(struct dentry *dentry, struct dentry *dparent) 1986int d_validate(struct dentry *dentry, struct dentry *dparent)
1483{ 1987{
1484 struct hlist_head *base; 1988 struct dentry *child;
1485 struct hlist_node *lhp;
1486
1487 /* Check whether the ptr might be valid at all.. */
1488 if (!kmem_ptr_validate(dentry_cache, dentry))
1489 goto out;
1490
1491 if (dentry->d_parent != dparent)
1492 goto out;
1493 1989
1494 spin_lock(&dcache_lock); 1990 spin_lock(&dparent->d_lock);
1495 base = d_hash(dparent, dentry->d_name.hash); 1991 list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
1496 hlist_for_each(lhp,base) { 1992 if (dentry == child) {
1497 /* hlist_for_each_entry_rcu() not required for d_hash list 1993 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1498 * as it is parsed under dcache_lock 1994 __dget_dlock(dentry);
1499 */ 1995 spin_unlock(&dentry->d_lock);
1500 if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { 1996 spin_unlock(&dparent->d_lock);
1501 __dget_locked(dentry);
1502 spin_unlock(&dcache_lock);
1503 return 1; 1997 return 1;
1504 } 1998 }
1505 } 1999 }
1506 spin_unlock(&dcache_lock); 2000 spin_unlock(&dparent->d_lock);
1507out: 2001
1508 return 0; 2002 return 0;
1509} 2003}
1510EXPORT_SYMBOL(d_validate); 2004EXPORT_SYMBOL(d_validate);
@@ -1532,16 +2026,23 @@ EXPORT_SYMBOL(d_validate);
1532 2026
1533void d_delete(struct dentry * dentry) 2027void d_delete(struct dentry * dentry)
1534{ 2028{
2029 struct inode *inode;
1535 int isdir = 0; 2030 int isdir = 0;
1536 /* 2031 /*
1537 * Are we the only user? 2032 * Are we the only user?
1538 */ 2033 */
1539 spin_lock(&dcache_lock); 2034again:
1540 spin_lock(&dentry->d_lock); 2035 spin_lock(&dentry->d_lock);
1541 isdir = S_ISDIR(dentry->d_inode->i_mode); 2036 inode = dentry->d_inode;
1542 if (atomic_read(&dentry->d_count) == 1) { 2037 isdir = S_ISDIR(inode->i_mode);
2038 if (dentry->d_count == 1) {
2039 if (inode && !spin_trylock(&inode->i_lock)) {
2040 spin_unlock(&dentry->d_lock);
2041 cpu_relax();
2042 goto again;
2043 }
1543 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 2044 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1544 dentry_iput(dentry); 2045 dentry_unlink_inode(dentry);
1545 fsnotify_nameremove(dentry, isdir); 2046 fsnotify_nameremove(dentry, isdir);
1546 return; 2047 return;
1547 } 2048 }
@@ -1550,17 +2051,18 @@ void d_delete(struct dentry * dentry)
1550 __d_drop(dentry); 2051 __d_drop(dentry);
1551 2052
1552 spin_unlock(&dentry->d_lock); 2053 spin_unlock(&dentry->d_lock);
1553 spin_unlock(&dcache_lock);
1554 2054
1555 fsnotify_nameremove(dentry, isdir); 2055 fsnotify_nameremove(dentry, isdir);
1556} 2056}
1557EXPORT_SYMBOL(d_delete); 2057EXPORT_SYMBOL(d_delete);
1558 2058
1559static void __d_rehash(struct dentry * entry, struct hlist_head *list) 2059static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
1560{ 2060{
1561 2061 BUG_ON(!d_unhashed(entry));
2062 spin_lock_bucket(b);
1562 entry->d_flags &= ~DCACHE_UNHASHED; 2063 entry->d_flags &= ~DCACHE_UNHASHED;
1563 hlist_add_head_rcu(&entry->d_hash, list); 2064 hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
2065 spin_unlock_bucket(b);
1564} 2066}
1565 2067
1566static void _d_rehash(struct dentry * entry) 2068static void _d_rehash(struct dentry * entry)
@@ -1577,25 +2079,39 @@ static void _d_rehash(struct dentry * entry)
1577 2079
1578void d_rehash(struct dentry * entry) 2080void d_rehash(struct dentry * entry)
1579{ 2081{
1580 spin_lock(&dcache_lock);
1581 spin_lock(&entry->d_lock); 2082 spin_lock(&entry->d_lock);
1582 _d_rehash(entry); 2083 _d_rehash(entry);
1583 spin_unlock(&entry->d_lock); 2084 spin_unlock(&entry->d_lock);
1584 spin_unlock(&dcache_lock);
1585} 2085}
1586EXPORT_SYMBOL(d_rehash); 2086EXPORT_SYMBOL(d_rehash);
1587 2087
1588/* 2088/**
1589 * When switching names, the actual string doesn't strictly have to 2089 * dentry_update_name_case - update case insensitive dentry with a new name
1590 * be preserved in the target - because we're dropping the target 2090 * @dentry: dentry to be updated
1591 * anyway. As such, we can just do a simple memcpy() to copy over 2091 * @name: new name
1592 * the new name before we switch.
1593 * 2092 *
1594 * Note that we have to be a lot more careful about getting the hash 2093 * Update a case-insensitive dentry with the new case of its name.
1595 * switched - we have to switch the hash value properly even if it 2094 *
1596 * then no longer matches the actual (corrupted) string of the target. 2095 * dentry must have been returned by d_lookup with name @name. Old and new
1597 * The hash value has to match the hash queue that the dentry is on.. 2096 * name lengths must match (ie. no d_compare which allows mismatched name
2097 * lengths).
2098 *
2099 * Parent inode i_mutex must be held over d_lookup and into this call (to
2100 * keep renames and concurrent inserts, and readdir(2) away).
1598 */ 2101 */
2102void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2103{
2104 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2105 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2106
2107 spin_lock(&dentry->d_lock);
2108 write_seqcount_begin(&dentry->d_seq);
2109 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2110 write_seqcount_end(&dentry->d_seq);
2111 spin_unlock(&dentry->d_lock);
2112}
2113EXPORT_SYMBOL(dentry_update_name_case);
2114
1599static void switch_names(struct dentry *dentry, struct dentry *target) 2115static void switch_names(struct dentry *dentry, struct dentry *target)
1600{ 2116{
1601 if (dname_external(target)) { 2117 if (dname_external(target)) {
@@ -1637,54 +2153,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1637 swap(dentry->d_name.len, target->d_name.len); 2153 swap(dentry->d_name.len, target->d_name.len);
1638} 2154}
1639 2155
2156static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2157{
2158 /*
2159 * XXXX: do we really need to take target->d_lock?
2160 */
2161 if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
2162 spin_lock(&target->d_parent->d_lock);
2163 else {
2164 if (d_ancestor(dentry->d_parent, target->d_parent)) {
2165 spin_lock(&dentry->d_parent->d_lock);
2166 spin_lock_nested(&target->d_parent->d_lock,
2167 DENTRY_D_LOCK_NESTED);
2168 } else {
2169 spin_lock(&target->d_parent->d_lock);
2170 spin_lock_nested(&dentry->d_parent->d_lock,
2171 DENTRY_D_LOCK_NESTED);
2172 }
2173 }
2174 if (target < dentry) {
2175 spin_lock_nested(&target->d_lock, 2);
2176 spin_lock_nested(&dentry->d_lock, 3);
2177 } else {
2178 spin_lock_nested(&dentry->d_lock, 2);
2179 spin_lock_nested(&target->d_lock, 3);
2180 }
2181}
2182
2183static void dentry_unlock_parents_for_move(struct dentry *dentry,
2184 struct dentry *target)
2185{
2186 if (target->d_parent != dentry->d_parent)
2187 spin_unlock(&dentry->d_parent->d_lock);
2188 if (target->d_parent != target)
2189 spin_unlock(&target->d_parent->d_lock);
2190}
2191
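dentry_lock_for_move() above fixes a total order on d_lock acquisition for rename: an ancestor's lock before a descendant's, and between unrelated dentries, lower kernel address first. The literal subclass numbers 2 and 3 exist because the two parents already occupy lockdep subclasses 0 and 1 (DENTRY_D_LOCK_NESTED). For just two unrelated peers the idiom reduces to this hypothetical helper:

        static void lock_peers(struct dentry *a, struct dentry *b)
        {
                if (a < b) {
                        spin_lock(&a->d_lock);
                        spin_lock_nested(&b->d_lock, DENTRY_D_LOCK_NESTED);
                } else {
                        spin_lock(&b->d_lock);
                        spin_lock_nested(&a->d_lock, DENTRY_D_LOCK_NESTED);
                }
        }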
1640/* 2192/*
1641 * We cannibalize "target" when moving dentry on top of it, 2193 * When switching names, the actual string doesn't strictly have to
1642 * because it's going to be thrown away anyway. We could be more 2194 * be preserved in the target - because we're dropping the target
1643 * polite about it, though. 2195 * anyway. As such, we can just do a simple memcpy() to copy over
1644 * 2196 * the new name before we switch.
1645 * This forceful removal will result in ugly /proc output if 2197 *
1646 * somebody holds a file open that got deleted due to a rename. 2198 * Note that we have to be a lot more careful about getting the hash
1647 * We could be nicer about the deleted file, and let it show 2199 * switched - we have to switch the hash value properly even if it
1648 * up under the name it had before it was deleted rather than 2200 * then no longer matches the actual (corrupted) string of the target.
1649 * under the original name of the file that was moved on top of it. 2201 * The hash value has to match the hash queue that the dentry is on..
1650 */ 2202 */
1651
1652/* 2203/*
1653 * d_move_locked - move a dentry 2204 * d_move - move a dentry
1654 * @dentry: entry to move 2205 * @dentry: entry to move
1655 * @target: new dentry 2206 * @target: new dentry
1656 * 2207 *
1657 * Update the dcache to reflect the move of a file name. Negative 2208 * Update the dcache to reflect the move of a file name. Negative
1658 * dcache entries should not be moved in this way. 2209 * dcache entries should not be moved in this way.
1659 */ 2210 */
1660static void d_move_locked(struct dentry * dentry, struct dentry * target) 2211void d_move(struct dentry * dentry, struct dentry * target)
1661{ 2212{
1662 struct hlist_head *list;
1663
1664 if (!dentry->d_inode) 2213 if (!dentry->d_inode)
1665 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2214 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
1666 2215
2216 BUG_ON(d_ancestor(dentry, target));
2217 BUG_ON(d_ancestor(target, dentry));
2218
1667 write_seqlock(&rename_lock); 2219 write_seqlock(&rename_lock);
1668 /*
1669 * XXXX: do we really need to take target->d_lock?
1670 */
1671 if (target < dentry) {
1672 spin_lock(&target->d_lock);
1673 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1674 } else {
1675 spin_lock(&dentry->d_lock);
1676 spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
1677 }
1678 2220
1679 /* Move the dentry to the target hash queue, if on different bucket */ 2221 dentry_lock_for_move(dentry, target);
1680 if (d_unhashed(dentry))
1681 goto already_unhashed;
1682 2222
1683 hlist_del_rcu(&dentry->d_hash); 2223 write_seqcount_begin(&dentry->d_seq);
2224 write_seqcount_begin(&target->d_seq);
1684 2225
1685already_unhashed: 2226 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
1686 list = d_hash(target->d_parent, target->d_name.hash); 2227
1687 __d_rehash(dentry, list); 2228 /*
2229 * Move the dentry to the target hash queue. Don't bother checking
2230 * for the same hash queue because of how unlikely it is.
2231 */
2232 __d_drop(dentry);
2233 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
1688 2234
1689 /* Unhash the target: dput() will then get rid of it */ 2235 /* Unhash the target: dput() will then get rid of it */
1690 __d_drop(target); 2236 __d_drop(target);
@@ -1709,27 +2255,16 @@ already_unhashed:
1709 } 2255 }
1710 2256
1711 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2257 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2258
2259 write_seqcount_end(&target->d_seq);
2260 write_seqcount_end(&dentry->d_seq);
2261
2262 dentry_unlock_parents_for_move(dentry, target);
1712 spin_unlock(&target->d_lock); 2263 spin_unlock(&target->d_lock);
1713 fsnotify_d_move(dentry); 2264 fsnotify_d_move(dentry);
1714 spin_unlock(&dentry->d_lock); 2265 spin_unlock(&dentry->d_lock);
1715 write_sequnlock(&rename_lock); 2266 write_sequnlock(&rename_lock);
1716} 2267}
1717
1718/**
1719 * d_move - move a dentry
1720 * @dentry: entry to move
1721 * @target: new dentry
1722 *
1723 * Update the dcache to reflect the move of a file name. Negative
1724 * dcache entries should not be moved in this way.
1725 */
1726
1727void d_move(struct dentry * dentry, struct dentry * target)
1728{
1729 spin_lock(&dcache_lock);
1730 d_move_locked(dentry, target);
1731 spin_unlock(&dcache_lock);
1732}
1733EXPORT_SYMBOL(d_move); 2268EXPORT_SYMBOL(d_move);
1734 2269
1735/** 2270/**
@@ -1755,13 +2290,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
1755 * This helper attempts to cope with remotely renamed directories 2290 * This helper attempts to cope with remotely renamed directories
1756 * 2291 *
1757 * It assumes that the caller is already holding 2292 * It assumes that the caller is already holding
1758 * dentry->d_parent->d_inode->i_mutex and the dcache_lock 2293 * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
1759 * 2294 *
1760 * Note: If ever the locking in lock_rename() changes, then please 2295 * Note: If ever the locking in lock_rename() changes, then please
1761 * remember to update this too... 2296 * remember to update this too...
1762 */ 2297 */
1763static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) 2298static struct dentry *__d_unalias(struct inode *inode,
1764 __releases(dcache_lock) 2299 struct dentry *dentry, struct dentry *alias)
1765{ 2300{
1766 struct mutex *m1 = NULL, *m2 = NULL; 2301 struct mutex *m1 = NULL, *m2 = NULL;
1767 struct dentry *ret; 2302 struct dentry *ret;
@@ -1784,10 +2319,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
1784 goto out_err; 2319 goto out_err;
1785 m2 = &alias->d_parent->d_inode->i_mutex; 2320 m2 = &alias->d_parent->d_inode->i_mutex;
1786out_unalias: 2321out_unalias:
1787 d_move_locked(alias, dentry); 2322 d_move(alias, dentry);
1788 ret = alias; 2323 ret = alias;
1789out_err: 2324out_err:
1790 spin_unlock(&dcache_lock); 2325 spin_unlock(&inode->i_lock);
1791 if (m2) 2326 if (m2)
1792 mutex_unlock(m2); 2327 mutex_unlock(m2);
1793 if (m1) 2328 if (m1)
@@ -1798,17 +2333,23 @@ out_err:
1798/* 2333/*
1799 * Prepare an anonymous dentry for life in the superblock's dentry tree as a 2334 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
1800 * named dentry in place of the dentry to be replaced. 2335 * named dentry in place of the dentry to be replaced.
2336 * returns with anon->d_lock held!
1801 */ 2337 */
1802static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) 2338static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1803{ 2339{
1804 struct dentry *dparent, *aparent; 2340 struct dentry *dparent, *aparent;
1805 2341
1806 switch_names(dentry, anon); 2342 dentry_lock_for_move(anon, dentry);
1807 swap(dentry->d_name.hash, anon->d_name.hash); 2343
2344 write_seqcount_begin(&dentry->d_seq);
2345 write_seqcount_begin(&anon->d_seq);
1808 2346
1809 dparent = dentry->d_parent; 2347 dparent = dentry->d_parent;
1810 aparent = anon->d_parent; 2348 aparent = anon->d_parent;
1811 2349
2350 switch_names(dentry, anon);
2351 swap(dentry->d_name.hash, anon->d_name.hash);
2352
1812 dentry->d_parent = (aparent == anon) ? dentry : aparent; 2353 dentry->d_parent = (aparent == anon) ? dentry : aparent;
1813 list_del(&dentry->d_u.d_child); 2354 list_del(&dentry->d_u.d_child);
1814 if (!IS_ROOT(dentry)) 2355 if (!IS_ROOT(dentry))
@@ -1823,6 +2364,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1823 else 2364 else
1824 INIT_LIST_HEAD(&anon->d_u.d_child); 2365 INIT_LIST_HEAD(&anon->d_u.d_child);
1825 2366
2367 write_seqcount_end(&dentry->d_seq);
2368 write_seqcount_end(&anon->d_seq);
2369
2370 dentry_unlock_parents_for_move(anon, dentry);
2371 spin_unlock(&dentry->d_lock);
2372
2373 /* anon->d_lock still locked, returns locked */
1826 anon->d_flags &= ~DCACHE_DISCONNECTED; 2374 anon->d_flags &= ~DCACHE_DISCONNECTED;
1827} 2375}
1828 2376
@@ -1840,14 +2388,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1840 2388
1841 BUG_ON(!d_unhashed(dentry)); 2389 BUG_ON(!d_unhashed(dentry));
1842 2390
1843 spin_lock(&dcache_lock);
1844
1845 if (!inode) { 2391 if (!inode) {
1846 actual = dentry; 2392 actual = dentry;
1847 __d_instantiate(dentry, NULL); 2393 __d_instantiate(dentry, NULL);
1848 goto found_lock; 2394 d_rehash(actual);
2395 goto out_nolock;
1849 } 2396 }
1850 2397
2398 spin_lock(&inode->i_lock);
2399
1851 if (S_ISDIR(inode->i_mode)) { 2400 if (S_ISDIR(inode->i_mode)) {
1852 struct dentry *alias; 2401 struct dentry *alias;
1853 2402
@@ -1858,13 +2407,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1858 /* Is this an anonymous mountpoint that we could splice 2407 /* Is this an anonymous mountpoint that we could splice
1859 * into our tree? */ 2408 * into our tree? */
1860 if (IS_ROOT(alias)) { 2409 if (IS_ROOT(alias)) {
1861 spin_lock(&alias->d_lock);
1862 __d_materialise_dentry(dentry, alias); 2410 __d_materialise_dentry(dentry, alias);
1863 __d_drop(alias); 2411 __d_drop(alias);
1864 goto found; 2412 goto found;
1865 } 2413 }
1866 /* Nope, but we must(!) avoid directory aliasing */ 2414 /* Nope, but we must(!) avoid directory aliasing */
1867 actual = __d_unalias(dentry, alias); 2415 actual = __d_unalias(inode, dentry, alias);
1868 if (IS_ERR(actual)) 2416 if (IS_ERR(actual))
1869 dput(alias); 2417 dput(alias);
1870 goto out_nolock; 2418 goto out_nolock;
@@ -1875,15 +2423,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1875 actual = __d_instantiate_unique(dentry, inode); 2423 actual = __d_instantiate_unique(dentry, inode);
1876 if (!actual) 2424 if (!actual)
1877 actual = dentry; 2425 actual = dentry;
1878 else if (unlikely(!d_unhashed(actual))) 2426 else
1879 goto shouldnt_be_hashed; 2427 BUG_ON(!d_unhashed(actual));
1880 2428
1881found_lock:
1882 spin_lock(&actual->d_lock); 2429 spin_lock(&actual->d_lock);
1883found: 2430found:
1884 _d_rehash(actual); 2431 _d_rehash(actual);
1885 spin_unlock(&actual->d_lock); 2432 spin_unlock(&actual->d_lock);
1886 spin_unlock(&dcache_lock); 2433 spin_unlock(&inode->i_lock);
1887out_nolock: 2434out_nolock:
1888 if (actual == dentry) { 2435 if (actual == dentry) {
1889 security_d_instantiate(dentry, inode); 2436 security_d_instantiate(dentry, inode);
@@ -1892,10 +2439,6 @@ out_nolock:
1892 2439
1893 iput(inode); 2440 iput(inode);
1894 return actual; 2441 return actual;
1895
1896shouldnt_be_hashed:
1897 spin_unlock(&dcache_lock);
1898 BUG();
1899} 2442}
1900EXPORT_SYMBOL_GPL(d_materialise_unique); 2443EXPORT_SYMBOL_GPL(d_materialise_unique);
1901 2444
@@ -1915,14 +2458,13 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1915} 2458}
1916 2459
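prepend_name() and its callers build paths right to left: the cursor starts at the end of the caller's buffer and each component is copied in front of what is already there, so no second pass or reversal is needed. For orientation, the underlying prepend() primitive already in this file amounts to:

        static int prepend(char **buffer, int *buflen, const char *str, int namelen)
        {
                *buflen -= namelen;
                if (*buflen < 0)
                        return -ENAMETOOLONG;
                *buffer -= namelen;
                memcpy(*buffer, str, namelen);
                return 0;
        }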
1917/** 2460/**
1918 * Prepend path string to a buffer 2461 * prepend_path - Prepend path string to a buffer
1919 *
1920 * @path: the dentry/vfsmount to report 2462 * @path: the dentry/vfsmount to report
1921 * @root: root vfsmnt/dentry (may be modified by this function) 2463 * @root: root vfsmnt/dentry (may be modified by this function)
1922 * @buffer: pointer to the end of the buffer 2464 * @buffer: pointer to the end of the buffer
1923 * @buflen: pointer to buffer length 2465 * @buflen: pointer to buffer length
1924 * 2466 *
1925 * Caller holds the dcache_lock. 2467 * Caller holds the rename_lock.
1926 * 2468 *
1927 * If path is not reachable from the supplied root, then the value of 2469 * If path is not reachable from the supplied root, then the value of
1928 * root is changed (without modifying refcounts). 2470 * root is changed (without modifying refcounts).
@@ -1950,7 +2492,9 @@ static int prepend_path(const struct path *path, struct path *root,
1950 } 2492 }
1951 parent = dentry->d_parent; 2493 parent = dentry->d_parent;
1952 prefetch(parent); 2494 prefetch(parent);
2495 spin_lock(&dentry->d_lock);
1953 error = prepend_name(buffer, buflen, &dentry->d_name); 2496 error = prepend_name(buffer, buflen, &dentry->d_name);
2497 spin_unlock(&dentry->d_lock);
1954 if (!error) 2498 if (!error)
1955 error = prepend(buffer, buflen, "/", 1); 2499 error = prepend(buffer, buflen, "/", 1);
1956 if (error) 2500 if (error)
@@ -1994,7 +2538,7 @@ global_root:
1994 * Returns a pointer into the buffer or an error code if the 2538 * Returns a pointer into the buffer or an error code if the
1995 * path was too long. 2539 * path was too long.
1996 * 2540 *
1997 * "buflen" should be positive. Caller holds the dcache_lock. 2541 * "buflen" should be positive.
1998 * 2542 *
1999 * If path is not reachable from the supplied root, then the value of 2543 * If path is not reachable from the supplied root, then the value of
2000 * root is changed (without modifying refcounts). 2544 * root is changed (without modifying refcounts).
@@ -2006,10 +2550,12 @@ char *__d_path(const struct path *path, struct path *root,
2006 int error; 2550 int error;
2007 2551
2008 prepend(&res, &buflen, "\0", 1); 2552 prepend(&res, &buflen, "\0", 1);
2553 write_seqlock(&rename_lock);
2009 error = prepend_path(path, root, &res, &buflen); 2554 error = prepend_path(path, root, &res, &buflen);
2555 write_sequnlock(&rename_lock);
2556
2010 if (error) 2557 if (error)
2011 return ERR_PTR(error); 2558 return ERR_PTR(error);
2012
2013 return res; 2559 return res;
2014} 2560}
2015 2561
@@ -2068,12 +2614,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
2068 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2614 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2069 2615
2070 get_fs_root(current->fs, &root); 2616 get_fs_root(current->fs, &root);
2071 spin_lock(&dcache_lock); 2617 write_seqlock(&rename_lock);
2072 tmp = root; 2618 tmp = root;
2073 error = path_with_deleted(path, &tmp, &res, &buflen); 2619 error = path_with_deleted(path, &tmp, &res, &buflen);
2074 if (error) 2620 if (error)
2075 res = ERR_PTR(error); 2621 res = ERR_PTR(error);
2076 spin_unlock(&dcache_lock); 2622 write_sequnlock(&rename_lock);
2077 path_put(&root); 2623 path_put(&root);
2078 return res; 2624 return res;
2079} 2625}
@@ -2099,12 +2645,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2099 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2645 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2100 2646
2101 get_fs_root(current->fs, &root); 2647 get_fs_root(current->fs, &root);
2102 spin_lock(&dcache_lock); 2648 write_seqlock(&rename_lock);
2103 tmp = root; 2649 tmp = root;
2104 error = path_with_deleted(path, &tmp, &res, &buflen); 2650 error = path_with_deleted(path, &tmp, &res, &buflen);
2105 if (!error && !path_equal(&tmp, &root)) 2651 if (!error && !path_equal(&tmp, &root))
2106 error = prepend_unreachable(&res, &buflen); 2652 error = prepend_unreachable(&res, &buflen);
2107 spin_unlock(&dcache_lock); 2653 write_sequnlock(&rename_lock);
2108 path_put(&root); 2654 path_put(&root);
2109 if (error) 2655 if (error)
2110 res = ERR_PTR(error); 2656 res = ERR_PTR(error);
@@ -2136,7 +2682,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2136/* 2682/*
2137 * Write full pathname from the root of the filesystem into the buffer. 2683 * Write full pathname from the root of the filesystem into the buffer.
2138 */ 2684 */
2139char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 2685static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2140{ 2686{
2141 char *end = buf + buflen; 2687 char *end = buf + buflen;
2142 char *retval; 2688 char *retval;
@@ -2150,10 +2696,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2150 2696
2151 while (!IS_ROOT(dentry)) { 2697 while (!IS_ROOT(dentry)) {
2152 struct dentry *parent = dentry->d_parent; 2698 struct dentry *parent = dentry->d_parent;
2699 int error;
2153 2700
2154 prefetch(parent); 2701 prefetch(parent);
2155 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 2702 spin_lock(&dentry->d_lock);
2156 (prepend(&end, &buflen, "/", 1) != 0)) 2703 error = prepend_name(&end, &buflen, &dentry->d_name);
2704 spin_unlock(&dentry->d_lock);
2705 if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
2157 goto Elong; 2706 goto Elong;
2158 2707
2159 retval = end; 2708 retval = end;
@@ -2163,14 +2712,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2163Elong: 2712Elong:
2164 return ERR_PTR(-ENAMETOOLONG); 2713 return ERR_PTR(-ENAMETOOLONG);
2165} 2714}
2166EXPORT_SYMBOL(__dentry_path); 2715
2716char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
2717{
2718 char *retval;
2719
2720 write_seqlock(&rename_lock);
2721 retval = __dentry_path(dentry, buf, buflen);
2722 write_sequnlock(&rename_lock);
2723
2724 return retval;
2725}
2726EXPORT_SYMBOL(dentry_path_raw);
2167 2727
2168char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2728char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2169{ 2729{
2170 char *p = NULL; 2730 char *p = NULL;
2171 char *retval; 2731 char *retval;
2172 2732
2173 spin_lock(&dcache_lock); 2733 write_seqlock(&rename_lock);
2174 if (d_unlinked(dentry)) { 2734 if (d_unlinked(dentry)) {
2175 p = buf + buflen; 2735 p = buf + buflen;
2176 if (prepend(&p, &buflen, "//deleted", 10) != 0) 2736 if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2178,12 +2738,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2178 buflen++; 2738 buflen++;
2179 } 2739 }
2180 retval = __dentry_path(dentry, buf, buflen); 2740 retval = __dentry_path(dentry, buf, buflen);
2181 spin_unlock(&dcache_lock); 2741 write_sequnlock(&rename_lock);
2182 if (!IS_ERR(retval) && p) 2742 if (!IS_ERR(retval) && p)
2183 *p = '/'; /* restore '/' overridden with '\0' */ 2743 *p = '/'; /* restore '/' overridden with '\0' */
2184 return retval; 2744 return retval;
2185Elong: 2745Elong:
2186 spin_unlock(&dcache_lock);
2187 return ERR_PTR(-ENAMETOOLONG); 2746 return ERR_PTR(-ENAMETOOLONG);
2188} 2747}
2189 2748
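Note: the new dentry_path_raw() export returns a filesystem-relative path without appending "//deleted" for unlinked dentries, and takes the seqlock itself. A hypothetical caller (sketch; example_log_dentry is not part of this patch) might use it for debug output:

static void example_log_dentry(struct dentry *dentry)
{
        char *buf = (char *)__get_free_page(GFP_KERNEL);
        char *path;

        if (!buf)
                return;
        path = dentry_path_raw(dentry, buf, PAGE_SIZE);
        if (!IS_ERR(path))
                printk(KERN_DEBUG "dentry at %s\n", path);
        free_page((unsigned long)buf);
}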
@@ -2217,7 +2776,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2217 get_fs_root_and_pwd(current->fs, &root, &pwd); 2776 get_fs_root_and_pwd(current->fs, &root, &pwd);
2218 2777
2219 error = -ENOENT; 2778 error = -ENOENT;
2220 spin_lock(&dcache_lock); 2779 write_seqlock(&rename_lock);
2221 if (!d_unlinked(pwd.dentry)) { 2780 if (!d_unlinked(pwd.dentry)) {
2222 unsigned long len; 2781 unsigned long len;
2223 struct path tmp = root; 2782 struct path tmp = root;
@@ -2226,7 +2785,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2226 2785
2227 prepend(&cwd, &buflen, "\0", 1); 2786 prepend(&cwd, &buflen, "\0", 1);
2228 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2787 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2229 spin_unlock(&dcache_lock); 2788 write_sequnlock(&rename_lock);
2230 2789
2231 if (error) 2790 if (error)
2232 goto out; 2791 goto out;
@@ -2245,8 +2804,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2245 if (copy_to_user(buf, cwd, len)) 2804 if (copy_to_user(buf, cwd, len))
2246 error = -EFAULT; 2805 error = -EFAULT;
2247 } 2806 }
2248 } else 2807 } else {
2249 spin_unlock(&dcache_lock); 2808 write_sequnlock(&rename_lock);
2809 }
2250 2810
2251out: 2811out:
2252 path_put(&pwd); 2812 path_put(&pwd);
@@ -2274,25 +2834,25 @@ out:
2274int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) 2834int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2275{ 2835{
2276 int result; 2836 int result;
2277 unsigned long seq; 2837 unsigned seq;
2278 2838
2279 if (new_dentry == old_dentry) 2839 if (new_dentry == old_dentry)
2280 return 1; 2840 return 1;
2281 2841
2282 /*
2283 * Need rcu_readlock to protect against the d_parent trashing
2284 * due to d_move
2285 */
2286 rcu_read_lock();
2287 do { 2842 do {
2288 /* for restarting inner loop in case of seq retry */ 2843 /* for restarting inner loop in case of seq retry */
2289 seq = read_seqbegin(&rename_lock); 2844 seq = read_seqbegin(&rename_lock);
2845 /*
2846 * Need rcu_read_lock to protect against the d_parent trashing
2847 * due to d_move
2848 */
2849 rcu_read_lock();
2290 if (d_ancestor(old_dentry, new_dentry)) 2850 if (d_ancestor(old_dentry, new_dentry))
2291 result = 1; 2851 result = 1;
2292 else 2852 else
2293 result = 0; 2853 result = 0;
2854 rcu_read_unlock();
2294 } while (read_seqretry(&rename_lock, seq)); 2855 } while (read_seqretry(&rename_lock, seq));
2295 rcu_read_unlock();
2296 2856
2297 return result; 2857 return result;
2298} 2858}
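Note the reordering above: rcu_read_lock() now sits inside the retry loop, so each pass gets a fresh RCU read-side section aligned with exactly one seqlock sample. A sketch of that combined pattern (walk_ancestors() is a hypothetical stand-in for d_ancestor()):

static int example_is_descendant(struct dentry *child, struct dentry *root)
{
        unsigned seq;
        int result;

        do {
                seq = read_seqbegin(&rename_lock);
                rcu_read_lock();        /* keeps the d_parent chain valid */
                result = walk_ancestors(child, root);
                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));

        return result;
}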
@@ -2324,10 +2884,15 @@ EXPORT_SYMBOL(path_is_under);
2324 2884
2325void d_genocide(struct dentry *root) 2885void d_genocide(struct dentry *root)
2326{ 2886{
2327 struct dentry *this_parent = root; 2887 struct dentry *this_parent;
2328 struct list_head *next; 2888 struct list_head *next;
2889 unsigned seq;
2890 int locked = 0;
2329 2891
2330 spin_lock(&dcache_lock); 2892 seq = read_seqbegin(&rename_lock);
2893again:
2894 this_parent = root;
2895 spin_lock(&this_parent->d_lock);
2331repeat: 2896repeat:
2332 next = this_parent->d_subdirs.next; 2897 next = this_parent->d_subdirs.next;
2333resume: 2898resume:
@@ -2335,21 +2900,62 @@ resume:
2335 struct list_head *tmp = next; 2900 struct list_head *tmp = next;
2336 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 2901 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
2337 next = tmp->next; 2902 next = tmp->next;
2338 if (d_unhashed(dentry)||!dentry->d_inode) 2903
2904 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2905 if (d_unhashed(dentry) || !dentry->d_inode) {
2906 spin_unlock(&dentry->d_lock);
2339 continue; 2907 continue;
2908 }
2340 if (!list_empty(&dentry->d_subdirs)) { 2909 if (!list_empty(&dentry->d_subdirs)) {
2910 spin_unlock(&this_parent->d_lock);
2911 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
2341 this_parent = dentry; 2912 this_parent = dentry;
2913 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
2342 goto repeat; 2914 goto repeat;
2343 } 2915 }
2344 atomic_dec(&dentry->d_count); 2916 if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
2917 dentry->d_flags |= DCACHE_GENOCIDE;
2918 dentry->d_count--;
2919 }
2920 spin_unlock(&dentry->d_lock);
2345 } 2921 }
2346 if (this_parent != root) { 2922 if (this_parent != root) {
2347 next = this_parent->d_u.d_child.next; 2923 struct dentry *tmp;
2348 atomic_dec(&this_parent->d_count); 2924 struct dentry *child;
2349 this_parent = this_parent->d_parent; 2925
2926 tmp = this_parent->d_parent;
2927 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2928 this_parent->d_flags |= DCACHE_GENOCIDE;
2929 this_parent->d_count--;
2930 }
2931 rcu_read_lock();
2932 spin_unlock(&this_parent->d_lock);
2933 child = this_parent;
2934 this_parent = tmp;
2935 spin_lock(&this_parent->d_lock);
2936 /* might go back up the wrong parent if we have had a rename
2937 * or deletion */
2938 if (this_parent != child->d_parent ||
2939 (!locked && read_seqretry(&rename_lock, seq))) {
2940 spin_unlock(&this_parent->d_lock);
2941 rcu_read_unlock();
2942 goto rename_retry;
2943 }
2944 rcu_read_unlock();
2945 next = child->d_u.d_child.next;
2350 goto resume; 2946 goto resume;
2351 } 2947 }
2352 spin_unlock(&dcache_lock); 2948 spin_unlock(&this_parent->d_lock);
2949 if (!locked && read_seqretry(&rename_lock, seq))
2950 goto rename_retry;
2951 if (locked)
2952 write_sequnlock(&rename_lock);
2953 return;
2954
2955rename_retry:
2956 locked = 1;
2957 write_seqlock(&rename_lock);
2958 goto again;
2353} 2959}
2354 2960
2355/** 2961/**
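Note: the rewritten d_genocide walks the tree under per-dentry d_lock, validating against rename_lock; if a concurrent rename invalidates the walk, it restarts holding the write side so the second pass cannot race. A sketch of that optimistic-then-locked retry (do_walk() is hypothetical):

static void example_walk_tree(struct dentry *root)
{
        unsigned seq;
        int locked = 0;

        seq = read_seqbegin(&rename_lock);
again:
        do_walk(root);                  /* hypothetical tree walk */
        if (!locked && read_seqretry(&rename_lock, seq)) {
                locked = 1;
                write_seqlock(&rename_lock);    /* block renames, redo walk */
                goto again;
        }
        if (locked)
                write_sequnlock(&rename_lock);
}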
@@ -2403,7 +3009,7 @@ static void __init dcache_init_early(void)
2403 3009
2404 dentry_hashtable = 3010 dentry_hashtable =
2405 alloc_large_system_hash("Dentry cache", 3011 alloc_large_system_hash("Dentry cache",
2406 sizeof(struct hlist_head), 3012 sizeof(struct dcache_hash_bucket),
2407 dhash_entries, 3013 dhash_entries,
2408 13, 3014 13,
2409 HASH_EARLY, 3015 HASH_EARLY,
@@ -2412,7 +3018,7 @@ static void __init dcache_init_early(void)
2412 0); 3018 0);
2413 3019
2414 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3020 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2415 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3021 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2416} 3022}
2417 3023
2418static void __init dcache_init(void) 3024static void __init dcache_init(void)
@@ -2435,7 +3041,7 @@ static void __init dcache_init(void)
2435 3041
2436 dentry_hashtable = 3042 dentry_hashtable =
2437 alloc_large_system_hash("Dentry cache", 3043 alloc_large_system_hash("Dentry cache",
2438 sizeof(struct hlist_head), 3044 sizeof(struct dcache_hash_bucket),
2439 dhash_entries, 3045 dhash_entries,
2440 13, 3046 13,
2441 0, 3047 0,
@@ -2444,7 +3050,7 @@ static void __init dcache_init(void)
2444 0); 3050 0);
2445 3051
2446 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3052 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2447 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3053 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2448} 3054}
2449 3055
2450/* SLAB cache for __getname() consumers */ 3056/* SLAB cache for __getname() consumers */
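Note: the hash setup now sizes buckets as struct dcache_hash_bucket and initializes hlist_bl heads. The struct definition is not in this hunk, but the INIT_HLIST_BL_HEAD calls suggest it wraps an hlist_bl_head, whose low pointer bit doubles as a per-bucket spinlock. A sketch of that bit-locked list usage (an assumption, not the dcache code itself):

#include <linux/list_bl.h>

struct example_bucket {
        struct hlist_bl_head head;
};

static void example_insert(struct example_bucket *b,
                           struct hlist_bl_node *n)
{
        hlist_bl_lock(&b->head);        /* spins on bit 0 of the pointer */
        hlist_bl_add_head(n, &b->head);
        hlist_bl_unlock(&b->head);
}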
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b2..89d394d8fe24 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
43 .read = default_read_file, 43 .read = default_read_file,
44 .write = default_write_file, 44 .write = default_write_file,
45 .open = default_open, 45 .open = default_open,
46 .llseek = noop_llseek,
46}; 47};
47 48
48static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) 49static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
454 .read = read_file_bool, 455 .read = read_file_bool,
455 .write = write_file_bool, 456 .write = write_file_bool,
456 .open = default_open, 457 .open = default_open,
458 .llseek = default_llseek,
457}; 459};
458 460
459/** 461/**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
498static const struct file_operations fops_blob = { 500static const struct file_operations fops_blob = {
499 .read = read_file_blob, 501 .read = read_file_blob,
500 .open = default_open, 502 .open = default_open,
503 .llseek = default_llseek,
501}; 504};
502 505
503/** 506/**
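Note: these fops additions are part of the BKL-removal fallout: seek behavior is no longer implicit, so every file_operations must choose an explicit .llseek. noop_llseek reports success without moving f_pos; default_llseek does normal offset arithmetic. A minimal sketch (example_read is hypothetical):

static const struct file_operations example_fops = {
        .owner  = THIS_MODULE,
        .read   = example_read,         /* hypothetical handler */
        .llseek = noop_llseek,          /* "succeed but don't move f_pos" */
};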
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..37a8ca7c1222 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
40 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
41 41
42 if (inode) { 42 if (inode) {
43 inode->i_ino = get_next_ino();
43 inode->i_mode = mode; 44 inode->i_mode = mode;
44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
45 switch (mode & S_IFMT) { 46 switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
134 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); 135 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
135} 136}
136 137
137static int debug_get_sb(struct file_system_type *fs_type, 138static struct dentry *debug_mount(struct file_system_type *fs_type,
138 int flags, const char *dev_name, 139 int flags, const char *dev_name,
139 void *data, struct vfsmount *mnt) 140 void *data)
140{ 141{
141 return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); 142 return mount_single(fs_type, flags, data, debug_fill_super);
142} 143}
143 144
144static struct file_system_type debug_fs_type = { 145static struct file_system_type debug_fs_type = {
145 .owner = THIS_MODULE, 146 .owner = THIS_MODULE,
146 .name = "debugfs", 147 .name = "debugfs",
147 .get_sb = debug_get_sb, 148 .mount = debug_mount,
148 .kill_sb = kill_litter_super, 149 .kill_sb = kill_litter_super,
149}; 150};
150 151
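Note: the .get_sb -> .mount conversion changes the contract: instead of filling in a struct vfsmount and returning an int, the callback returns the root dentry or an ERR_PTR. A sketch of the converted shape for a hypothetical single-instance filesystem (example_fill_super is assumed, not shown):

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name,
                                    void *data)
{
        /* mount_single() returns the root dentry or an ERR_PTR */
        return mount_single(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
        .owner   = THIS_MODULE,
        .name    = "examplefs",
        .mount   = example_mount,
        .kill_sb = kill_litter_super,
};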
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b5235..1bb547c9cad6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
331} 331}
332 332
333/* 333/*
334 * devpts_get_sb() 334 * devpts_mount()
335 * 335 *
336 * If the '-o newinstance' mount option was specified, mount a new 336 * If the '-o newinstance' mount option was specified, mount a new
337 * (private) instance of devpts. PTYs created in this instance are 337 * (private) instance of devpts. PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
345 * semantics in devpts while preserving backward compatibility of the 345 * semantics in devpts while preserving backward compatibility of the
346 * current 'single-namespace' semantics. i.e all mounts of devpts 346 * current 'single-namespace' semantics. i.e all mounts of devpts
347 * without the 'newinstance' mount option should bind to the initial 347 * without the 'newinstance' mount option should bind to the initial
348 * kernel mount, like get_sb_single(). 348 * kernel mount, like mount_single().
349 * 349 *
350 * Mounts with 'newinstance' option create a new, private namespace. 350 * Mounts with 'newinstance' option create a new, private namespace.
351 * 351 *
352 * NOTE: 352 * NOTE:
353 * 353 *
354 * For single-mount semantics, devpts cannot use get_sb_single(), 354 * For single-mount semantics, devpts cannot use mount_single(),
355 * because get_sb_single()/sget() find and use the super-block from 355 * because mount_single()/sget() find and use the super-block from
356 * the most recent mount of devpts. But that recent mount may be a 356 * the most recent mount of devpts. But that recent mount may be a
357 * 'newinstance' mount and get_sb_single() would pick the newinstance 357 * 'newinstance' mount and mount_single() would pick the newinstance
358 * super-block instead of the initial super-block. 358 * super-block instead of the initial super-block.
359 */ 359 */
360static int devpts_get_sb(struct file_system_type *fs_type, 360static struct dentry *devpts_mount(struct file_system_type *fs_type,
361 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 361 int flags, const char *dev_name, void *data)
362{ 362{
363 int error; 363 int error;
364 struct pts_mount_opts opts; 364 struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
366 366
367 error = parse_mount_options(data, PARSE_MOUNT, &opts); 367 error = parse_mount_options(data, PARSE_MOUNT, &opts);
368 if (error) 368 if (error)
369 return error; 369 return ERR_PTR(error);
370 370
371 if (opts.newinstance) 371 if (opts.newinstance)
372 s = sget(fs_type, NULL, set_anon_super, NULL); 372 s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); 374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
375 375
376 if (IS_ERR(s)) 376 if (IS_ERR(s))
377 return PTR_ERR(s); 377 return ERR_CAST(s);
378 378
379 if (!s->s_root) { 379 if (!s->s_root) {
380 s->s_flags = flags; 380 s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
390 if (error) 390 if (error)
391 goto out_undo_sget; 391 goto out_undo_sget;
392 392
393 simple_set_mnt(mnt, s); 393 return dget(s->s_root);
394
395 return 0;
396 394
397out_undo_sget: 395out_undo_sget:
398 deactivate_locked_super(s); 396 deactivate_locked_super(s);
399 return error; 397 return ERR_PTR(error);
400} 398}
401 399
402#else 400#else
@@ -404,10 +402,10 @@ out_undo_sget:
404 * This supports only the legacy single-instance semantics (no 402 * This supports only the legacy single-instance semantics (no
405 * multiple-instance semantics) 403 * multiple-instance semantics)
406 */ 404 */
407static int devpts_get_sb(struct file_system_type *fs_type, int flags, 405static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
408 const char *dev_name, void *data, struct vfsmount *mnt) 406 const char *dev_name, void *data)
409{ 407{
410 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 408 return mount_single(fs_type, flags, data, devpts_fill_super);
411} 409}
412#endif 410#endif
413 411
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
421 419
422static struct file_system_type devpts_fs_type = { 420static struct file_system_type devpts_fs_type = {
423 .name = "devpts", 421 .name = "devpts",
424 .get_sb = devpts_get_sb, 422 .mount = devpts_mount,
425 .kill_sb = devpts_kill_sb, 423 .kill_sb = devpts_kill_sb,
426}; 424};
427 425
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..b044705eedd4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
218 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
219 * dio_complete. 219 * dio_complete.
220 */ 220 */
221static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) 221static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
222{ 222{
223 ssize_t transferred = 0; 223 ssize_t transferred = 0;
224 224
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
325} 325}
326EXPORT_SYMBOL_GPL(dio_end_io); 326EXPORT_SYMBOL_GPL(dio_end_io);
327 327
328static int 328static void
329dio_bio_alloc(struct dio *dio, struct block_device *bdev, 329dio_bio_alloc(struct dio *dio, struct block_device *bdev,
330 sector_t first_sector, int nr_vecs) 330 sector_t first_sector, int nr_vecs)
331{ 331{
332 struct bio *bio; 332 struct bio *bio;
333 333
334 /*
335 * bio_alloc() is guaranteed to return a bio when called with
336 * __GFP_WAIT and we request a valid number of vectors.
337 */
334 bio = bio_alloc(GFP_KERNEL, nr_vecs); 338 bio = bio_alloc(GFP_KERNEL, nr_vecs);
335 339
336 bio->bi_bdev = bdev; 340 bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
342 346
343 dio->bio = bio; 347 dio->bio = bio;
344 dio->logical_offset_in_bio = dio->cur_page_fs_offset; 348 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
345 return 0;
346} 349}
347 350
348/* 351/*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
583 goto out; 586 goto out;
584 sector = start_sector << (dio->blkbits - 9); 587 sector = start_sector << (dio->blkbits - 9);
585 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); 588 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
589 nr_pages = min(nr_pages, BIO_MAX_PAGES);
586 BUG_ON(nr_pages <= 0); 590 BUG_ON(nr_pages <= 0);
587 ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); 591 dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
588 dio->boundary = 0; 592 dio->boundary = 0;
589out: 593out:
590 return ret; 594 return ret;
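Note: dio_bio_alloc can drop its return value because bio_alloc() with __GFP_WAIT (implied by GFP_KERNEL) and a vector count clamped to BIO_MAX_PAGES sleeps until it succeeds rather than returning NULL; the new min() at the call site preserves that precondition. A sketch of the invariant (example_alloc_bio is illustrative):

static struct bio *example_alloc_bio(struct block_device *bdev,
                                     sector_t sector, int nr)
{
        struct bio *bio;

        nr = min(nr, BIO_MAX_PAGES);            /* keep bio_alloc's precondition */
        bio = bio_alloc(GFP_KERNEL, nr);        /* cannot fail here */
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        return bio;
}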
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on EXPERIMENTAL && INET 3 depends on EXPERIMENTAL && INET
4 depends on SYSFS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select CONFIGFS_FS
6 select IP_SCTP 5 select IP_SCTP
7 help 6 help
8 A general purpose distributed lock manager for kernel or userspace 7 A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf25158746..6b42ba807dfd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
643static const struct file_operations waiters_fops = { 643static const struct file_operations waiters_fops = {
644 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
645 .open = waiters_open, 645 .open = waiters_open,
646 .read = waiters_read 646 .read = waiters_read,
647 .llseek = default_llseek,
647}; 648};
648 649
649void dlm_delete_debug_file(struct dlm_ls *ls) 650void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15ca..64e5f3efdd81 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1846 struct dlm_lkb *gr; 1846 struct dlm_lkb *gr;
1847 1847
1848 list_for_each_entry(gr, head, lkb_statequeue) { 1848 list_for_each_entry(gr, head, lkb_statequeue) {
1849 /* skip self when sending basts to convertqueue */
1850 if (gr == lkb)
1851 continue;
1849 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { 1852 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
1850 queue_bast(r, gr, lkb->lkb_rqmode); 1853 queue_bast(r, gr, lkb->lkb_rqmode);
1851 gr->lkb_highbast = lkb->lkb_rqmode; 1854 gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622a..9c64ae9e4c1a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
63#define NEEDED_RMEM (4*1024*1024) 63#define NEEDED_RMEM (4*1024*1024)
64#define CONN_HASH_SIZE 32 64#define CONN_HASH_SIZE 32
65 65
66/* Number of messages to send before rescheduling */
67#define MAX_SEND_MSG_COUNT 25
68
66struct cbuf { 69struct cbuf {
67 unsigned int base; 70 unsigned int base;
68 unsigned int len; 71 unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
108#define CF_INIT_PENDING 4 111#define CF_INIT_PENDING 4
109#define CF_IS_OTHERCON 5 112#define CF_IS_OTHERCON 5
110#define CF_CLOSE 6 113#define CF_CLOSE 6
114#define CF_APP_LIMITED 7
111 struct list_head writequeue; /* List of outgoing writequeue_entries */ 115 struct list_head writequeue; /* List of outgoing writequeue_entries */
112 spinlock_t writequeue_lock; 116 spinlock_t writequeue_lock;
113 int (*rx_action) (struct connection *); /* What to do when active */ 117 int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
295{ 299{
296 struct connection *con = sock2con(sk); 300 struct connection *con = sock2con(sk);
297 301
298 if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 302 if (!con)
303 return;
304
305 clear_bit(SOCK_NOSPACE, &con->sock->flags);
306
307 if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
308 con->sock->sk->sk_write_pending--;
309 clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
310 }
311
312 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
299 queue_work(send_workqueue, &con->swork); 313 queue_work(send_workqueue, &con->swork);
300} 314}
301 315
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
915 struct sockaddr_storage saddr, src_addr; 929 struct sockaddr_storage saddr, src_addr;
916 int addr_len; 930 int addr_len;
917 struct socket *sock = NULL; 931 struct socket *sock = NULL;
932 int one = 1;
918 933
919 if (con->nodeid == 0) { 934 if (con->nodeid == 0) {
920 log_print("attempt to connect sock 0 foiled"); 935 log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
960 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 975 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
961 976
962 log_print("connecting to %d", con->nodeid); 977 log_print("connecting to %d", con->nodeid);
978
979 /* Turn off Nagle's algorithm */
980 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
981 sizeof(one));
982
963 result = 983 result =
964 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 984 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
965 O_NONBLOCK); 985 O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1011 goto create_out; 1031 goto create_out;
1012 } 1032 }
1013 1033
1034 /* Turn off Nagle's algorithm */
1035 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
1036 sizeof(one));
1037
1014 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 1038 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
1015 (char *)&one, sizeof(one)); 1039 (char *)&one, sizeof(one));
1016 1040
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
1297 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1321 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1298 struct writequeue_entry *e; 1322 struct writequeue_entry *e;
1299 int len, offset; 1323 int len, offset;
1324 int count = 0;
1300 1325
1301 mutex_lock(&con->sock_mutex); 1326 mutex_lock(&con->sock_mutex);
1302 if (con->sock == NULL) 1327 if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
1319 ret = kernel_sendpage(con->sock, e->page, offset, len, 1344 ret = kernel_sendpage(con->sock, e->page, offset, len,
1320 msg_flags); 1345 msg_flags);
1321 if (ret == -EAGAIN || ret == 0) { 1346 if (ret == -EAGAIN || ret == 0) {
1347 if (ret == -EAGAIN &&
1348 test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
1349 !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
1350 /* Notify TCP that we're limited by the
1351 * application window size.
1352 */
1353 set_bit(SOCK_NOSPACE, &con->sock->flags);
1354 con->sock->sk->sk_write_pending++;
1355 }
1322 cond_resched(); 1356 cond_resched();
1323 goto out; 1357 goto out;
1324 } 1358 }
1325 if (ret <= 0) 1359 if (ret <= 0)
1326 goto send_error; 1360 goto send_error;
1327 } 1361 }
1328 /* Don't starve people filling buffers */ 1362
1363 /* Don't starve people filling buffers */
1364 if (++count >= MAX_SEND_MSG_COUNT) {
1329 cond_resched(); 1365 cond_resched();
1366 count = 0;
1367 }
1330 1368
1331 spin_lock(&con->writequeue_lock); 1369 spin_lock(&con->writequeue_lock);
1332 e->offset += ret; 1370 e->offset += ret;
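Note: MAX_SEND_MSG_COUNT turns the unconditional cond_resched() into a batched one: the sender now yields only once every 25 messages, cutting scheduling overhead without starving other tasks. A sketch of the pattern (example_queue and example_send_one are hypothetical):

#define EXAMPLE_BATCH 25

struct example_queue;                                   /* hypothetical */
extern int example_send_one(struct example_queue *q);   /* hypothetical */

static void example_drain(struct example_queue *q)
{
        int count = 0;

        while (example_send_one(q) > 0) {
                if (++count >= EXAMPLE_BATCH) {
                        cond_resched();         /* yield once per batch */
                        count = 0;
                }
        }
}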
@@ -1430,20 +1468,19 @@ static void work_stop(void)
1430 1468
1431static int work_start(void) 1469static int work_start(void)
1432{ 1470{
1433 int error; 1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
1434 recv_workqueue = create_workqueue("dlm_recv"); 1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1435 error = IS_ERR(recv_workqueue); 1473 if (!recv_workqueue) {
1436 if (error) { 1474 log_print("can't start dlm_recv");
1437 log_print("can't start dlm_recv %d", error); 1475 return -ENOMEM;
1438 return error;
1439 } 1476 }
1440 1477
1441 send_workqueue = create_singlethread_workqueue("dlm_send"); 1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
1442 error = IS_ERR(send_workqueue); 1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1443 if (error) { 1480 if (!send_workqueue) {
1444 log_print("can't start dlm_send %d", error); 1481 log_print("can't start dlm_send");
1445 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
1446 return error; 1483 return -ENOMEM;
1447 } 1484 }
1448 1485
1449 return 0; 1486 return 0;
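Note: the old checks were buggy: create_workqueue() returns NULL on failure, never an ERR_PTR, so IS_ERR() could not detect the error. The replacement tests the pointer directly and returns -ENOMEM. A sketch of the corrected pattern:

static struct workqueue_struct *example_wq;

static int example_start(void)
{
        example_wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 0);
        if (!example_wq)                /* NULL, not ERR_PTR, on failure */
                return -ENOMEM;
        return 0;
}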
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db6943..30d8b85febbf 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
412 .read = dev_read, 412 .read = dev_read,
413 .write = dev_write, 413 .write = dev_write,
414 .poll = dev_poll, 414 .poll = dev_poll,
415 .owner = THIS_MODULE 415 .owner = THIS_MODULE,
416 .llseek = noop_llseek,
416}; 417};
417 418
418static struct miscdevice plock_dev_misc = { 419static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130c..66d6c16bf440 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
1009 .write = device_write, 1009 .write = device_write,
1010 .poll = device_poll, 1010 .poll = device_poll,
1011 .owner = THIS_MODULE, 1011 .owner = THIS_MODULE,
1012 .llseek = noop_llseek,
1012}; 1013};
1013 1014
1014static const struct file_operations ctl_device_fops = { 1015static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
1017 .read = device_read, 1018 .read = device_read,
1018 .write = device_write, 1019 .write = device_write,
1019 .owner = THIS_MODULE, 1020 .owner = THIS_MODULE,
1021 .llseek = noop_llseek,
1020}; 1022};
1021 1023
1022static struct miscdevice ctl_device = { 1024static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
1029 .open = monitor_device_open, 1031 .open = monitor_device_open,
1030 .release = monitor_device_close, 1032 .release = monitor_device_close,
1031 .owner = THIS_MODULE, 1033 .owner = THIS_MODULE,
1034 .llseek = noop_llseek,
1032}; 1035};
1033 1036
1034static struct miscdevice monitor_device = { 1037static struct miscdevice monitor_device = {
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
348 BUG_ON(!crypt_stat || !crypt_stat->tfm 348 BUG_ON(!crypt_stat || !crypt_stat->tfm
349 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); 349 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
350 if (unlikely(ecryptfs_verbosity > 0)) { 350 if (unlikely(ecryptfs_verbosity > 0)) {
351 ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", 351 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
352 crypt_stat->key_size); 352 crypt_stat->key_size);
353 ecryptfs_dump_hex(crypt_stat->key, 353 ecryptfs_dump_hex(crypt_stat->key,
354 crypt_stat->key_size); 354 crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
413 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 413 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
414 (extent_base + extent_offset)); 414 (extent_base + extent_offset));
415 if (rc) { 415 if (rc) {
416 ecryptfs_printk(KERN_ERR, "Error attempting to " 416 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
417 "derive IV for extent [0x%.16x]; " 417 "extent [0x%.16llx]; rc = [%d]\n",
418 "rc = [%d]\n", (extent_base + extent_offset), 418 (unsigned long long)(extent_base + extent_offset), rc);
419 rc);
420 goto out; 419 goto out;
421 } 420 }
422 if (unlikely(ecryptfs_verbosity > 0)) { 421 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
443 } 442 }
444 rc = 0; 443 rc = 0;
445 if (unlikely(ecryptfs_verbosity > 0)) { 444 if (unlikely(ecryptfs_verbosity > 0)) {
446 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " 445 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
447 "rc = [%d]\n", (extent_base + extent_offset), 446 "rc = [%d]\n",
448 rc); 447 (unsigned long long)(extent_base + extent_offset), rc);
449 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " 448 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
450 "encryption:\n"); 449 "encryption:\n");
451 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); 450 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
540 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 539 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
541 (extent_base + extent_offset)); 540 (extent_base + extent_offset));
542 if (rc) { 541 if (rc) {
543 ecryptfs_printk(KERN_ERR, "Error attempting to " 542 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
544 "derive IV for extent [0x%.16x]; " 543 "extent [0x%.16llx]; rc = [%d]\n",
545 "rc = [%d]\n", (extent_base + extent_offset), 544 (unsigned long long)(extent_base + extent_offset), rc);
546 rc);
547 goto out; 545 goto out;
548 } 546 }
549 if (unlikely(ecryptfs_verbosity > 0)) { 547 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
571 } 569 }
572 rc = 0; 570 rc = 0;
573 if (unlikely(ecryptfs_verbosity > 0)) { 571 if (unlikely(ecryptfs_verbosity > 0)) {
574 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; " 572 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
575 "rc = [%d]\n", (extent_base + extent_offset), 573 "rc = [%d]\n",
576 rc); 574 (unsigned long long)(extent_base + extent_offset), rc);
577 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " 575 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
578 "decryption:\n"); 576 "decryption:\n");
579 ecryptfs_dump_hex((char *)(page_address(page) 577 ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
780 } 778 }
781 ecryptfs_printk(KERN_DEBUG, 779 ecryptfs_printk(KERN_DEBUG,
782 "Initializing cipher [%s]; strlen = [%d]; " 780 "Initializing cipher [%s]; strlen = [%d]; "
783 "key_size_bits = [%d]\n", 781 "key_size_bits = [%zd]\n",
784 crypt_stat->cipher, (int)strlen(crypt_stat->cipher), 782 crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
785 crypt_stat->key_size << 3); 783 crypt_stat->key_size << 3);
786 if (crypt_stat->tfm) { 784 if (crypt_stat->tfm) {
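Note: the format-string fixes follow two portable-printk rules: size_t takes %zd/%zu, and 64-bit offsets built from loff_t are cast to unsigned long long for %llx, so the code prints identically on 32- and 64-bit builds. A compact sketch:

static void example_log(size_t key_size, loff_t extent)
{
        printk(KERN_DEBUG "key size [%zd]\n", key_size); /* size_t: %zd/%zu */
        printk(KERN_DEBUG "extent [0x%.16llx]\n",
               (unsigned long long)extent);      /* cast 64-bit offsets */
}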
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f79..6fc4f319b550 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
44 */ 44 */
45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU)
54 return -ECHILD;
55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
53 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
54 goto out; 59 goto out;
55 dentry_save = nd->path.dentry; 60 dentry_save = nd->path.dentry;
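Note: deferring the lower-dentry lookups until after the LOOKUP_RCU check matters: under RCU-walk, d_revalidate must not sleep or take references, so stacked filesystems simply return -ECHILD and let the VFS retry in ref-walk mode. The shape of that bail-out (sketch):

static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD; /* can't sleep; VFS retries in ref-walk */

        /* ... normal, sleepable revalidation ... */
        return 1;
}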
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
192 (((struct user_key_payload*)key->payload.data)->data); 192 (((struct user_key_payload*)key->payload.data)->data);
193} 193}
194 194
195#define ECRYPTFS_SUPER_MAGIC 0xf15f
196#define ECRYPTFS_MAX_KEYSET_SIZE 1024 195#define ECRYPTFS_MAX_KEYSET_SIZE 1024
197#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 196#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
198#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 197#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -377,6 +376,7 @@ struct ecryptfs_mount_crypt_stat {
377#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 376#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
378#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 377#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
379#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 378#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
379#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080
380 u32 flags; 380 u32 flags;
381 struct list_head global_auth_tok_list; 381 struct list_head global_auth_tok_list;
382 struct mutex global_auth_tok_list_mutex; 382 struct mutex global_auth_tok_list_mutex;
@@ -477,7 +477,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
477static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
478ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
479{ 479{
480 return (struct ecryptfs_file_info *)file->private_data; 480 return file->private_data;
481} 481}
482 482
483static inline void 483static inline void
@@ -583,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
583 583
584#define ecryptfs_printk(type, fmt, arg...) \ 584#define ecryptfs_printk(type, fmt, arg...) \
585 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); 585 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
586__attribute__ ((format(printf, 1, 2)))
586void __ecryptfs_printk(const char *fmt, ...); 587void __ecryptfs_printk(const char *fmt, ...);
587 588
588extern const struct file_operations ecryptfs_main_fops; 589extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/compat.h> 32#include <linux/compat.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/smp_lock.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
36 35
37/** 36/**
@@ -48,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
48 const struct iovec *iov, 47 const struct iovec *iov,
49 unsigned long nr_segs, loff_t pos) 48 unsigned long nr_segs, loff_t pos)
50{ 49{
51 int rc; 50 ssize_t rc;
52 struct dentry *lower_dentry; 51 struct dentry *lower_dentry;
53 struct vfsmount *lower_vfsmount; 52 struct vfsmount *lower_vfsmount;
54 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
@@ -192,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
192 | ECRYPTFS_ENCRYPTED); 191 | ECRYPTFS_ENCRYPTED);
193 } 192 }
194 mutex_unlock(&crypt_stat->cs_mutex); 193 mutex_unlock(&crypt_stat->cs_mutex);
195 if (!ecryptfs_inode_to_private(inode)->lower_file) { 194 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
196 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 195 if (rc) {
197 if (rc) { 196 printk(KERN_ERR "%s: Error attempting to initialize "
198 printk(KERN_ERR "%s: Error attempting to initialize " 197 "the persistent file for the dentry with name "
199 "the persistent file for the dentry with name " 198 "[%s]; rc = [%d]\n", __func__,
200 "[%s]; rc = [%d]\n", __func__, 199 ecryptfs_dentry->d_name.name, rc);
201 ecryptfs_dentry->d_name.name, rc); 200 goto out_free;
202 goto out_free;
203 }
204 } 201 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) 202 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
206 && !(file->f_flags & O_RDONLY)) { 203 == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
207 rc = -EPERM; 204 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " 205 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__); 206 "file must hence be opened RO\n", __func__);
@@ -244,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
244 } 241 }
245 } 242 }
246 mutex_unlock(&crypt_stat->cs_mutex); 243 mutex_unlock(&crypt_stat->cs_mutex);
247 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " 244 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
248 "size: [0x%.16x]\n", inode, inode->i_ino, 245 "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
249 i_size_read(inode)); 246 (unsigned long long)i_size_read(inode));
250 goto out; 247 goto out;
251out_free: 248out_free:
252 kmem_cache_free(ecryptfs_file_info_cache, 249 kmem_cache_free(ecryptfs_file_info_cache,
@@ -284,11 +281,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
284 int rc = 0; 281 int rc = 0;
285 struct file *lower_file = NULL; 282 struct file *lower_file = NULL;
286 283
287 lock_kernel();
288 lower_file = ecryptfs_file_to_lower(file); 284 lower_file = ecryptfs_file_to_lower(file);
289 if (lower_file->f_op && lower_file->f_op->fasync) 285 if (lower_file->f_op && lower_file->f_op->fasync)
290 rc = lower_file->f_op->fasync(fd, lower_file, flag); 286 rc = lower_file->f_op->fasync(fd, lower_file, flag);
291 unlock_kernel();
292 return rc; 287 return rc;
293} 288}
294 289
@@ -332,6 +327,7 @@ const struct file_operations ecryptfs_dir_fops = {
332 .fsync = ecryptfs_fsync, 327 .fsync = ecryptfs_fsync,
333 .fasync = ecryptfs_fasync, 328 .fasync = ecryptfs_fasync,
334 .splice_read = generic_file_splice_read, 329 .splice_read = generic_file_splice_read,
330 .llseek = default_llseek,
335}; 331};
336 332
337const struct file_operations ecryptfs_main_fops = { 333const struct file_operations ecryptfs_main_fops = {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3fbc94203380..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/xattr.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
70 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 71 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
71 struct dentry *dentry_save; 72 struct dentry *dentry_save;
72 struct vfsmount *vfsmount_save; 73 struct vfsmount *vfsmount_save;
74 unsigned int flags_save;
73 int rc; 75 int rc;
74 76
75 dentry_save = nd->path.dentry; 77 dentry_save = nd->path.dentry;
76 vfsmount_save = nd->path.mnt; 78 vfsmount_save = nd->path.mnt;
79 flags_save = nd->flags;
77 nd->path.dentry = lower_dentry; 80 nd->path.dentry = lower_dentry;
78 nd->path.mnt = lower_mnt; 81 nd->path.mnt = lower_mnt;
82 nd->flags &= ~LOOKUP_OPEN;
79 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
80 nd->path.dentry = dentry_save; 84 nd->path.dentry = dentry_save;
81 nd->path.mnt = vfsmount_save; 85 nd->path.mnt = vfsmount_save;
86 nd->flags = flags_save;
82 return rc; 87 return rc;
83} 88}
84 89
@@ -180,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
180 "context; rc = [%d]\n", rc); 185 "context; rc = [%d]\n", rc);
181 goto out; 186 goto out;
182 } 187 }
183 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 188 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
184 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 189 if (rc) {
185 if (rc) { 190 printk(KERN_ERR "%s: Error attempting to initialize "
186 printk(KERN_ERR "%s: Error attempting to initialize " 191 "the persistent file for the dentry with name "
187 "the persistent file for the dentry with name " 192 "[%s]; rc = [%d]\n", __func__,
188 "[%s]; rc = [%d]\n", __func__, 193 ecryptfs_dentry->d_name.name, rc);
189 ecryptfs_dentry->d_name.name, rc); 194 goto out;
190 goto out;
191 }
192 } 195 }
193 rc = ecryptfs_write_metadata(ecryptfs_dentry); 196 rc = ecryptfs_write_metadata(ecryptfs_dentry);
194 if (rc) { 197 if (rc) {
@@ -255,7 +258,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
255 ecryptfs_dentry->d_parent)); 258 ecryptfs_dentry->d_parent));
256 lower_inode = lower_dentry->d_inode; 259 lower_inode = lower_dentry->d_inode;
257 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); 260 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
258 BUG_ON(!atomic_read(&lower_dentry->d_count)); 261 BUG_ON(!lower_dentry->d_count);
259 ecryptfs_set_dentry_private(ecryptfs_dentry, 262 ecryptfs_set_dentry_private(ecryptfs_dentry,
260 kmem_cache_alloc(ecryptfs_dentry_info_cache, 263 kmem_cache_alloc(ecryptfs_dentry_info_cache,
261 GFP_KERNEL)); 264 GFP_KERNEL));
@@ -297,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
297 rc = -ENOMEM; 300 rc = -ENOMEM;
298 goto out; 301 goto out;
299 } 302 }
300 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 303 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
301 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 304 if (rc) {
302 if (rc) { 305 printk(KERN_ERR "%s: Error attempting to initialize "
303 printk(KERN_ERR "%s: Error attempting to initialize " 306 "the persistent file for the dentry with name "
304 "the persistent file for the dentry with name " 307 "[%s]; rc = [%d]\n", __func__,
305 "[%s]; rc = [%d]\n", __func__, 308 ecryptfs_dentry->d_name.name, rc);
306 ecryptfs_dentry->d_name.name, rc); 309 goto out_free_kmem;
307 goto out_free_kmem;
308 }
309 } 310 }
310 crypt_stat = &ecryptfs_inode_to_private( 311 crypt_stat = &ecryptfs_inode_to_private(
311 ecryptfs_dentry->d_inode)->crypt_stat; 312 ecryptfs_dentry->d_inode)->crypt_stat;
@@ -436,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
436 struct qstr lower_name; 437 struct qstr lower_name;
437 int rc = 0; 438 int rc = 0;
438 439
439 ecryptfs_dentry->d_op = &ecryptfs_dops;
440 if ((ecryptfs_dentry->d_name.len == 1 440 if ((ecryptfs_dentry->d_name.len == 1
441 && !strcmp(ecryptfs_dentry->d_name.name, ".")) 441 && !strcmp(ecryptfs_dentry->d_name.name, "."))
442 || (ecryptfs_dentry->d_name.len == 2 442 || (ecryptfs_dentry->d_name.len == 2
@@ -449,7 +449,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
449 lower_name.hash = ecryptfs_dentry->d_name.hash; 449 lower_name.hash = ecryptfs_dentry->d_name.hash;
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
452 &lower_name); 452 lower_dir_dentry->d_inode, &lower_name);
453 if (rc < 0) 453 if (rc < 0)
454 goto out_d_drop; 454 goto out_d_drop;
455 } 455 }
@@ -484,7 +484,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
487 &lower_name); 487 lower_dir_dentry->d_inode, &lower_name);
488 if (rc < 0) 488 if (rc < 0)
489 goto out_d_drop; 489 goto out_d_drop;
490 } 490 }
@@ -975,8 +975,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
975} 975}
976 976
977static int 977static int
978ecryptfs_permission(struct inode *inode, int mask) 978ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
979{ 979{
980 if (flags & IPERM_FLAG_RCU)
981 return -ECHILD;
980 return inode_permission(ecryptfs_inode_to_lower(inode), mask); 982 return inode_permission(ecryptfs_inode_to_lower(inode), mask);
981} 983}
982 984
@@ -1108,10 +1110,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1108 rc = -EOPNOTSUPP; 1110 rc = -EOPNOTSUPP;
1109 goto out; 1111 goto out;
1110 } 1112 }
1111 mutex_lock(&lower_dentry->d_inode->i_mutex); 1113
1112 rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, 1114 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1113 size, flags);
1114 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1115out: 1115out:
1116 return rc; 1116 return rc;
1117} 1117}
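Note: switching to vfs_setxattr() is more than a cleanup: the helper takes the lower inode's i_mutex and runs the security hooks itself, both of which the open-coded ->setxattr path had to get right by hand. A sketch of the delegated form:

static int example_set_lower_xattr(struct dentry *lower_dentry,
                                   const char *name, const void *value,
                                   size_t size, int flags)
{
        /* i_mutex and LSM checks are handled inside vfs_setxattr() */
        return vfs_setxattr(lower_dentry, name, value, size, flags);
}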
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 73811cfa2ea4..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
59 break; 59 break;
60 default: 60 default:
61 ecryptfs_printk(KERN_WARNING, "Unknown error code: " 61 ecryptfs_printk(KERN_WARNING, "Unknown error code: "
62 "[0x%.16x]\n", err_code); 62 "[0x%.16lx]\n", err_code);
63 rc = -EINVAL; 63 rc = -EINVAL;
64 } 64 }
65 return rc; 65 return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
130 } else { 130 } else {
131 rc = -EINVAL; 131 rc = -EINVAL;
132 ecryptfs_printk(KERN_WARNING, 132 ecryptfs_printk(KERN_WARNING,
133 "Unsupported packet size: [%d]\n", size); 133 "Unsupported packet size: [%zd]\n", size);
134 } 134 }
135 return rc; 135 return rc;
136} 136}
@@ -446,6 +446,7 @@ out:
446 */ 446 */
447static int 447static int
448ecryptfs_find_auth_tok_for_sig( 448ecryptfs_find_auth_tok_for_sig(
449 struct key **auth_tok_key,
449 struct ecryptfs_auth_tok **auth_tok, 450 struct ecryptfs_auth_tok **auth_tok,
450 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 451 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
451 char *sig) 452 char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
453 struct ecryptfs_global_auth_tok *global_auth_tok; 454 struct ecryptfs_global_auth_tok *global_auth_tok;
454 int rc = 0; 455 int rc = 0;
455 456
457 (*auth_tok_key) = NULL;
456 (*auth_tok) = NULL; 458 (*auth_tok) = NULL;
457 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 459 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
458 mount_crypt_stat, sig)) { 460 mount_crypt_stat, sig)) {
459 struct key *auth_tok_key;
460 461
461 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, 462 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
463 * mount_crypt_stat structure, we prevent the use of auth toks that
464 * were not inserted through the ecryptfs_add_global_auth_tok
465 * function.
466 */
467 if (mount_crypt_stat->flags
468 & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
469 return -EINVAL;
470
471 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
462 sig); 472 sig);
463 } else 473 } else
464 (*auth_tok) = global_auth_tok->global_auth_tok; 474 (*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
509 char *filename, size_t filename_size) 519 char *filename, size_t filename_size)
510{ 520{
511 struct ecryptfs_write_tag_70_packet_silly_stack *s; 521 struct ecryptfs_write_tag_70_packet_silly_stack *s;
522 struct key *auth_tok_key = NULL;
512 int rc = 0; 523 int rc = 0;
513 524
514 s = kmalloc(sizeof(*s), GFP_KERNEL); 525 s = kmalloc(sizeof(*s), GFP_KERNEL);
@@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
606 } 617 }
607 dest[s->i++] = s->cipher_code; 618 dest[s->i++] = s->cipher_code;
608 rc = ecryptfs_find_auth_tok_for_sig( 619 rc = ecryptfs_find_auth_tok_for_sig(
620 &auth_tok_key,
609 &s->auth_tok, mount_crypt_stat, 621 &s->auth_tok, mount_crypt_stat,
610 mount_crypt_stat->global_default_fnek_sig); 622 mount_crypt_stat->global_default_fnek_sig);
611 if (rc) { 623 if (rc) {
@@ -753,6 +765,8 @@ out_free_unlock:
753out_unlock: 765out_unlock:
754 mutex_unlock(s->tfm_mutex); 766 mutex_unlock(s->tfm_mutex);
755out: 767out:
768 if (auth_tok_key)
769 key_put(auth_tok_key);
756 kfree(s); 770 kfree(s);
757 return rc; 771 return rc;
758} 772}
@@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
798 char *data, size_t max_packet_size) 812 char *data, size_t max_packet_size)
799{ 813{
800 struct ecryptfs_parse_tag_70_packet_silly_stack *s; 814 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
815 struct key *auth_tok_key = NULL;
801 int rc = 0; 816 int rc = 0;
802 817
803 (*packet_size) = 0; 818 (*packet_size) = 0;
@@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
910 * >= ECRYPTFS_MAX_IV_BYTES. */ 925 * >= ECRYPTFS_MAX_IV_BYTES. */
911 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 926 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
912 s->desc.info = s->iv; 927 s->desc.info = s->iv;
913 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, 928 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
929 &s->auth_tok, mount_crypt_stat,
914 s->fnek_sig_hex); 930 s->fnek_sig_hex);
915 if (rc) { 931 if (rc) {
916 printk(KERN_ERR "%s: Error attempting to find auth tok for " 932 printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -986,6 +1002,8 @@ out:
986 (*filename_size) = 0; 1002 (*filename_size) = 0;
987 (*filename) = NULL; 1003 (*filename) = NULL;
988 } 1004 }
1005 if (auth_tok_key)
1006 key_put(auth_tok_key);
989 kfree(s); 1007 kfree(s);
990 return rc; 1008 return rc;
991} 1009}
@@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1557 ECRYPTFS_VERSION_MAJOR, 1575 ECRYPTFS_VERSION_MAJOR,
1558 ECRYPTFS_VERSION_MINOR); 1576 ECRYPTFS_VERSION_MINOR);
1559 rc = -EINVAL; 1577 rc = -EINVAL;
1560 goto out; 1578 goto out_release_key;
1561 } 1579 }
1562 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD 1580 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
1563 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { 1581 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
1564 printk(KERN_ERR "Invalid auth_tok structure " 1582 printk(KERN_ERR "Invalid auth_tok structure "
1565 "returned from key query\n"); 1583 "returned from key query\n");
1566 rc = -EINVAL; 1584 rc = -EINVAL;
1567 goto out; 1585 goto out_release_key;
1586 }
1587out_release_key:
1588 if (rc) {
1589 key_put(*auth_tok_key);
1590 (*auth_tok_key) = NULL;
1568 } 1591 }
1569out: 1592out:
1570 return rc; 1593 return rc;
@@ -1649,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1649 auth_tok->session_key.decrypted_key_size); 1672 auth_tok->session_key.decrypted_key_size);
1650 crypt_stat->flags |= ECRYPTFS_KEY_VALID; 1673 crypt_stat->flags |= ECRYPTFS_KEY_VALID;
1651 if (unlikely(ecryptfs_verbosity > 0)) { 1674 if (unlikely(ecryptfs_verbosity > 0)) {
1652 ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n", 1675 ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
1653 crypt_stat->key_size); 1676 crypt_stat->key_size);
1654 ecryptfs_dump_hex(crypt_stat->key, 1677 ecryptfs_dump_hex(crypt_stat->key,
1655 crypt_stat->key_size); 1678 crypt_stat->key_size);
@@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1688 struct ecryptfs_auth_tok_list_item *auth_tok_list_item; 1711 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
1689 size_t tag_11_contents_size; 1712 size_t tag_11_contents_size;
1690 size_t tag_11_packet_size; 1713 size_t tag_11_packet_size;
1714 struct key *auth_tok_key = NULL;
1691 int rc = 0; 1715 int rc = 0;
1692 1716
1693 INIT_LIST_HEAD(&auth_tok_list); 1717 INIT_LIST_HEAD(&auth_tok_list);
@@ -1730,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1730 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { 1754 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
1731 ecryptfs_printk(KERN_ERR, "Expected " 1755 ecryptfs_printk(KERN_ERR, "Expected "
1732 "signature of size [%d]; " 1756 "signature of size [%d]; "
1733 "read size [%d]\n", 1757 "read size [%zd]\n",
1734 ECRYPTFS_SIG_SIZE, 1758 ECRYPTFS_SIG_SIZE,
1735 tag_11_contents_size); 1759 tag_11_contents_size);
1736 rc = -EIO; 1760 rc = -EIO;
@@ -1763,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1763 goto out_wipe_list; 1787 goto out_wipe_list;
1764 break; 1788 break;
1765 default: 1789 default:
1766 ecryptfs_printk(KERN_DEBUG, "No packet at offset " 1790 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
1767 "[%d] of the file header; hex value of " 1791 "of the file header; hex value of "
1768 "character is [0x%.2x]\n", i, src[i]); 1792 "character is [0x%.2x]\n", i, src[i]);
1769 next_packet_is_auth_tok_packet = 0; 1793 next_packet_is_auth_tok_packet = 0;
1770 } 1794 }
@@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1784 * just one will be sufficient to decrypt to get the FEK. */ 1808 * just one will be sufficient to decrypt to get the FEK. */
1785find_next_matching_auth_tok: 1809find_next_matching_auth_tok:
1786 found_auth_tok = 0; 1810 found_auth_tok = 0;
1811 if (auth_tok_key) {
1812 key_put(auth_tok_key);
1813 auth_tok_key = NULL;
1814 }
1787 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { 1815 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
1788 candidate_auth_tok = &auth_tok_list_item->auth_tok; 1816 candidate_auth_tok = &auth_tok_list_item->auth_tok;
1789 if (unlikely(ecryptfs_verbosity > 0)) { 1817 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1800,10 +1828,11 @@ find_next_matching_auth_tok:
1800 rc = -EINVAL; 1828 rc = -EINVAL;
1801 goto out_wipe_list; 1829 goto out_wipe_list;
1802 } 1830 }
1803 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, 1831 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
1832 &matching_auth_tok,
1804 crypt_stat->mount_crypt_stat, 1833 crypt_stat->mount_crypt_stat,
1805 candidate_auth_tok_sig); 1834 candidate_auth_tok_sig);
1806 if (matching_auth_tok) { 1835 if (!rc) {
1807 found_auth_tok = 1; 1836 found_auth_tok = 1;
1808 goto found_matching_auth_tok; 1837 goto found_matching_auth_tok;
1809 } 1838 }
@@ -1835,8 +1864,8 @@ found_matching_auth_tok:
1835 "session key for authentication token with sig " 1864 "session key for authentication token with sig "
1836 "[%.*s]; rc = [%d]. Removing auth tok " 1865 "[%.*s]; rc = [%d]. Removing auth tok "
1837 "candidate from the list and searching for " 1866 "candidate from the list and searching for "
1838 "the next match.\n", candidate_auth_tok_sig, 1867 "the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
1839 ECRYPTFS_SIG_SIZE_HEX, rc); 1868 candidate_auth_tok_sig, rc);
1840 list_for_each_entry_safe(auth_tok_list_item, 1869 list_for_each_entry_safe(auth_tok_list_item,
1841 auth_tok_list_item_tmp, 1870 auth_tok_list_item_tmp,
1842 &auth_tok_list, list) { 1871 &auth_tok_list, list) {
@@ -1866,6 +1895,8 @@ found_matching_auth_tok:
1866out_wipe_list: 1895out_wipe_list:
1867 wipe_auth_tok_list(&auth_tok_list); 1896 wipe_auth_tok_list(&auth_tok_list);
1868out: 1897out:
1898 if (auth_tok_key)
1899 key_put(auth_tok_key);
1869 return rc; 1900 return rc;
1870} 1901}
1871 1902
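
ecryptfs_parse_packet_set() now tracks auth_tok_key across the candidate loop: the held key pins the auth token's memory while it is in use, is dropped before each re-search, and is dropped once more on the way out. The shape of that lifetime, reduced to control flow; find_tok(), try_decrypt(), and remove_candidate() are hypothetical stand-ins for the real helpers:

static int walk_candidates_sketch(const char *sig)
{
	struct key *auth_tok_key = NULL;
	struct ecryptfs_auth_tok *tok;
	int rc;

find_next:
	if (auth_tok_key) {
		key_put(auth_tok_key);	/* finished with the previous candidate's key */
		auth_tok_key = NULL;
	}
	rc = find_tok(&auth_tok_key, &tok, sig);	/* returns with a key reference held */
	if (rc)
		goto out;
	if (!try_decrypt(tok)) {	/* tok stays valid: the key reference pins it */
		remove_candidate(sig);	/* drop the failed candidate and retry */
		goto find_next;
	}
	rc = 0;
out:
	if (auth_tok_key)
		key_put(auth_tok_key);
	return rc;
}
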
@@ -2137,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2137 if (encrypted_session_key_valid) { 2168 if (encrypted_session_key_valid) {
2138 ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " 2169 ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
2139 "using auth_tok->session_key.encrypted_key, " 2170 "using auth_tok->session_key.encrypted_key, "
2140 "where key_rec->enc_key_size = [%d]\n", 2171 "where key_rec->enc_key_size = [%zd]\n",
2141 key_rec->enc_key_size); 2172 key_rec->enc_key_size);
2142 memcpy(key_rec->enc_key, 2173 memcpy(key_rec->enc_key,
2143 auth_tok->session_key.encrypted_key, 2174 auth_tok->session_key.encrypted_key,
@@ -2167,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2167 if (rc < 1 || rc > 2) { 2198 if (rc < 1 || rc > 2) {
2168 ecryptfs_printk(KERN_ERR, "Error generating scatterlist " 2199 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
2169 "for crypt_stat session key; expected rc = 1; " 2200 "for crypt_stat session key; expected rc = 1; "
2170 "got rc = [%d]. key_rec->enc_key_size = [%d]\n", 2201 "got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
2171 rc, key_rec->enc_key_size); 2202 rc, key_rec->enc_key_size);
2172 rc = -ENOMEM; 2203 rc = -ENOMEM;
2173 goto out; 2204 goto out;
@@ -2178,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2178 ecryptfs_printk(KERN_ERR, "Error generating scatterlist " 2209 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
2179 "for crypt_stat encrypted session key; " 2210 "for crypt_stat encrypted session key; "
2180 "expected rc = 1; got rc = [%d]. " 2211 "expected rc = 1; got rc = [%d]. "
2181 "key_rec->enc_key_size = [%d]\n", rc, 2212 "key_rec->enc_key_size = [%zd]\n", rc,
2182 key_rec->enc_key_size); 2213 key_rec->enc_key_size);
2183 rc = -ENOMEM; 2214 rc = -ENOMEM;
2184 goto out; 2215 goto out;
@@ -2193,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2193 goto out; 2224 goto out;
2194 } 2225 }
2195 rc = 0; 2226 rc = 0;
2196 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", 2227 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
2197 crypt_stat->key_size); 2228 crypt_stat->key_size);
2198 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, 2229 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
2199 (*key_rec).enc_key_size); 2230 (*key_rec).enc_key_size);
@@ -2204,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2204 } 2235 }
2205 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); 2236 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
2206 if (ecryptfs_verbosity > 0) { 2237 if (ecryptfs_verbosity > 0) {
2207 ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n", 2238 ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
2208 key_rec->enc_key_size); 2239 key_rec->enc_key_size);
2209 ecryptfs_dump_hex(key_rec->enc_key, 2240 ecryptfs_dump_hex(key_rec->enc_key,
2210 key_rec->enc_key_size); 2241 key_rec->enc_key_size);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/magic.h>
39#include "ecryptfs_kernel.h" 40#include "ecryptfs_kernel.h"
40 41
41/** 42/**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
141 return rc; 142 return rc;
142} 143}
143 144
144/** 145static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
145 * ecryptfs_interpose 146 struct super_block *sb)
146 * @lower_dentry: Existing dentry in the lower filesystem
147 * @dentry: ecryptfs' dentry
148 * @sb: ecryptfs's super_block
149 * @flags: flags to govern behavior of interpose procedure
150 *
151 * Interposes upper and lower dentries.
152 *
153 * Returns zero on success; non-zero otherwise
154 */
155int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
156 struct super_block *sb, u32 flags)
157{ 147{
158 struct inode *lower_inode;
159 struct inode *inode; 148 struct inode *inode;
160 int rc = 0; 149 int rc = 0;
161 150
162 lower_inode = lower_dentry->d_inode;
163 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { 151 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
164 rc = -EXDEV; 152 rc = -EXDEV;
165 goto out; 153 goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
189 if (special_file(lower_inode->i_mode)) 177 if (special_file(lower_inode->i_mode))
190 init_special_inode(inode, lower_inode->i_mode, 178 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 179 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops;
193 fsstack_copy_attr_all(inode, lower_inode); 180 fsstack_copy_attr_all(inode, lower_inode);
194 /* This size will be overwritten for real files w/ headers and 181 /* This size will be overwritten for real files w/ headers and
195 * other metadata */ 182 * other metadata */
196 fsstack_copy_inode_size(inode, lower_inode); 183 fsstack_copy_inode_size(inode, lower_inode);
184 return inode;
185out:
186 return ERR_PTR(rc);
187}
188
189/**
190 * ecryptfs_interpose
191 * @lower_dentry: Existing dentry in the lower filesystem
192 * @dentry: ecryptfs' dentry
193 * @sb: ecryptfs's super_block
194 * @flags: flags to govern behavior of interpose procedure
195 *
196 * Interposes upper and lower dentries.
197 *
198 * Returns zero on success; non-zero otherwise
199 */
200int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
201 struct super_block *sb, u32 flags)
202{
203 struct inode *lower_inode = lower_dentry->d_inode;
204 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
205 if (IS_ERR(inode))
206 return PTR_ERR(inode);
197 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) 207 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
198 d_add(dentry, inode); 208 d_add(dentry, inode);
199 else 209 else
200 d_instantiate(dentry, inode); 210 d_instantiate(dentry, inode);
201out: 211 return 0;
202 return rc;
203} 212}
204 213
205enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, 214enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
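
Splitting ecryptfs_get_inode() out of ecryptfs_interpose() leans on the standard ERR_PTR encoding, so one pointer return carries either an inode or an errno. The idiom, stripped to essentials; lower_sb_of() and build_upper_inode() are placeholders for the real work:

static struct inode *get_upper_inode(struct inode *lower, struct super_block *sb)
{
	if (lower->i_sb != lower_sb_of(sb))	/* placeholder accessor */
		return ERR_PTR(-EXDEV);		/* the errno travels inside the pointer */
	return build_upper_inode(lower, sb);
}

/* every caller decodes the same way */
	inode = get_upper_inode(lower_inode, sb);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	d_instantiate(dentry, inode);
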
@@ -208,7 +217,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 217 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 218 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 219 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; 220 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
221 ecryptfs_opt_err };
212 222
213static const match_table_t tokens = { 223static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 224 {ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +233,7 @@ static const match_table_t tokens = {
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 233 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 234 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 235 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
236 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
226 {ecryptfs_opt_err, NULL} 237 {ecryptfs_opt_err, NULL}
227}; 238};
228 239
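
Adding a mount option here touches three places that must stay in sync: the enum, the tokens[] table, and the switch in ecryptfs_parse_options(). The parsing loop itself is the stock linux/parser.h pattern; a sketch of the loop consuming the new option:

	substring_t args[MAX_OPT_ARGS];
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;

		if (!*p)
			continue;
		token = match_token(p, tokens, args);	/* looks p up in the table above */
		switch (token) {
		case ecryptfs_opt_mount_auth_tok_only:
			mount_crypt_stat->flags |=
				ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
			break;
		default:
			printk(KERN_WARNING "Unrecognized mount option [%s]\n", p);
		}
	}
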
@@ -406,6 +417,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
406 case ecryptfs_opt_unlink_sigs: 417 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; 418 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break; 419 break;
420 case ecryptfs_opt_mount_auth_tok_only:
421 mount_crypt_stat->flags |=
422 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
423 break;
409 case ecryptfs_opt_err: 424 case ecryptfs_opt_err:
410 default: 425 default:
411 printk(KERN_WARNING 426 printk(KERN_WARNING
@@ -486,68 +501,21 @@ struct kmem_cache *ecryptfs_sb_info_cache;
486static struct file_system_type ecryptfs_fs_type; 501static struct file_system_type ecryptfs_fs_type;
487 502
488/** 503/**
489 * ecryptfs_read_super
490 * @sb: The ecryptfs super block
491 * @dev_name: The path to mount over
492 *
493 * Read the super block of the lower filesystem, and use
494 * ecryptfs_interpose to create our initial inode and super block
495 * struct.
496 */
497static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
498{
499 struct path path;
500 int rc;
501
502 rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
503 if (rc) {
504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
505 goto out;
506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
517 ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
518 ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
519 rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
520 if (rc)
521 goto out_free;
522 rc = 0;
523 goto out;
524out_free:
525 path_put(&path);
526out:
527 return rc;
528}
529
530/**
531 * ecryptfs_get_sb 504 * ecryptfs_get_sb
532 * @fs_type 505 * @fs_type
533 * @flags 506 * @flags
534 * @dev_name: The path to mount over 507 * @dev_name: The path to mount over
535 * @raw_data: The options passed into the kernel 508 * @raw_data: The options passed into the kernel
536 *
537 * The whole ecryptfs_get_sb process is broken into 3 functions:
538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
540 * ecryptfs_interpose to perform most of the linking
541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
542 */ 509 */
543static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 510static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
544 const char *dev_name, void *raw_data, 511 const char *dev_name, void *raw_data)
545 struct vfsmount *mnt)
546{ 512{
547 struct super_block *s; 513 struct super_block *s;
548 struct ecryptfs_sb_info *sbi; 514 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info; 515 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed"; 516 const char *err = "Getting sb failed";
517 struct inode *inode;
518 struct path path;
551 int rc; 519 int rc;
552 520
553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); 521 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -570,10 +538,8 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
570 538
571 s->s_flags = flags; 539 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); 540 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
573 if (rc) { 541 if (rc)
574 deactivate_locked_super(s); 542 goto out1;
575 goto out;
576 }
577 543
578 ecryptfs_set_superblock_private(s, sbi); 544 ecryptfs_set_superblock_private(s, sbi);
579 s->s_bdi = &sbi->bdi; 545 s->s_bdi = &sbi->bdi;
@@ -581,42 +547,62 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
581 /* ->kill_sb() will take care of sbi after that point */ 547 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL; 548 sbi = NULL;
583 s->s_op = &ecryptfs_sops; 549 s->s_op = &ecryptfs_sops;
550 s->s_d_op = &ecryptfs_dops;
584 551
585 rc = -ENOMEM; 552 err = "Reading sb failed";
586 s->s_root = d_alloc(NULL, &(const struct qstr) { 553 rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
587 .hash = 0,.name = "/",.len = 1}); 554 if (rc) {
555 ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
556 goto out1;
557 }
558 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
559 rc = -EINVAL;
560 printk(KERN_ERR "Mount on filesystem of type "
561 "eCryptfs explicitly disallowed due to "
562 "known incompatibilities\n");
563 goto out_free;
564 }
565 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
566 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
567 s->s_blocksize = path.dentry->d_sb->s_blocksize;
568 s->s_magic = ECRYPTFS_SUPER_MAGIC;
569
570 inode = ecryptfs_get_inode(path.dentry->d_inode, s);
571 rc = PTR_ERR(inode);
572 if (IS_ERR(inode))
573 goto out_free;
574
575 s->s_root = d_alloc_root(inode);
588 if (!s->s_root) { 576 if (!s->s_root) {
589 deactivate_locked_super(s); 577 iput(inode);
590 goto out; 578 rc = -ENOMEM;
579 goto out_free;
591 } 580 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595 581
582 rc = -ENOMEM;
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); 583 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) { 584 if (!root_info)
598 deactivate_locked_super(s); 585 goto out_free;
599 goto out; 586
600 }
601 /* ->kill_sb() will take care of root_info */ 587 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info); 588 ecryptfs_set_dentry_private(s->s_root, root_info);
589 ecryptfs_set_dentry_lower(s->s_root, path.dentry);
590 ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
591
603 s->s_flags |= MS_ACTIVE; 592 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name); 593 return dget(s->s_root);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612 594
595out_free:
596 path_put(&path);
597out1:
598 deactivate_locked_super(s);
613out: 599out:
614 if (sbi) { 600 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); 601 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi); 602 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 } 603 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc); 604 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
619 return rc; 605 return ERR_PTR(rc);
620} 606}
621 607
622/** 608/**
@@ -639,7 +625,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
639static struct file_system_type ecryptfs_fs_type = { 625static struct file_system_type ecryptfs_fs_type = {
640 .owner = THIS_MODULE, 626 .owner = THIS_MODULE,
641 .name = "ecryptfs", 627 .name = "ecryptfs",
642 .get_sb = ecryptfs_get_sb, 628 .mount = ecryptfs_mount,
643 .kill_sb = ecryptfs_kill_block_super, 629 .kill_sb = ecryptfs_kill_block_super,
644 .fs_flags = 0 630 .fs_flags = 0
645}; 631};
@@ -824,9 +810,10 @@ static int __init ecryptfs_init(void)
824 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " 810 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
825 "larger than the host's page size, and so " 811 "larger than the host's page size, and so "
826 "eCryptfs cannot run on this system. The " 812 "eCryptfs cannot run on this system. The "
827 "default eCryptfs extent size is [%d] bytes; " 813 "default eCryptfs extent size is [%u] bytes; "
828 "the page size is [%d] bytes.\n", 814 "the page size is [%lu] bytes.\n",
829 ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); 815 ECRYPTFS_DEFAULT_EXTENT_SIZE,
816 (unsigned long)PAGE_CACHE_SIZE);
830 goto out; 817 goto out;
831 } 818 }
832 rc = ecryptfs_init_kmem_caches(); 819 rc = ecryptfs_init_kmem_caches();
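
The ->get_sb to ->mount conversion above changes the contract: instead of filling a vfsmount via simple_set_mnt(), the method returns the root dentry (or an ERR_PTR) and the VFS builds the mount. eCryptfs needs the hand-rolled sget()-style setup shown, but for most filesystems the conversion is a one-line switch to a helper. A sketch, assuming a hypothetical foo filesystem:

static struct dentry *foo_mount(struct file_system_type *fs_type, int flags,
				const char *dev_name, void *data)
{
	/* mount_nodev()/mount_bdev()/mount_single() wrap the common
	 * fill_super + dget(s->s_root) sequence seen above */
	return mount_nodev(fs_type, flags, data, foo_fill_super);
}

static struct file_system_type foo_fs_type = {
	.owner	 = THIS_MODULE,
	.name	 = "foo",
	.mount	 = foo_mount,
	.kill_sb = kill_anon_super,
};
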
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 00208c3d7e92..940a82e63dc3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
482 .read = ecryptfs_miscdev_read, 482 .read = ecryptfs_miscdev_read,
483 .write = ecryptfs_miscdev_write, 483 .write = ecryptfs_miscdev_write,
484 .release = ecryptfs_miscdev_release, 484 .release = ecryptfs_miscdev_release,
485 .llseek = noop_llseek,
485}; 486};
486 487
487static struct miscdevice ecryptfs_miscdev = { 488static struct miscdevice ecryptfs_miscdev = {
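
The .llseek = noop_llseek additions scattered through this series make the seek behaviour of each fops table explicit: a NULL .llseek falls back to default_llseek() (which historically took the BKL), while noop_llseek() "succeeds" without moving f_pos, which is the right semantics for device-like files where position is meaningless. The annotation is one line per table; a sketch with placeholder foo_* names:

static const struct file_operations foo_dev_fops = {
	.owner	= THIS_MODULE,
	.read	= foo_read,
	.write	= foo_write,
	.llseek	= noop_llseek,	/* seeking this device is meaningless */
};
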
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
65 rc = ecryptfs_encrypt_page(page); 65 rc = ecryptfs_encrypt_page(page);
66 if (rc) { 66 if (rc) {
67 ecryptfs_printk(KERN_WARNING, "Error encrypting " 67 ecryptfs_printk(KERN_WARNING, "Error encrypting "
68 "page (upper index [0x%.16x])\n", page->index); 68 "page (upper index [0x%.16lx])\n", page->index);
69 ClearPageUptodate(page); 69 ClearPageUptodate(page);
70 goto out; 70 goto out;
71 } 71 }
@@ -237,7 +237,7 @@ out:
237 ClearPageUptodate(page); 237 ClearPageUptodate(page);
238 else 238 else
239 SetPageUptodate(page); 239 SetPageUptodate(page);
240 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", 240 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
241 page->index); 241 page->index);
242 unlock_page(page); 242 unlock_page(page);
243 return rc; 243 return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
290 return -ENOMEM; 290 return -ENOMEM;
291 *pagep = page; 291 *pagep = page;
292 292
293 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
293 if (!PageUptodate(page)) { 294 if (!PageUptodate(page)) {
294 struct ecryptfs_crypt_stat *crypt_stat = 295 struct ecryptfs_crypt_stat *crypt_stat =
295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat; 296 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
335 SetPageUptodate(page); 336 SetPageUptodate(page);
336 } 337 }
337 } else { 338 } else {
338 rc = ecryptfs_decrypt_page(page); 339 if (prev_page_end_size
339 if (rc) { 340 >= i_size_read(page->mapping->host)) {
340 printk(KERN_ERR "%s: Error decrypting page " 341 zero_user(page, 0, PAGE_CACHE_SIZE);
341 "at index [%ld]; rc = [%d]\n", 342 } else {
342 __func__, page->index, rc); 343 rc = ecryptfs_decrypt_page(page);
343 ClearPageUptodate(page); 344 if (rc) {
344 goto out; 345 printk(KERN_ERR "%s: Error decrypting "
346 "page at index [%ld]; "
347 "rc = [%d]\n",
348 __func__, page->index, rc);
349 ClearPageUptodate(page);
350 goto out;
351 }
345 } 352 }
346 SetPageUptodate(page); 353 SetPageUptodate(page);
347 } 354 }
348 } 355 }
349 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
350 /* If creating a page or more of holes, zero them out via truncate. 356 /* If creating a page or more of holes, zero them out via truncate.
351 * Note, this will increase i_size. */ 357 * Note, this will increase i_size. */
352 if (index != 0) { 358 if (index != 0) {
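
The write_begin change above is an optimization with a correctness edge: a page that lies entirely beyond i_size contains nothing decryptable, so running it through ecryptfs_decrypt_page() wastes work on garbage; zero-filling is the correct content for a hole. That is also why prev_page_end_size (the start offset of this page) now gets computed before the branch that needs it. The test in isolation, as a sketch:

	loff_t page_start = (loff_t)index << PAGE_CACHE_SHIFT;
	int rc = 0;

	if (page_start >= i_size_read(mapping->host))
		zero_user(page, 0, PAGE_CACHE_SIZE);	/* wholly past EOF: a hole */
	else
		rc = ecryptfs_decrypt_page(page);	/* only decrypt data that exists */

	if (rc)
		ClearPageUptodate(page);
	else
		SetPageUptodate(page);
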
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
488 } else 494 } else
489 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 495 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
490 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 496 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
491 "(page w/ index = [0x%.16x], to = [%d])\n", index, to); 497 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
492 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 498 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
493 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, 499 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
494 to); 500 to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
503 rc = fill_zeros_to_end_of_page(page, to); 509 rc = fill_zeros_to_end_of_page(page, to);
504 if (rc) { 510 if (rc) {
505 ecryptfs_printk(KERN_WARNING, "Error attempting to fill " 511 ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
506 "zeros in page with index = [0x%.16x]\n", index); 512 "zeros in page with index = [0x%.16lx]\n", index);
507 goto out; 513 goto out;
508 } 514 }
509 rc = ecryptfs_encrypt_page(page); 515 rc = ecryptfs_encrypt_page(page);
510 if (rc) { 516 if (rc) {
511 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " 517 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
512 "index [0x%.16x])\n", index); 518 "index [0x%.16lx])\n", index);
513 goto out; 519 goto out;
514 } 520 }
515 if (pos + copied > i_size_read(ecryptfs_inode)) { 521 if (pos + copied > i_size_read(ecryptfs_inode)) {
516 i_size_write(ecryptfs_inode, pos + copied); 522 i_size_write(ecryptfs_inode, pos + copied);
517 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 523 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
518 "[0x%.16x]\n", i_size_read(ecryptfs_inode)); 524 "[0x%.16llx]\n",
525 (unsigned long long)i_size_read(ecryptfs_inode));
519 } 526 }
520 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 527 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
521 if (rc) 528 if (rc)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f7fc286a3aa9..3042fe123a34 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/smp_lock.h>
32#include <linux/file.h> 31#include <linux/file.h>
33#include <linux/crypto.h> 32#include <linux/crypto.h>
34#include "ecryptfs_kernel.h" 33#include "ecryptfs_kernel.h"
@@ -63,6 +62,16 @@ out:
63 return inode; 62 return inode;
64} 63}
65 64
65static void ecryptfs_i_callback(struct rcu_head *head)
66{
67 struct inode *inode = container_of(head, struct inode, i_rcu);
68 struct ecryptfs_inode_info *inode_info;
69 inode_info = ecryptfs_inode_to_private(inode);
70
71 INIT_LIST_HEAD(&inode->i_dentry);
72 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
73}
74
66/** 75/**
67 * ecryptfs_destroy_inode 76 * ecryptfs_destroy_inode
68 * @inode: The ecryptfs inode 77 * @inode: The ecryptfs inode
@@ -89,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
89 } 98 }
90 } 99 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 100 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
92 kmem_cache_free(ecryptfs_inode_info_cache, inode_info); 101 call_rcu(&inode->i_rcu, ecryptfs_i_callback);
93} 102}
94 103
95/** 104/**
@@ -180,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
180 seq_printf(m, ",ecryptfs_encrypted_view"); 189 seq_printf(m, ",ecryptfs_encrypted_view");
181 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) 190 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
182 seq_printf(m, ",ecryptfs_unlink_sigs"); 191 seq_printf(m, ",ecryptfs_unlink_sigs");
192 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
193 seq_printf(m, ",ecryptfs_mount_auth_tok_only");
183 194
184 return 0; 195 return 0;
185} 196}
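
The i_callback conversion here (and in efs below) is groundwork for RCU-walk path lookup: once inodes can be inspected under rcu_read_lock(), their memory must stay valid for a grace period after ->destroy_inode(), so the final kmem_cache_free() moves into a call_rcu() callback. In this kernel i_rcu shares storage with i_dentry, which is why the callback re-initializes the list before freeing. The per-filesystem boilerplate is uniform; a sketch with placeholder foo_* names:

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu clobbered the union member */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* defer the free until after the current RCU grace period */
	call_rcu(&inode->i_rcu, foo_i_callback);
}
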
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f04942810818..0f31acb0131c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); 20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
21static int efs_fill_super(struct super_block *s, void *d, int silent); 21static int efs_fill_super(struct super_block *s, void *d, int silent);
22 22
23static int efs_get_sb(struct file_system_type *fs_type, 23static struct dentry *efs_mount(struct file_system_type *fs_type,
24 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 24 int flags, const char *dev_name, void *data)
25{ 25{
26 return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); 26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
27} 27}
28 28
29static struct file_system_type efs_fs_type = { 29static struct file_system_type efs_fs_type = {
30 .owner = THIS_MODULE, 30 .owner = THIS_MODULE,
31 .name = "efs", 31 .name = "efs",
32 .get_sb = efs_get_sb, 32 .mount = efs_mount,
33 .kill_sb = kill_block_super, 33 .kill_sb = kill_block_super,
34 .fs_flags = FS_REQUIRES_DEV, 34 .fs_flags = FS_REQUIRES_DEV,
35}; 35};
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
65 return &ei->vfs_inode; 65 return &ei->vfs_inode;
66} 66}
67 67
68static void efs_destroy_inode(struct inode *inode) 68static void efs_i_callback(struct rcu_head *head)
69{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
70 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); 72 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
71} 73}
72 74
75static void efs_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, efs_i_callback);
78}
79
73static void init_once(void *foo) 80static void init_once(void *foo)
74{ 81{
75 struct efs_inode_info *ei = (struct efs_inode_info *) foo; 82 struct efs_inode_info *ei = (struct efs_inode_info *) foo;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf88..e0194b3e14d6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
293 .poll = eventfd_poll, 293 .poll = eventfd_poll,
294 .read = eventfd_read, 294 .read = eventfd_read,
295 .write = eventfd_write, 295 .write = eventfd_write,
296 .llseek = noop_llseek,
296}; 297};
297 298
298/** 299/**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919cb..267d0ada4541 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
77/* Maximum number of nesting allowed inside epoll sets */ 77/* Maximum number of nesting allowed inside epoll sets */
78#define EP_MAX_NESTS 4 78#define EP_MAX_NESTS 4
79 79
80/* Maximum msec timeout value storeable in a long int */
81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
82
83#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 80#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
84 81
85#define EP_UNACTIVE_PTR ((void *) -1L) 82#define EP_UNACTIVE_PTR ((void *) -1L)
@@ -220,7 +217,7 @@ struct ep_send_events_data {
220 * Configuration options available inside /proc/sys/fs/epoll/ 217 * Configuration options available inside /proc/sys/fs/epoll/
221 */ 218 */
222/* Maximum number of epoll watched descriptors, per user */ 219/* Maximum number of epoll watched descriptors, per user */
223static int max_user_watches __read_mostly; 220static long max_user_watches __read_mostly;
224 221
225/* 222/*
226 * This mutex is used to serialize ep_free() and eventpoll_release_file(). 223 * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -243,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
243 240
244#include <linux/sysctl.h> 241#include <linux/sysctl.h>
245 242
246static int zero; 243static long zero;
244static long long_max = LONG_MAX;
247 245
248ctl_table epoll_table[] = { 246ctl_table epoll_table[] = {
249 { 247 {
250 .procname = "max_user_watches", 248 .procname = "max_user_watches",
251 .data = &max_user_watches, 249 .data = &max_user_watches,
252 .maxlen = sizeof(int), 250 .maxlen = sizeof(max_user_watches),
253 .mode = 0644, 251 .mode = 0644,
254 .proc_handler = proc_dointvec_minmax, 252 .proc_handler = proc_doulongvec_minmax,
255 .extra1 = &zero, 253 .extra1 = &zero,
254 .extra2 = &long_max,
256 }, 255 },
257 { } 256 { }
258}; 257};
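
Widening max_user_watches from int to long has to happen in lock-step across the sysctl plumbing: proc_doulongvec_minmax() dereferences .data, .extra1, and .extra2 as longs, so a stale int in any slot would read or write the wrong width. The same invariant is called out below in comments on the entry the hunk above produces:

static long zero;			/* bounds are dereferenced as long */
static long long_max = LONG_MAX;

ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,		/* now a long */
		.maxlen		= sizeof(max_user_watches),	/* tracks the type */
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,	/* long-aware handler */
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};

The matching per-user counter moves to atomic_long_t as well, read and updated with the atomic_long_* variants in the later hunks.
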
@@ -564,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
564 /* At this point it is safe to free the eventpoll item */ 563 /* At this point it is safe to free the eventpoll item */
565 kmem_cache_free(epi_cache, epi); 564 kmem_cache_free(epi_cache, epi);
566 565
567 atomic_dec(&ep->user->epoll_watches); 566 atomic_long_dec(&ep->user->epoll_watches);
568 567
569 return 0; 568 return 0;
570} 569}
@@ -674,7 +673,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
674/* File callbacks that implement the eventpoll file behaviour */ 673/* File callbacks that implement the eventpoll file behaviour */
675static const struct file_operations eventpoll_fops = { 674static const struct file_operations eventpoll_fops = {
676 .release = ep_eventpoll_release, 675 .release = ep_eventpoll_release,
677 .poll = ep_eventpoll_poll 676 .poll = ep_eventpoll_poll,
677 .llseek = noop_llseek,
678}; 678};
679 679
680/* Fast test to see if the file is an eventpoll file */ 680/* Fast test to see if the file is an eventpoll file */
@@ -900,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
900{ 900{
901 int error, revents, pwake = 0; 901 int error, revents, pwake = 0;
902 unsigned long flags; 902 unsigned long flags;
903 long user_watches;
903 struct epitem *epi; 904 struct epitem *epi;
904 struct ep_pqueue epq; 905 struct ep_pqueue epq;
905 906
906 if (unlikely(atomic_read(&ep->user->epoll_watches) >= 907 user_watches = atomic_long_read(&ep->user->epoll_watches);
907 max_user_watches)) 908 if (unlikely(user_watches >= max_user_watches))
908 return -ENOSPC; 909 return -ENOSPC;
909 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 910 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
910 return -ENOMEM; 911 return -ENOMEM;
@@ -968,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
968 969
969 spin_unlock_irqrestore(&ep->lock, flags); 970 spin_unlock_irqrestore(&ep->lock, flags);
970 971
971 atomic_inc(&ep->user->epoll_watches); 972 atomic_long_inc(&ep->user->epoll_watches);
972 973
973 /* We have to call this outside the lock */ 974 /* We have to call this outside the lock */
974 if (pwake) 975 if (pwake)
@@ -1113,21 +1114,35 @@ static int ep_send_events(struct eventpoll *ep,
1113 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1114 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1114} 1115}
1115 1116
1117static inline struct timespec ep_set_mstimeout(long ms)
1118{
1119 struct timespec now, ts = {
1120 .tv_sec = ms / MSEC_PER_SEC,
1121 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1122 };
1123
1124 ktime_get_ts(&now);
1125 return timespec_add_safe(now, ts);
1126}
1127
1116static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1128static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1117 int maxevents, long timeout) 1129 int maxevents, long timeout)
1118{ 1130{
1119 int res, eavail; 1131 int res, eavail, timed_out = 0;
1120 unsigned long flags; 1132 unsigned long flags;
1121 long jtimeout; 1133 long slack;
1122 wait_queue_t wait; 1134 wait_queue_t wait;
1135 ktime_t expires, *to = NULL;
1123 1136
1124 /* 1137 if (timeout > 0) {
1125 * Calculate the timeout by checking for the "infinite" value (-1) 1138 struct timespec end_time = ep_set_mstimeout(timeout);
1126 * and the overflow condition. The passed timeout is in milliseconds, 1139
1127 * that why (t * HZ) / 1000. 1140 slack = select_estimate_accuracy(&end_time);
1128 */ 1141 to = &expires;
1129 jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? 1142 *to = timespec_to_ktime(end_time);
1130 MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; 1143 } else if (timeout == 0) {
1144 timed_out = 1;
1145 }
1131 1146
1132retry: 1147retry:
1133 spin_lock_irqsave(&ep->lock, flags); 1148 spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1164,7 @@ retry:
1149 * to TASK_INTERRUPTIBLE before doing the checks. 1164 * to TASK_INTERRUPTIBLE before doing the checks.
1150 */ 1165 */
1151 set_current_state(TASK_INTERRUPTIBLE); 1166 set_current_state(TASK_INTERRUPTIBLE);
1152 if (!list_empty(&ep->rdllist) || !jtimeout) 1167 if (!list_empty(&ep->rdllist) || timed_out)
1153 break; 1168 break;
1154 if (signal_pending(current)) { 1169 if (signal_pending(current)) {
1155 res = -EINTR; 1170 res = -EINTR;
@@ -1157,7 +1172,9 @@ retry:
1157 } 1172 }
1158 1173
1159 spin_unlock_irqrestore(&ep->lock, flags); 1174 spin_unlock_irqrestore(&ep->lock, flags);
1160 jtimeout = schedule_timeout(jtimeout); 1175 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1176 timed_out = 1;
1177
1161 spin_lock_irqsave(&ep->lock, flags); 1178 spin_lock_irqsave(&ep->lock, flags);
1162 } 1179 }
1163 __remove_wait_queue(&ep->wq, &wait); 1180 __remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1192,7 @@ retry:
1175 * more luck. 1192 * more luck.
1176 */ 1193 */
1177 if (!res && eavail && 1194 if (!res && eavail &&
1178 !(res = ep_send_events(ep, events, maxevents)) && jtimeout) 1195 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1179 goto retry; 1196 goto retry;
1180 1197
1181 return res; 1198 return res;
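
The ep_poll() rewrite replaces jiffies arithmetic with an absolute hrtimer deadline: the millisecond timeout becomes a timespec added overflow-safely to the current time via timespec_add_safe(), select_estimate_accuracy() picks a wakeup slack, and schedule_hrtimeout_range() returns 0 exactly when the deadline fired. The calling convention, reduced to its moving parts:

	ktime_t expires, *to = NULL;	/* NULL deadline: wait indefinitely */
	long slack = 0;
	int timed_out = (timeout == 0);	/* 0 ms: poll once, never sleep */

	if (timeout > 0) {
		struct timespec end = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end);
		expires = timespec_to_ktime(end);
		to = &expires;
	}

	/* inside the wait loop: a 0 return means the deadline fired */
	if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
		timed_out = 1;
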
@@ -1422,6 +1439,7 @@ static int __init eventpoll_init(void)
1422 */ 1439 */
1423 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / 1440 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1424 EP_ITEM_COST; 1441 EP_ITEM_COST;
1442 BUG_ON(max_user_watches < 0);
1425 1443
1426 /* Initialize the structure used to perform safe poll wait head wake ups */ 1444 /* Initialize the structure used to perform safe poll wait head wake ups */
1427 ep_nested_calls_init(&poll_safewake_ncalls); 1445 ep_nested_calls_init(&poll_safewake_ncalls);
diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f936858..52a447d9b6ab 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
54#include <linux/fsnotify.h> 54#include <linux/fsnotify.h>
55#include <linux/fs_struct.h> 55#include <linux/fs_struct.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
@@ -65,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
65unsigned int core_pipe_limit; 66unsigned int core_pipe_limit;
66int suid_dumpable = 0; 67int suid_dumpable = 0;
67 68
69struct core_name {
70 char *corename;
71 int used, size;
72};
73static atomic_t call_count = ATOMIC_INIT(1);
74
68/* The maximal length of core_pattern is also specified in sysctl.c */ 75/* The maximal length of core_pattern is also specified in sysctl.c */
69 76
70static LIST_HEAD(formats); 77static LIST_HEAD(formats);
@@ -113,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
113 goto out; 120 goto out;
114 121
115 file = do_filp_open(AT_FDCWD, tmp, 122 file = do_filp_open(AT_FDCWD, tmp,
116 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, 123 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
117 MAY_READ | MAY_EXEC | MAY_OPEN); 124 MAY_READ | MAY_EXEC | MAY_OPEN);
118 putname(tmp); 125 putname(tmp);
119 error = PTR_ERR(file); 126 error = PTR_ERR(file);
@@ -157,7 +164,26 @@ out:
157 164
158#ifdef CONFIG_MMU 165#ifdef CONFIG_MMU
159 166
160static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 167void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
168{
169 struct mm_struct *mm = current->mm;
170 long diff = (long)(pages - bprm->vma_pages);
171
172 if (!mm || !diff)
173 return;
174
175 bprm->vma_pages = pages;
176
177#ifdef SPLIT_RSS_COUNTING
178 add_mm_counter(mm, MM_ANONPAGES, diff);
179#else
180 spin_lock(&mm->page_table_lock);
181 add_mm_counter(mm, MM_ANONPAGES, diff);
182 spin_unlock(&mm->page_table_lock);
183#endif
184}
185
186struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
161 int write) 187 int write)
162{ 188{
163 struct page *page; 189 struct page *page;
@@ -179,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
179 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; 205 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
180 struct rlimit *rlim; 206 struct rlimit *rlim;
181 207
208 acct_arg_size(bprm, size / PAGE_SIZE);
209
182 /* 210 /*
183 * We've historically supported up to 32 pages (ARG_MAX) 211 * We've historically supported up to 32 pages (ARG_MAX)
184 * of argument strings even with small stacks 212 * of argument strings even with small stacks
@@ -247,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
247 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 275 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 276 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 277 INIT_LIST_HEAD(&vma->anon_vma_chain);
278
279 err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
280 if (err)
281 goto err;
282
250 err = insert_vm_struct(mm, vma); 283 err = insert_vm_struct(mm, vma);
251 if (err) 284 if (err)
252 goto err; 285 goto err;
@@ -269,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
269 302
270#else 303#else
271 304
272static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 305void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
306{
307}
308
309struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
273 int write) 310 int write)
274{ 311{
275 struct page *page; 312 struct page *page;
@@ -686,7 +723,7 @@ struct file *open_exec(const char *name)
686 int err; 723 int err;
687 724
688 file = do_filp_open(AT_FDCWD, name, 725 file = do_filp_open(AT_FDCWD, name,
689 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, 726 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
690 MAY_EXEC | MAY_OPEN); 727 MAY_EXEC | MAY_OPEN);
691 if (IS_ERR(file)) 728 if (IS_ERR(file))
692 goto out; 729 goto out;
@@ -759,6 +796,10 @@ static int exec_mmap(struct mm_struct *mm)
759 tsk->mm = mm; 796 tsk->mm = mm;
760 tsk->active_mm = mm; 797 tsk->active_mm = mm;
761 activate_mm(active_mm, mm); 798 activate_mm(active_mm, mm);
799 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
800 atomic_dec(&old_mm->oom_disable_count);
801 atomic_inc(&tsk->mm->oom_disable_count);
802 }
762 task_unlock(tsk); 803 task_unlock(tsk);
763 arch_pick_mmap_layout(mm); 804 arch_pick_mmap_layout(mm);
764 if (old_mm) { 805 if (old_mm) {
@@ -992,13 +1033,14 @@ int flush_old_exec(struct linux_binprm * bprm)
992 /* 1033 /*
993 * Release all of the old mmap stuff 1034 * Release all of the old mmap stuff
994 */ 1035 */
1036 acct_arg_size(bprm, 0);
995 retval = exec_mmap(bprm->mm); 1037 retval = exec_mmap(bprm->mm);
996 if (retval) 1038 if (retval)
997 goto out; 1039 goto out;
998 1040
999 bprm->mm = NULL; /* We're using it now */ 1041 bprm->mm = NULL; /* We're using it now */
1000 1042
1001 current->flags &= ~PF_RANDOMIZE; 1043 current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
1002 flush_thread(); 1044 flush_thread();
1003 current->personality &= ~bprm->per_clear; 1045 current->personality &= ~bprm->per_clear;
1004 1046
@@ -1078,14 +1120,14 @@ EXPORT_SYMBOL(setup_new_exec);
1078 */ 1120 */
1079int prepare_bprm_creds(struct linux_binprm *bprm) 1121int prepare_bprm_creds(struct linux_binprm *bprm)
1080{ 1122{
1081 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1123 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1082 return -ERESTARTNOINTR; 1124 return -ERESTARTNOINTR;
1083 1125
1084 bprm->cred = prepare_exec_creds(); 1126 bprm->cred = prepare_exec_creds();
1085 if (likely(bprm->cred)) 1127 if (likely(bprm->cred))
1086 return 0; 1128 return 0;
1087 1129
1088 mutex_unlock(&current->cred_guard_mutex); 1130 mutex_unlock(&current->signal->cred_guard_mutex);
1089 return -ENOMEM; 1131 return -ENOMEM;
1090} 1132}
1091 1133
@@ -1093,7 +1135,7 @@ void free_bprm(struct linux_binprm *bprm)
1093{ 1135{
1094 free_arg_pages(bprm); 1136 free_arg_pages(bprm);
1095 if (bprm->cred) { 1137 if (bprm->cred) {
1096 mutex_unlock(&current->cred_guard_mutex); 1138 mutex_unlock(&current->signal->cred_guard_mutex);
1097 abort_creds(bprm->cred); 1139 abort_creds(bprm->cred);
1098 } 1140 }
1099 kfree(bprm); 1141 kfree(bprm);
@@ -1114,13 +1156,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1114 * credentials; any time after this it may be unlocked. 1156 * credentials; any time after this it may be unlocked.
1115 */ 1157 */
1116 security_bprm_committed_creds(bprm); 1158 security_bprm_committed_creds(bprm);
1117 mutex_unlock(&current->cred_guard_mutex); 1159 mutex_unlock(&current->signal->cred_guard_mutex);
1118} 1160}
1119EXPORT_SYMBOL(install_exec_creds); 1161EXPORT_SYMBOL(install_exec_creds);
1120 1162
1121/* 1163/*
1122 * determine how safe it is to execute the proposed program 1164 * determine how safe it is to execute the proposed program
1123 * - the caller must hold current->cred_guard_mutex to protect against 1165 * - the caller must hold ->cred_guard_mutex to protect against
1124 * PTRACE_ATTACH 1166 * PTRACE_ATTACH
1125 */ 1167 */
1126int check_unsafe_exec(struct linux_binprm *bprm) 1168int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1401,7 +1443,6 @@ int do_execve(const char * filename,
1401 if (retval < 0) 1443 if (retval < 0)
1402 goto out; 1444 goto out;
1403 1445
1404 current->flags &= ~PF_KTHREAD;
1405 retval = search_binary_handler(bprm,regs); 1446 retval = search_binary_handler(bprm,regs);
1406 if (retval < 0) 1447 if (retval < 0)
1407 goto out; 1448 goto out;
@@ -1416,8 +1457,10 @@ int do_execve(const char * filename,
1416 return retval; 1457 return retval;
1417 1458
1418out: 1459out:
1419 if (bprm->mm) 1460 if (bprm->mm) {
1420 mmput (bprm->mm); 1461 acct_arg_size(bprm, 0);
1462 mmput(bprm->mm);
1463 }
1421 1464
1422out_file: 1465out_file:
1423 if (bprm->file) { 1466 if (bprm->file) {
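
The acct_arg_size() calls introduced earlier pair up deliberately: get_arg_page() charges the current argv footprint to MM_ANONPAGES each time the temporary exec stack grows, and every path that disposes of bprm->mm, whether exec_mmap() succeeded or the error path above is about to mmput() it, resets the charge to zero first so the counter cannot leak. The pairing, in outline:

	/* growth: called from get_arg_page() with the stack's current size */
	acct_arg_size(bprm, size / PAGE_SIZE);

	/* teardown: always zero the charge before the mm goes away */
	acct_arg_size(bprm, 0);
	mmput(bprm->mm);
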
@@ -1454,127 +1497,148 @@ void set_binfmt(struct linux_binfmt *new)
1454 1497
1455EXPORT_SYMBOL(set_binfmt); 1498EXPORT_SYMBOL(set_binfmt);
1456 1499
1500static int expand_corename(struct core_name *cn)
1501{
1502 char *old_corename = cn->corename;
1503
1504 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1505 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1506
1507 if (!cn->corename) {
1508 kfree(old_corename);
1509 return -ENOMEM;
1510 }
1511
1512 return 0;
1513}
1514
1515static int cn_printf(struct core_name *cn, const char *fmt, ...)
1516{
1517 char *cur;
1518 int need;
1519 int ret;
1520 va_list arg;
1521
1522 va_start(arg, fmt);
1523 need = vsnprintf(NULL, 0, fmt, arg);
1524 va_end(arg);
1525
1526 if (likely(need < cn->size - cn->used - 1))
1527 goto out_printf;
1528
1529 ret = expand_corename(cn);
1530 if (ret)
1531 goto expand_fail;
1532
1533out_printf:
1534 cur = cn->corename + cn->used;
1535 va_start(arg, fmt);
1536 vsnprintf(cur, need + 1, fmt, arg);
1537 va_end(arg);
1538 cn->used += need;
1539 return 0;
1540
1541expand_fail:
1542 return ret;
1543}
1544
1457/* format_corename will inspect the pattern parameter, and output a 1545/* format_corename will inspect the pattern parameter, and output a
1458 * name into corename, which must have space for at least 1546 * name into corename, which must have space for at least
1459 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1547 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1460 */ 1548 */
1461static int format_corename(char *corename, long signr) 1549static int format_corename(struct core_name *cn, long signr)
1462{ 1550{
1463 const struct cred *cred = current_cred(); 1551 const struct cred *cred = current_cred();
1464 const char *pat_ptr = core_pattern; 1552 const char *pat_ptr = core_pattern;
1465 int ispipe = (*pat_ptr == '|'); 1553 int ispipe = (*pat_ptr == '|');
1466 char *out_ptr = corename;
1467 char *const out_end = corename + CORENAME_MAX_SIZE;
1468 int rc;
1469 int pid_in_pattern = 0; 1554 int pid_in_pattern = 0;
1555 int err = 0;
1556
1557 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1558 cn->corename = kmalloc(cn->size, GFP_KERNEL);
1559 cn->used = 0;
1560
1561 if (!cn->corename)
1562 return -ENOMEM;
1470 1563
1471 /* Repeat as long as we have more pattern to process and more output 1564 /* Repeat as long as we have more pattern to process and more output
1472 space */ 1565 space */
1473 while (*pat_ptr) { 1566 while (*pat_ptr) {
1474 if (*pat_ptr != '%') { 1567 if (*pat_ptr != '%') {
1475 if (out_ptr == out_end) 1568 if (*pat_ptr == 0)
1476 goto out; 1569 goto out;
1477 *out_ptr++ = *pat_ptr++; 1570 err = cn_printf(cn, "%c", *pat_ptr++);
1478 } else { 1571 } else {
1479 switch (*++pat_ptr) { 1572 switch (*++pat_ptr) {
1573 /* single % at the end, drop that */
1480 case 0: 1574 case 0:
1481 goto out; 1575 goto out;
1482 /* Double percent, output one percent */ 1576 /* Double percent, output one percent */
1483 case '%': 1577 case '%':
1484 if (out_ptr == out_end) 1578 err = cn_printf(cn, "%c", '%');
1485 goto out;
1486 *out_ptr++ = '%';
1487 break; 1579 break;
1488 /* pid */ 1580 /* pid */
1489 case 'p': 1581 case 'p':
1490 pid_in_pattern = 1; 1582 pid_in_pattern = 1;
1491 rc = snprintf(out_ptr, out_end - out_ptr, 1583 err = cn_printf(cn, "%d",
1492 "%d", task_tgid_vnr(current)); 1584 task_tgid_vnr(current));
1493 if (rc > out_end - out_ptr)
1494 goto out;
1495 out_ptr += rc;
1496 break; 1585 break;
1497 /* uid */ 1586 /* uid */
1498 case 'u': 1587 case 'u':
1499 rc = snprintf(out_ptr, out_end - out_ptr, 1588 err = cn_printf(cn, "%d", cred->uid);
1500 "%d", cred->uid);
1501 if (rc > out_end - out_ptr)
1502 goto out;
1503 out_ptr += rc;
1504 break; 1589 break;
1505 /* gid */ 1590 /* gid */
1506 case 'g': 1591 case 'g':
1507 rc = snprintf(out_ptr, out_end - out_ptr, 1592 err = cn_printf(cn, "%d", cred->gid);
1508 "%d", cred->gid);
1509 if (rc > out_end - out_ptr)
1510 goto out;
1511 out_ptr += rc;
1512 break; 1593 break;
1513 /* signal that caused the coredump */ 1594 /* signal that caused the coredump */
1514 case 's': 1595 case 's':
1515 rc = snprintf(out_ptr, out_end - out_ptr, 1596 err = cn_printf(cn, "%ld", signr);
1516 "%ld", signr);
1517 if (rc > out_end - out_ptr)
1518 goto out;
1519 out_ptr += rc;
1520 break; 1597 break;
1521 /* UNIX time of coredump */ 1598 /* UNIX time of coredump */
1522 case 't': { 1599 case 't': {
1523 struct timeval tv; 1600 struct timeval tv;
1524 do_gettimeofday(&tv); 1601 do_gettimeofday(&tv);
1525 rc = snprintf(out_ptr, out_end - out_ptr, 1602 err = cn_printf(cn, "%lu", tv.tv_sec);
1526 "%lu", tv.tv_sec);
1527 if (rc > out_end - out_ptr)
1528 goto out;
1529 out_ptr += rc;
1530 break; 1603 break;
1531 } 1604 }
1532 /* hostname */ 1605 /* hostname */
1533 case 'h': 1606 case 'h':
1534 down_read(&uts_sem); 1607 down_read(&uts_sem);
1535 rc = snprintf(out_ptr, out_end - out_ptr, 1608 err = cn_printf(cn, "%s",
1536 "%s", utsname()->nodename); 1609 utsname()->nodename);
1537 up_read(&uts_sem); 1610 up_read(&uts_sem);
1538 if (rc > out_end - out_ptr)
1539 goto out;
1540 out_ptr += rc;
1541 break; 1611 break;
1542 /* executable */ 1612 /* executable */
1543 case 'e': 1613 case 'e':
1544 rc = snprintf(out_ptr, out_end - out_ptr, 1614 err = cn_printf(cn, "%s", current->comm);
1545 "%s", current->comm);
1546 if (rc > out_end - out_ptr)
1547 goto out;
1548 out_ptr += rc;
1549 break; 1615 break;
1550 /* core limit size */ 1616 /* core limit size */
1551 case 'c': 1617 case 'c':
1552 rc = snprintf(out_ptr, out_end - out_ptr, 1618 err = cn_printf(cn, "%lu",
1553 "%lu", rlimit(RLIMIT_CORE)); 1619 rlimit(RLIMIT_CORE));
1554 if (rc > out_end - out_ptr)
1555 goto out;
1556 out_ptr += rc;
1557 break; 1620 break;
1558 default: 1621 default:
1559 break; 1622 break;
1560 } 1623 }
1561 ++pat_ptr; 1624 ++pat_ptr;
1562 } 1625 }
1626
1627 if (err)
1628 return err;
1563 } 1629 }
1630
1564 /* Backward compatibility with core_uses_pid: 1631 /* Backward compatibility with core_uses_pid:
1565 * 1632 *
1566 * If core_pattern does not include a %p (as is the default) 1633 * If core_pattern does not include a %p (as is the default)
1567 * and core_uses_pid is set, then .%pid will be appended to 1634 * and core_uses_pid is set, then .%pid will be appended to
1568 * the filename. Do not do this for piped commands. */ 1635 * the filename. Do not do this for piped commands. */
1569 if (!ispipe && !pid_in_pattern && core_uses_pid) { 1636 if (!ispipe && !pid_in_pattern && core_uses_pid) {
1570 rc = snprintf(out_ptr, out_end - out_ptr, 1637 err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1571 ".%d", task_tgid_vnr(current)); 1638 if (err)
1572 if (rc > out_end - out_ptr) 1639 return err;
1573 goto out;
1574 out_ptr += rc;
1575 } 1640 }
1576out: 1641out:
1577 *out_ptr = 0;
1578 return ispipe; 1642 return ispipe;
1579} 1643}
1580 1644
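
The corename rewrite swaps a fixed CORENAME_MAX_SIZE buffer and a dozen copies of the snprintf/overflow-check/advance dance for one helper built on a C99 guarantee: vsnprintf(NULL, 0, ...) writes nothing and returns the length the output would need. Measure, grow the buffer with krealloc() if needed, then format for real. The core of the helper, as a sketch:

static int cn_printf_sketch(struct core_name *cn, const char *fmt, ...)
{
	va_list ap;
	int need;

	va_start(ap, fmt);
	need = vsnprintf(NULL, 0, fmt, ap);	/* pass 1: measure only */
	va_end(ap);

	if (need + 1 > cn->size - cn->used && expand_corename(cn))
		return -ENOMEM;			/* expand_corename() krealloc()s */

	va_start(ap, fmt);			/* restart: a va_list is consumed */
	vsnprintf(cn->corename + cn->used, need + 1, fmt, ap);
	va_end(ap);
	cn->used += need;
	return 0;
}
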
@@ -1851,7 +1915,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
1851void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1915void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1852{ 1916{
1853 struct core_state core_state; 1917 struct core_state core_state;
1854 char corename[CORENAME_MAX_SIZE + 1]; 1918 struct core_name cn;
1855 struct mm_struct *mm = current->mm; 1919 struct mm_struct *mm = current->mm;
1856 struct linux_binfmt * binfmt; 1920 struct linux_binfmt * binfmt;
1857 const struct cred *old_cred; 1921 const struct cred *old_cred;
@@ -1906,7 +1970,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1906 */ 1970 */
1907 clear_thread_flag(TIF_SIGPENDING); 1971 clear_thread_flag(TIF_SIGPENDING);
1908 1972
1909 ispipe = format_corename(corename, signr); 1973 ispipe = format_corename(&cn, signr);
1974
1975 if (ispipe == -ENOMEM) {
1976 printk(KERN_WARNING "format_corename failed\n");
1977 printk(KERN_WARNING "Aborting core\n");
1978 goto fail_corename;
1979 }
1910 1980
1911 if (ispipe) { 1981 if (ispipe) {
1912 int dump_count; 1982 int dump_count;
@@ -1943,7 +2013,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1943 goto fail_dropcount; 2013 goto fail_dropcount;
1944 } 2014 }
1945 2015
1946 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); 2016 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
1947 if (!helper_argv) { 2017 if (!helper_argv) {
1948 printk(KERN_WARNING "%s failed to allocate memory\n", 2018 printk(KERN_WARNING "%s failed to allocate memory\n",
1949 __func__); 2019 __func__);
@@ -1956,7 +2026,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1956 argv_free(helper_argv); 2026 argv_free(helper_argv);
1957 if (retval) { 2027 if (retval) {
1958 printk(KERN_INFO "Core dump to %s pipe failed\n", 2028 printk(KERN_INFO "Core dump to %s pipe failed\n",
1959 corename); 2029 cn.corename);
1960 goto close_fail; 2030 goto close_fail;
1961 } 2031 }
1962 } else { 2032 } else {
@@ -1965,7 +2035,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1965 if (cprm.limit < binfmt->min_coredump) 2035 if (cprm.limit < binfmt->min_coredump)
1966 goto fail_unlock; 2036 goto fail_unlock;
1967 2037
1968 cprm.file = filp_open(corename, 2038 cprm.file = filp_open(cn.corename,
1969 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 2039 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1970 0600); 2040 0600);
1971 if (IS_ERR(cprm.file)) 2041 if (IS_ERR(cprm.file))
@@ -2007,6 +2077,8 @@ fail_dropcount:
2007 if (ispipe) 2077 if (ispipe)
2008 atomic_dec(&core_dump_count); 2078 atomic_dec(&core_dump_count);
2009fail_unlock: 2079fail_unlock:
2080 kfree(cn.corename);
2081fail_corename:
2010 coredump_finish(mm); 2082 coredump_finish(mm);
2011 revert_creds(old_cred); 2083 revert_creds(old_cred);
2012fail_creds: 2084fail_creds:
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
420 err = exofs_write_begin(NULL, page->mapping, pos, len, 420 err = exofs_write_begin(NULL, page->mapping, pos, len,
421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
422 if (err) 422 if (err)
423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", 423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
424 err); 424 err);
425 425
426 de->inode_no = cpu_to_le64(inode->i_ino); 426 de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, 556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
557 &page, NULL); 557 &page, NULL);
558 if (err) 558 if (err)
559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", 559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
560 err); 560 err);
561 if (pde) 561 if (pde)
562 pde->rec_len = cpu_to_le16(to - from); 562 pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host; 48 struct inode *inode = filp->f_mapping->host;
49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
53 struct super_block *sb; 49 struct super_block *sb;
54 50
55 if (!(inode->i_state & I_DIRTY)) 51 if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0; 54 return 0;
59 55
60 ret = sync_inode(inode, &wbc); 56 ret = sync_inode_metadata(inode, 1);
61 57
62 /* This is a good place to write the sb */ 58 /* This is a good place to write the sb */
63 /* TODO: Schedule an sb-sync on create */ 59 /* TODO: Schedule an sb-sync on create */
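
sync_inode_metadata(inode, wait) is the dedicated helper for what the deleted writeback_control spelled out by hand: a WB_SYNC_ALL pass with nr_to_write = 0, i.e. flush the inode's metadata and none of its data pages. Before and after, side by side:

	/* before: hand-rolled metadata-only sync */
	struct writeback_control wbc = {
		.sync_mode   = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata only; data handled by the caller */
	};
	ret = sync_inode(inode, &wbc);

	/* after: same intent, one call */
	ret = sync_inode_metadata(inode, 1 /* wait */);
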
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3eadd97324b1..a7555238c41a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
185/* Called at the end of reads, to optionally unlock pages and update their 185/* Called at the end of reads, to optionally unlock pages and update their
186 * status. 186 * status.
187 */ 187 */
188static int __readpages_done(struct page_collect *pcol, bool do_unlock) 188static int __readpages_done(struct page_collect *pcol)
189{ 189{
190 int i; 190 int i;
191 u64 resid; 191 u64 resid;
@@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
221 page_stat ? "bad_bytes" : "good_bytes"); 221 page_stat ? "bad_bytes" : "good_bytes");
222 222
223 ret = update_read_page(page, page_stat); 223 ret = update_read_page(page, page_stat);
224 if (do_unlock) 224 if (!pcol->read_4_write)
225 unlock_page(page); 225 unlock_page(page);
226 length += PAGE_SIZE; 226 length += PAGE_SIZE;
227 } 227 }
@@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
236{ 236{
237 struct page_collect *pcol = p; 237 struct page_collect *pcol = p;
238 238
239 __readpages_done(pcol, true); 239 __readpages_done(pcol);
240 atomic_dec(&pcol->sbi->s_curr_pending); 240 atomic_dec(&pcol->sbi->s_curr_pending);
241 kfree(pcol); 241 kfree(pcol);
242} 242}
@@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
257 } 257 }
258} 258}
259 259
260static int read_exec(struct page_collect *pcol, bool is_sync) 260static int read_exec(struct page_collect *pcol)
261{ 261{
262 struct exofs_i_info *oi = exofs_i(pcol->inode); 262 struct exofs_i_info *oi = exofs_i(pcol->inode);
263 struct exofs_io_state *ios = pcol->ios; 263 struct exofs_io_state *ios = pcol->ios;
@@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
267 if (!pcol->pages) 267 if (!pcol->pages)
268 return 0; 268 return 0;
269 269
270 /* see comment in _readpage() about sync reads */
271 WARN_ON(is_sync && (pcol->nr_pages != 1));
272
273 ios->pages = pcol->pages; 270 ios->pages = pcol->pages;
274 ios->nr_pages = pcol->nr_pages; 271 ios->nr_pages = pcol->nr_pages;
275 ios->length = pcol->length; 272 ios->length = pcol->length;
276 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 273 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
277 274
278 if (is_sync) { 275 if (pcol->read_4_write) {
279 exofs_oi_read(oi, pcol->ios); 276 exofs_oi_read(oi, pcol->ios);
280 return __readpages_done(pcol, false); 277 return __readpages_done(pcol);
281 } 278 }
282 279
283 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 280 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
303 return 0; 300 return 0;
304 301
305err: 302err:
306 if (!is_sync) 303 if (!pcol->read_4_write)
307 _unlock_pcol_pages(pcol, ret, READ); 304 _unlock_pcol_pages(pcol, ret, READ);
308 305
309 pcol_free(pcol); 306 pcol_free(pcol);
@@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page)
356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
357 " splitting\n", inode->i_ino, page->index); 354 " splitting\n", inode->i_ino, page->index);
358 355
359 return read_exec(pcol, false); 356 return read_exec(pcol);
360 } 357 }
361 358
362try_again: 359try_again:
@@ -366,7 +363,7 @@ try_again:
366 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 363 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
367 page->index)) { 364 page->index)) {
368 /* Discontinuity detected, split the request */ 365 /* Discontinuity detected, split the request */
369 ret = read_exec(pcol, false); 366 ret = read_exec(pcol);
370 if (unlikely(ret)) 367 if (unlikely(ret))
371 goto fail; 368 goto fail;
372 goto try_again; 369 goto try_again;
@@ -391,7 +388,7 @@ try_again:
391 page, len, pcol->nr_pages, pcol->length); 388 page, len, pcol->nr_pages, pcol->length);
392 389
393 /* split the request, and start again with current page */ 390 /* split the request, and start again with current page */
394 ret = read_exec(pcol, false); 391 ret = read_exec(pcol);
395 if (unlikely(ret)) 392 if (unlikely(ret))
396 goto fail; 393 goto fail;
397 394
@@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
420 return ret; 417 return ret;
421 } 418 }
422 419
423 return read_exec(&pcol, false); 420 return read_exec(&pcol);
424} 421}
425 422
426static int _readpage(struct page *page, bool is_sync) 423static int _readpage(struct page *page, bool read_4_write)
427{ 424{
428 struct page_collect pcol; 425 struct page_collect pcol;
429 int ret; 426 int ret;
430 427
431 _pcol_init(&pcol, 1, page->mapping->host); 428 _pcol_init(&pcol, 1, page->mapping->host);
432 429
433 /* readpage_strip might call read_exec(,is_sync==false) at several 430 pcol.read_4_write = read_4_write;
434 * places but not if we have a single page.
435 */
436 pcol.read_4_write = is_sync;
437 ret = readpage_strip(&pcol, page); 431 ret = readpage_strip(&pcol, page);
438 if (ret) { 432 if (ret) {
439 EXOFS_ERR("_readpage => %d\n", ret); 433 EXOFS_ERR("_readpage => %d\n", ret);
440 return ret; 434 return ret;
441 } 435 }
442 436
443 return read_exec(&pcol, is_sync); 437 return read_exec(&pcol);
444} 438}
445 439
446/* 440/*
@@ -511,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
511 505
512 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
513 if (!pcol_copy) { 507 if (!pcol_copy) {
514 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 508 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
515 ret = -ENOMEM; 509 ret = -ENOMEM;
516 goto err; 510 goto err;
517 } 511 }
@@ -527,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
527 521
528 ret = exofs_oi_write(oi, ios); 522 ret = exofs_oi_write(oi, ios);
529 if (unlikely(ret)) { 523 if (unlikely(ret)) {
530 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); 524 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
531 goto err; 525 goto err;
532 } 526 }
533 527
@@ -628,7 +622,7 @@ try_again:
628 /* split the request, next loop will start again */ 622 /* split the request, next loop will start again */
629 ret = write_exec(pcol); 623 ret = write_exec(pcol);
630 if (unlikely(ret)) { 624 if (unlikely(ret)) {
631 EXOFS_DBGMSG("write_exec faild => %d", ret); 625 EXOFS_DBGMSG("write_exec failed => %d", ret);
632 goto fail; 626 goto fail;
633 } 627 }
634 628
@@ -719,7 +713,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
719 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 713 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
720 fsdata); 714 fsdata);
721 if (ret) { 715 if (ret) {
722 EXOFS_DBGMSG("simple_write_begin faild\n"); 716 EXOFS_DBGMSG("simple_write_begin failed\n");
723 goto out; 717 goto out;
724 } 718 }
725 719
@@ -732,7 +726,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
732 if (ret) { 726 if (ret) {
733 /*SetPageError was done by _readpage. Is it ok?*/ 727 /*SetPageError was done by _readpage. Is it ok?*/
734 unlock_page(page); 728 unlock_page(page);
735 EXOFS_DBGMSG("__readpage_filler faild\n"); 729 EXOFS_DBGMSG("__readpage_filler failed\n");
736 } 730 }
737 } 731 }
738out: 732out:
@@ -1072,8 +1066,10 @@ bad_inode:
1072int __exofs_wait_obj_created(struct exofs_i_info *oi) 1066int __exofs_wait_obj_created(struct exofs_i_info *oi)
1073{ 1067{
1074 if (!obj_created(oi)) { 1068 if (!obj_created(oi)) {
1069 EXOFS_DBGMSG("!obj_created\n");
1075 BUG_ON(!obj_2bcreated(oi)); 1070 BUG_ON(!obj_2bcreated(oi));
1076 wait_event(oi->i_wq, obj_created(oi)); 1071 wait_event(oi->i_wq, obj_created(oi));
1072 EXOFS_DBGMSG("wait_event done\n");
1077 } 1073 }
1078 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1074 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1079} 1075}
@@ -1095,7 +1091,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1095 atomic_dec(&sbi->s_curr_pending); 1091 atomic_dec(&sbi->s_curr_pending);
1096 1092
1097 if (unlikely(ret)) { 1093 if (unlikely(ret)) {
1098 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1094 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1099 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1095 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1100 /*TODO: When FS is corrupted creation can fail, object already 1096 /*TODO: When FS is corrupted creation can fail, object already
1101 * exist. Get rid of this asynchronous creation, if exist 1097 * exist. Get rid of this asynchronous creation, if exist
@@ -1107,7 +1103,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
1107 1103
1108 set_obj_created(oi); 1104 set_obj_created(oi);
1109 1105
1110 atomic_dec(&inode->i_count);
1111 wake_up(&oi->i_wq); 1106 wake_up(&oi->i_wq);
1112} 1107}
1113 1108
@@ -1157,17 +1152,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1157 ios->obj.id = exofs_oi_objno(oi); 1152 ios->obj.id = exofs_oi_objno(oi);
1158 exofs_make_credential(oi->i_cred, &ios->obj); 1153 exofs_make_credential(oi->i_cred, &ios->obj);
1159 1154
1160 /* increment the refcount so that the inode will still be around when we
1161 * reach the callback
1162 */
1163 atomic_inc(&inode->i_count);
1164
1165 ios->done = create_done; 1155 ios->done = create_done;
1166 ios->private = inode; 1156 ios->private = inode;
1167 ios->cred = oi->i_cred; 1157 ios->cred = oi->i_cred;
1168 ret = exofs_sbi_create(ios); 1158 ret = exofs_sbi_create(ios);
1169 if (ret) { 1159 if (ret) {
1170 atomic_dec(&inode->i_count);
1171 exofs_put_io_state(ios); 1160 exofs_put_io_state(ios);
1172 return ERR_PTR(ret); 1161 return ERR_PTR(ret);
1173 } 1162 }
@@ -1215,7 +1204,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1215 1204
1216 args = kzalloc(sizeof(*args), GFP_KERNEL); 1205 args = kzalloc(sizeof(*args), GFP_KERNEL);
1217 if (!args) { 1206 if (!args) {
1218 EXOFS_DBGMSG("Faild kzalloc of args\n"); 1207 EXOFS_DBGMSG("Failed kzalloc of args\n");
1219 return -ENOMEM; 1208 return -ENOMEM;
1220 } 1209 }
1221 1210
@@ -1257,12 +1246,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1257 ios->out_attr_len = 1; 1246 ios->out_attr_len = 1;
1258 ios->out_attr = &attr; 1247 ios->out_attr = &attr;
1259 1248
1260 if (!obj_created(oi)) { 1249 wait_obj_created(oi);
1261 EXOFS_DBGMSG("!obj_created\n");
1262 BUG_ON(!obj_2bcreated(oi));
1263 wait_event(oi->i_wq, obj_created(oi));
1264 EXOFS_DBGMSG("wait_event done\n");
1265 }
1266 1250
1267 if (!do_sync) { 1251 if (!do_sync) {
1268 args->sbi = sbi; 1252 args->sbi = sbi;
@@ -1325,12 +1309,12 @@ void exofs_evict_inode(struct inode *inode)
1325 inode->i_size = 0; 1309 inode->i_size = 0;
1326 end_writeback(inode); 1310 end_writeback(inode);
1327 1311
1328 /* if we are deleting an obj that hasn't been created yet, wait */ 1312 /* if we are deleting an obj that hasn't been created yet, wait.
1329 if (!obj_created(oi)) { 1313 * This also makes sure that create_done cannot be called with an
1330 BUG_ON(!obj_2bcreated(oi)); 1314 * already evicted inode.
1331 wait_event(oi->i_wq, obj_created(oi)); 1315 */
1332 /* ignore the error attempt a remove anyway */ 1316 wait_obj_created(oi);
1333 } 1317 /* ignore the error, attempt a remove anyway */
1334 1318
1335 /* Now Remove the OSD objects */ 1319 /* Now Remove the OSD objects */
1336 ret = exofs_get_io_state(&sbi->layout, &ios); 1320 ret = exofs_get_io_state(&sbi->layout, &ios);
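The create_done()/__exofs_wait_obj_created() pair above is a completion-style handshake: the asynchronous creation callback sets a flag and wakes a waitqueue, while every path that needs the object sleeps until the flag is set. A self-contained sketch of the pattern (names are illustrative, not the exofs ones):

#include <linux/bitops.h>
#include <linux/wait.h>

struct obj_state {
	unsigned long flags;		/* bit 0: OBJ_CREATED */
	wait_queue_head_t wq;
};

#define OBJ_CREATED 0

static void obj_state_init(struct obj_state *st)
{
	st->flags = 0;
	init_waitqueue_head(&st->wq);
}

/* async completion path: mark created, then wake all sleepers */
static void obj_create_done(struct obj_state *st)
{
	set_bit(OBJ_CREATED, &st->flags);
	wake_up(&st->wq);
}

/* callers that need the object: sleep until the flag is set */
static void obj_wait_created(struct obj_state *st)
{
	wait_event(st->wq, test_bit(OBJ_CREATED, &st->flags));
}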
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6550bf70e41d..f74a2ec027a6 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
55 55
56 ret = osd_finalize_request(or, 0, cred, NULL); 56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) { 57 if (unlikely(ret)) {
58 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); 58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out; 59 goto out;
60 } 60 }
61 61
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
79 */ 79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 81 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", 82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL; 84 *pios = NULL;
85 return -ENOMEM; 85 return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
172 172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) { 174 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", 175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 176 ret);
177 return ret; 177 return ret;
178 } 178 }
@@ -361,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 361
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 363 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", 364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 365 bio_size);
366 return -ENOMEM; 366 return -ENOMEM;
367 } 367 }
@@ -564,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
564 master_dev->bio->bi_max_vecs); 564 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 565 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 566 EXOFS_DBGMSG(
567 "Faild to allocate BIO size=%u\n", 567 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 568 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 569 ret = -ENOMEM;
570 goto out; 570 goto out;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
153 153
154 inode->i_ctime = CURRENT_TIME; 154 inode->i_ctime = CURRENT_TIME;
155 inode_inc_link_count(inode); 155 inode_inc_link_count(inode);
156 atomic_inc(&inode->i_count); 156 ihold(inode);
157 157
158 return exofs_add_nondir(dentry, inode); 158 return exofs_add_nondir(dentry, inode);
159} 159}
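ihold() replaces direct atomic_inc(&inode->i_count) so that i_count is no longer touched outside the VFS. In the series that introduced it, the helper is essentially the following (a sketch, not the exofs code):

void ihold(struct inode *inode)
{
	/* must only be called on an inode that already holds a reference */
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}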
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 047e92fa3af8..8c6c4669b381 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
150 return &oi->vfs_inode; 150 return &oi->vfs_inode;
151} 151}
152 152
153static void exofs_i_callback(struct rcu_head *head)
154{
155 struct inode *inode = container_of(head, struct inode, i_rcu);
156 INIT_LIST_HEAD(&inode->i_dentry);
157 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
158}
159
153/* 160/*
154 * Remove an inode from the cache 161 * Remove an inode from the cache
155 */ 162 */
156static void exofs_destroy_inode(struct inode *inode) 163static void exofs_destroy_inode(struct inode *inode)
157{ 164{
158 kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); 165 call_rcu(&inode->i_rcu, exofs_i_callback);
159} 166}
160 167
161/* 168/*
@@ -659,19 +666,19 @@ free_bdi:
659/* 666/*
660 * Set up the superblock (calls exofs_fill_super eventually) 667 * Set up the superblock (calls exofs_fill_super eventually)
661 */ 668 */
662static int exofs_get_sb(struct file_system_type *type, 669static struct dentry *exofs_mount(struct file_system_type *type,
663 int flags, const char *dev_name, 670 int flags, const char *dev_name,
664 void *data, struct vfsmount *mnt) 671 void *data)
665{ 672{
666 struct exofs_mountopt opts; 673 struct exofs_mountopt opts;
667 int ret; 674 int ret;
668 675
669 ret = parse_options(data, &opts); 676 ret = parse_options(data, &opts);
670 if (ret) 677 if (ret)
671 return ret; 678 return ERR_PTR(ret);
672 679
673 opts.dev_name = dev_name; 680 opts.dev_name = dev_name;
674 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); 681 return mount_nodev(type, flags, &opts, exofs_fill_super);
675} 682}
676 683
677/* 684/*
@@ -809,7 +816,7 @@ static const struct export_operations exofs_export_ops = {
809static struct file_system_type exofs_type = { 816static struct file_system_type exofs_type = {
810 .owner = THIS_MODULE, 817 .owner = THIS_MODULE,
811 .name = "exofs", 818 .name = "exofs",
812 .get_sb = exofs_get_sb, 819 .mount = exofs_mount,
813 .kill_sb = generic_shutdown_super, 820 .kill_sb = generic_shutdown_super,
814}; 821};
815 822
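The exofs_get_sb to exofs_mount change follows the generic pattern of the .get_sb to .mount conversion: the method now returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount, and get_sb_nodev/get_sb_bdev become mount_nodev/mount_bdev. A skeleton of the conversion for a hypothetical filesystem (examplefs and examplefs_parse_options are illustrative names, not a real API):

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* parse options first; errors become ERR_PTR() values */
	int err = examplefs_parse_options(data);

	if (err)
		return ERR_PTR(err);

	return mount_bdev(fs_type, flags, dev_name, data,
			  examplefs_fill_super);
}

static struct file_system_type examplefs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};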
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..4b6825740dd5 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
43 void *context) 43 void *context)
44{ 44{
45 struct dentry *dentry, *toput = NULL; 45 struct dentry *dentry, *toput = NULL;
46 struct inode *inode;
46 47
47 if (acceptable(context, result)) 48 if (acceptable(context, result))
48 return result; 49 return result;
49 50
50 spin_lock(&dcache_lock); 51 inode = result->d_inode;
51 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 52 spin_lock(&inode->i_lock);
52 dget_locked(dentry); 53 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
53 spin_unlock(&dcache_lock); 54 dget(dentry);
55 spin_unlock(&inode->i_lock);
54 if (toput) 56 if (toput)
55 dput(toput); 57 dput(toput);
56 if (dentry != result && acceptable(context, dentry)) { 58 if (dentry != result && acceptable(context, dentry)) {
57 dput(result); 59 dput(result);
58 return dentry; 60 return dentry;
59 } 61 }
60 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
61 toput = dentry; 63 toput = dentry;
62 } 64 }
63 spin_unlock(&dcache_lock); 65 spin_unlock(&inode->i_lock);
64 66
65 if (toput) 67 if (toput)
66 dput(toput); 68 dput(toput);
@@ -74,21 +76,20 @@ static struct dentry *
74find_disconnected_root(struct dentry *dentry) 76find_disconnected_root(struct dentry *dentry)
75{ 77{
76 dget(dentry); 78 dget(dentry);
77 spin_lock(&dentry->d_lock); 79 while (!IS_ROOT(dentry)) {
78 while (!IS_ROOT(dentry) && 80 struct dentry *parent = dget_parent(dentry);
79 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { 81
80 struct dentry *parent = dentry->d_parent; 82 if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
81 dget(parent); 83 dput(parent);
82 spin_unlock(&dentry->d_lock); 84 break;
85 }
86
83 dput(dentry); 87 dput(dentry);
84 dentry = parent; 88 dentry = parent;
85 spin_lock(&dentry->d_lock);
86 } 89 }
87 spin_unlock(&dentry->d_lock);
88 return dentry; 90 return dentry;
89} 91}
90 92
91
92/* 93/*
93 * Make sure target_dir is fully connected to the dentry tree. 94 * Make sure target_dir is fully connected to the dentry tree.
94 * 95 *
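The rewritten find_disconnected_root() shows the lock-light way to walk toward the root: instead of holding d_lock across iterations, each step takes a stable reference on the parent with dget_parent() and drops the child. The same pattern in isolation (a sketch; the predicate name is illustrative):

/* Walk up from dentry while the parent satisfies keep_climbing();
 * returns a referenced dentry that the caller must dput(). */
static struct dentry *climb_while(struct dentry *dentry,
				  bool (*keep_climbing)(struct dentry *))
{
	dget(dentry);
	while (!IS_ROOT(dentry)) {
		struct dentry *parent = dget_parent(dentry);

		if (!keep_climbing(parent)) {
			dput(parent);
			break;
		}
		dput(dentry);
		dentry = parent;
	}
	return dentry;
}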
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bada..7b4180554a62 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
232} 232}
233 233
234int 234int
235ext2_check_acl(struct inode *inode, int mask) 235ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
236{ 236{
237 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 237 struct posix_acl *acl;
238
239 if (flags & IPERM_FLAG_RCU) {
240 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
241 return -ECHILD;
242 return -EAGAIN;
243 }
238 244
245 acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
239 if (IS_ERR(acl)) 246 if (IS_ERR(acl))
240 return PTR_ERR(acl); 247 return PTR_ERR(acl);
241 if (acl) { 248 if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac44..c939b7b12099 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_check_acl (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int, unsigned int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
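Both the ext2 and (below) ext3 variants of check_acl grow the same RCU-walk protocol: when called with IPERM_FLAG_RCU the function may not block, so it can only answer from the ACL cache. It returns -ECHILD to drop out of RCU walk when an ACL may exist, or -EAGAIN when the cache says there is no ACL, letting the ordinary mode bits decide. The shape of the pattern, filesystem-independent (a sketch built around the same generic helpers the hunk uses; examplefs_get_acl is an illustrative name):

static int examplefs_check_acl(struct inode *inode, int mask,
			       unsigned int flags)
{
	struct posix_acl *acl;

	if (flags & IPERM_FLAG_RCU) {
		/* cannot sleep: only the negative-cache lookup is allowed */
		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
			return -ECHILD;	/* unknown, retry in ref-walk mode */
		return -EAGAIN;		/* no ACL cached, use mode bits */
	}

	acl = examplefs_get_acl(inode, ACL_TYPE_ACCESS);	/* may block */
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
	return -EAGAIN;
}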
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c6c684b44ea1..0d06f4e75699 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
646 return here; 646 return here;
647} 647}
648 648
649/* 649/**
650 * ext2_try_to_allocate() 650 * ext2_try_to_allocate()
651 * @sb: superblock 651 * @sb: superblock
652 * @handle: handle to this transaction
653 * @group: given allocation block group 652 * @group: given allocation block group
654 * @bitmap_bh: bufferhead holds the block bitmap 653 * @bitmap_bh: bufferhead holds the block bitmap
655 * @grp_goal: given target block within the group 654 * @grp_goal: given target block within the group
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..47cda410b548 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
28 28
29typedef struct ext2_dir_entry_2 ext2_dirent; 29typedef struct ext2_dir_entry_2 ext2_dirent;
30 30
31/*
32 * Tests against MAX_REC_LEN etc were put in place for 64k block
33 * sizes; if that is not possible on this arch, we can skip
34 * those tests and speed things up.
35 */
31static inline unsigned ext2_rec_len_from_disk(__le16 dlen) 36static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
32{ 37{
33 unsigned len = le16_to_cpu(dlen); 38 unsigned len = le16_to_cpu(dlen);
34 39
40#if (PAGE_CACHE_SIZE >= 65536)
35 if (len == EXT2_MAX_REC_LEN) 41 if (len == EXT2_MAX_REC_LEN)
36 return 1 << 16; 42 return 1 << 16;
43#endif
37 return len; 44 return len;
38} 45}
39 46
40static inline __le16 ext2_rec_len_to_disk(unsigned len) 47static inline __le16 ext2_rec_len_to_disk(unsigned len)
41{ 48{
49#if (PAGE_CACHE_SIZE >= 65536)
42 if (len == (1 << 16)) 50 if (len == (1 << 16))
43 return cpu_to_le16(EXT2_MAX_REC_LEN); 51 return cpu_to_le16(EXT2_MAX_REC_LEN);
44 else 52 else
45 BUG_ON(len > (1 << 16)); 53 BUG_ON(len > (1 << 16));
54#endif
46 return cpu_to_le16(len); 55 return cpu_to_le16(len);
47} 56}
48 57
@@ -98,7 +107,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
98 if (IS_DIRSYNC(dir)) { 107 if (IS_DIRSYNC(dir)) {
99 err = write_one_page(page, 1); 108 err = write_one_page(page, 1);
100 if (!err) 109 if (!err)
101 err = ext2_sync_inode(dir); 110 err = sync_inode_metadata(dir, 1);
102 } else { 111 } else {
103 unlock_page(page); 112 unlock_page(page);
104 } 113 }
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
129 p = (ext2_dirent *)(kaddr + offs); 138 p = (ext2_dirent *)(kaddr + offs);
130 rec_len = ext2_rec_len_from_disk(p->rec_len); 139 rec_len = ext2_rec_len_from_disk(p->rec_len);
131 140
132 if (rec_len < EXT2_DIR_REC_LEN(1)) 141 if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
133 goto Eshort; 142 goto Eshort;
134 if (rec_len & 3) 143 if (unlikely(rec_len & 3))
135 goto Ealign; 144 goto Ealign;
136 if (rec_len < EXT2_DIR_REC_LEN(p->name_len)) 145 if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
137 goto Enamelen; 146 goto Enamelen;
138 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) 147 if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
139 goto Espan; 148 goto Espan;
140 if (le32_to_cpu(p->inode) > max_inumber) 149 if (unlikely(le32_to_cpu(p->inode) > max_inumber))
141 goto Einumber; 150 goto Einumber;
142 } 151 }
143 if (offs != limit) 152 if (offs != limit)
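The PAGE_CACHE_SIZE >= 65536 guards matter because a 64 KiB directory block cannot store its own length in the 16-bit rec_len field, so EXT2_MAX_REC_LEN serves as an escape value; on architectures with smaller pages the checks are dead code and can be compiled out. A runnable userspace round-trip of the encoding (little-endian assumed for brevity; EXT2_MAX_REC_LEN is 65535, as in the kernel headers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXT2_MAX_REC_LEN ((1 << 16) - 1)	/* 65535 */

static unsigned rec_len_from_disk(uint16_t dlen)
{
	return dlen == EXT2_MAX_REC_LEN ? (1 << 16) : dlen;
}

static uint16_t rec_len_to_disk(unsigned len)
{
	assert(len <= (1 << 16));
	return len == (1 << 16) ? EXT2_MAX_REC_LEN : (uint16_t)len;
}

int main(void)
{
	unsigned lens[] = { 12, 4096, 65532, 1 << 16 };

	for (unsigned i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("%u -> 0x%04x -> %u\n", lens[i],
		       rec_len_to_disk(lens[i]),
		       rec_len_from_disk(rec_len_to_disk(lens[i])));
	return 0;
}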
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, struct writeback_control *); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_evict_inode(struct inode *); 122extern void ext2_evict_inode(struct inode *);
123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 123extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern int ext2_setattr (struct dentry *, struct iattr *); 124extern int ext2_setattr (struct dentry *, struct iattr *);
126extern void ext2_set_inode_flags(struct inode *inode); 125extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
458 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
459 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
460 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
461 * picture as after the successful ext2_get_block(), excpet that in one 461 * picture as after the successful ext2_get_block(), except that in one
462 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
463 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
464 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
@@ -662,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
662 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
663 /* 663 /*
664 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
665 * the chain(ext3_get_branch() returns -EAGAIN err), or 665 * the chain(ext2_get_branch() returns -EAGAIN err), or
666 * if the chain has been changed after we grab the semaphore, 666 * if the chain has been changed after we grab the semaphore,
667 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
668 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1204 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1205 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1206 ext2_sync_inode (inode); 1206 sync_inode_metadata(inode, 1);
1207 } else { 1207 } else {
1208 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1209 } 1209 }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1524} 1524}
1525 1525
1526int ext2_sync_inode(struct inode *inode)
1527{
1528 struct writeback_control wbc = {
1529 .sync_mode = WB_SYNC_ALL,
1530 .nr_to_write = 0, /* sys_fsync did this */
1531 };
1532 return sync_inode(inode, &wbc);
1533}
1534
1535int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1536{ 1527{
1537 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..2e1d8341d827 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
67 inode = NULL; 67 inode = NULL;
68 if (ino) { 68 if (ino) {
69 inode = ext2_iget(dir->i_sb, ino); 69 inode = ext2_iget(dir->i_sb, ino);
70 if (unlikely(IS_ERR(inode))) { 70 if (IS_ERR(inode)) {
71 if (PTR_ERR(inode) == -ESTALE) { 71 if (PTR_ERR(inode) == -ESTALE) {
72 ext2_error(dir->i_sb, __func__, 72 ext2_error(dir->i_sb, __func__,
73 "deleted inode referenced: %lu", 73 "deleted inode referenced: %lu",
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
206 206
207 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
208 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
209 atomic_inc(&inode->i_count); 209 ihold(inode);
210 210
211 err = ext2_add_link(dentry, inode); 211 err = ext2_add_link(dentry, inode);
212 if (!err) { 212 if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1ec602673ea8..7731695e65d9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
44static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
45 45
46void ext2_error (struct super_block * sb, const char * function, 46void ext2_error(struct super_block *sb, const char *function,
47 const char * fmt, ...) 47 const char *fmt, ...)
48{ 48{
49 struct va_format vaf;
49 va_list args; 50 va_list args;
50 struct ext2_sb_info *sbi = EXT2_SB(sb); 51 struct ext2_sb_info *sbi = EXT2_SB(sb);
51 struct ext2_super_block *es = sbi->s_es; 52 struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
59 } 60 }
60 61
61 va_start(args, fmt); 62 va_start(args, fmt);
62 printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); 63
63 vprintk(fmt, args); 64 vaf.fmt = fmt;
64 printk("\n"); 65 vaf.va = &args;
66
67 printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
68 sb->s_id, function, &vaf);
69
65 va_end(args); 70 va_end(args);
66 71
67 if (test_opt(sb, ERRORS_PANIC)) 72 if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function,
76void ext2_msg(struct super_block *sb, const char *prefix, 81void ext2_msg(struct super_block *sb, const char *prefix,
77 const char *fmt, ...) 82 const char *fmt, ...)
78{ 83{
84 struct va_format vaf;
79 va_list args; 85 va_list args;
80 86
81 va_start(args, fmt); 87 va_start(args, fmt);
82 printk("%sEXT2-fs (%s): ", prefix, sb->s_id); 88
83 vprintk(fmt, args); 89 vaf.fmt = fmt;
84 printk("\n"); 90 vaf.va = &args;
91
92 printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
93
85 va_end(args); 94 va_end(args);
86} 95}
87 96
@@ -161,11 +170,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
161 return &ei->vfs_inode; 170 return &ei->vfs_inode;
162} 171}
163 172
164static void ext2_destroy_inode(struct inode *inode) 173static void ext2_i_callback(struct rcu_head *head)
165{ 174{
175 struct inode *inode = container_of(head, struct inode, i_rcu);
176 INIT_LIST_HEAD(&inode->i_dentry);
166 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); 177 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
167} 178}
168 179
180static void ext2_destroy_inode(struct inode *inode)
181{
182 call_rcu(&inode->i_rcu, ext2_i_callback);
183}
184
169static void init_once(void *foo) 185static void init_once(void *foo)
170{ 186{
171 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; 187 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
@@ -747,15 +763,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
747 __le32 features; 763 __le32 features;
748 int err; 764 int err;
749 765
766 err = -ENOMEM;
750 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 767 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
751 if (!sbi) 768 if (!sbi)
752 return -ENOMEM; 769 goto failed_unlock;
753 770
754 sbi->s_blockgroup_lock = 771 sbi->s_blockgroup_lock =
755 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 772 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
756 if (!sbi->s_blockgroup_lock) { 773 if (!sbi->s_blockgroup_lock) {
757 kfree(sbi); 774 kfree(sbi);
758 return -ENOMEM; 775 goto failed_unlock;
759 } 776 }
760 sb->s_fs_info = sbi; 777 sb->s_fs_info = sbi;
761 sbi->s_sb_block = sb_block; 778 sbi->s_sb_block = sb_block;
@@ -1107,6 +1124,7 @@ failed_sbi:
1107 sb->s_fs_info = NULL; 1124 sb->s_fs_info = NULL;
1108 kfree(sbi->s_blockgroup_lock); 1125 kfree(sbi->s_blockgroup_lock);
1109 kfree(sbi); 1126 kfree(sbi);
1127failed_unlock:
1110 return ret; 1128 return ret;
1111} 1129}
1112 1130
@@ -1219,9 +1237,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1219 } 1237 }
1220 1238
1221 es = sbi->s_es; 1239 es = sbi->s_es;
1222 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1240 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
1223 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1224 invalidate_inodes(sb)) {
1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1241 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1226 "xip flag with busy inodes while remounting"); 1242 "xip flag with busy inodes while remounting");
1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1243 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
@@ -1356,10 +1372,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1356 return 0; 1372 return 0;
1357} 1373}
1358 1374
1359static int ext2_get_sb(struct file_system_type *fs_type, 1375static struct dentry *ext2_mount(struct file_system_type *fs_type,
1360 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1376 int flags, const char *dev_name, void *data)
1361{ 1377{
1362 return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); 1378 return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
1363} 1379}
1364 1380
1365#ifdef CONFIG_QUOTA 1381#ifdef CONFIG_QUOTA
@@ -1473,7 +1489,7 @@ out:
1473static struct file_system_type ext2_fs_type = { 1489static struct file_system_type ext2_fs_type = {
1474 .owner = THIS_MODULE, 1490 .owner = THIS_MODULE,
1475 .name = "ext2", 1491 .name = "ext2",
1476 .get_sb = ext2_get_sb, 1492 .mount = ext2_mount,
1477 .kill_sb = kill_block_super, 1493 .kill_sb = kill_block_super,
1478 .fs_flags = FS_REQUIRES_DEV, 1494 .fs_flags = FS_REQUIRES_DEV,
1479}; 1495};
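The ext2_error/ext2_msg rewrite is the standard conversion to %pV: rather than splitting one message across several printk calls, which can interleave with other CPUs' output, the va_list is wrapped in a struct va_format and emitted in a single printk. The core of the pattern (a sketch of a generic helper; the prefix string is illustrative):

#include <linux/kernel.h>

static void examplefs_msg(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* one printk, so the line cannot be interleaved mid-message */
	printk("%sEXAMPLEFS: %pV\n", prefix, &vaf);
	va_end(args);
}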
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..c2e4dce984d2 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
199 goto found; 199 goto found;
200 entry = next; 200 entry = next;
201 } 201 }
202 /* Check the remaining name entries */
203 while (!IS_LAST_ENTRY(entry)) {
204 struct ext2_xattr_entry *next =
205 EXT2_XATTR_NEXT(entry);
206 if ((char *)next >= end)
207 goto bad_block;
208 entry = next;
209 }
210 if (ext2_xattr_cache_insert(bh)) 202 if (ext2_xattr_cache_insert(bh))
211 ea_idebug(inode, "cache insert failed"); 203 ea_idebug(inode, "cache insert failed");
212 error = -ENODATA; 204 error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
355/* 347/*
356 * ext2_xattr_set() 348 * ext2_xattr_set()
357 * 349 *
358 * Create, replace or remove an extended attribute for this inode. Buffer 350 * Create, replace or remove an extended attribute for this inode. Value
359 * is NULL to remove an existing extended attribute, and non-NULL to 351 * is NULL to remove an existing extended attribute, and non-NULL to
360 * either replace an existing extended attribute, or create a new extended 352 * either replace an existing extended attribute, or create a new extended
361 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 353 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -699,7 +691,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; 691 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
700 inode->i_ctime = CURRENT_TIME_SEC; 692 inode->i_ctime = CURRENT_TIME_SEC;
701 if (IS_SYNC(inode)) { 693 if (IS_SYNC(inode)) {
702 error = ext2_sync_inode (inode); 694 error = sync_inode_metadata(inode, 1);
703 /* In case sync failed due to ENOSPC the inode was actually 695 /* In case sync failed due to ENOSPC the inode was actually
704 * written (only some dirty data were not) so we just proceed 696 * written (only some dirty data were not) so we just proceed
705 * as if nothing happened and cleanup the unused block */ 697 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe212183..e4fa49e6c539 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
240} 240}
241 241
242int 242int
243ext3_check_acl(struct inode *inode, int mask) 243ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
244{ 244{
245 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 245 struct posix_acl *acl;
246
247 if (flags & IPERM_FLAG_RCU) {
248 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
249 return -ECHILD;
250 return -EAGAIN;
251 }
246 252
253 acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
247 if (IS_ERR(acl)) 254 if (IS_ERR(acl))
248 return PTR_ERR(acl); 255 return PTR_ERR(acl);
249 if (acl) { 256 if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de9..5faf8048e906 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_check_acl (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int, unsigned int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4ded..045995c8ce5a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
20#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
39 40
40#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 41#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
41 42
43/*
44 * Calculate the block group number and offset, given a block number
45 */
46static void ext3_get_group_no_and_offset(struct super_block *sb,
47 ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
48{
49 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
50
51 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
52 if (offsetp)
53 *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
54 if (blockgrpp)
55 *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
56}
57
42/** 58/**
43 * ext3_get_group_desc() -- load group descriptor from disk 59 * ext3_get_group_desc() -- load group descriptor from disk
44 * @sb: super block 60 * @sb: super block
@@ -792,9 +808,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
792 if (here < 0) 808 if (here < 0)
793 here = 0; 809 here = 0;
794 810
795 p = ((char *)bh->b_data) + (here >> 3); 811 p = bh->b_data + (here >> 3);
796 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); 812 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
797 next = (r - ((char *)bh->b_data)) << 3; 813 next = (r - bh->b_data) << 3;
798 814
799 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) 815 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
800 return next; 816 return next;
@@ -810,8 +826,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
810 826
811/** 827/**
812 * claim_block() 828 * claim_block()
829 * @lock: the spin lock for this block group
813 * @block: the free block (group relative) to allocate 830 * @block: the free block (group relative) to allocate
814 * @bh: the bufferhead containts the block group bitmap 831 * @bh: the buffer_head contains the block group bitmap
815 * 832 *
816 * We think we can allocate this block in this bitmap. Try to set the bit. 833 * We think we can allocate this block in this bitmap. Try to set the bit.
817 * If that succeeds then check that nobody has allocated and then freed the 834 * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +973,11 @@ fail_access:
956 * but we will shift to the place where start_block is, 973 * but we will shift to the place where start_block is,
957 * then start from there, when looking for a reservable space. 974 * then start from there, when looking for a reservable space.
958 * 975 *
959 * @size: the target new reservation window size 976 * @my_rsv: the reservation window
977 *
978 * @sb: the super block
960 * 979 *
961 * @group_first_block: the first block we consider to start 980 * @start_block: the first block we consider to start
962 * the real search from 981 * the real search from
963 * 982 *
964 * @last_block: 983 * @last_block:
@@ -1084,7 +1103,7 @@ static int find_next_reservable_window(
1084 * 1103 *
1085 * failed: we failed to find a reservation window in this group 1104 * failed: we failed to find a reservation window in this group
1086 * 1105 *
1087 * @rsv: the reservation 1106 * @my_rsv: the reservation window
1088 * 1107 *
1089 * @grp_goal: The goal (group-relative). It is where the search for a 1108 * @grp_goal: The goal (group-relative). It is where the search for a
1090 * free reservable space should start from. 1109 * free reservable space should start from.
@@ -1273,8 +1292,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1273 * @group: given allocation block group 1292 * @group: given allocation block group
1274 * @bitmap_bh: bufferhead holds the block bitmap 1293 * @bitmap_bh: bufferhead holds the block bitmap
1275 * @grp_goal: given target block within the group 1294 * @grp_goal: given target block within the group
1276 * @count: target number of blocks to allocate
1277 * @my_rsv: reservation window 1295 * @my_rsv: reservation window
1296 * @count: target number of blocks to allocate
1278 * @errp: pointer to store the error code 1297 * @errp: pointer to store the error code
1279 * 1298 *
1280 * This is the main function used to allocate a new block and its reservation 1299 * This is the main function used to allocate a new block and its reservation
@@ -1882,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1882 return ext3_bg_num_gdb_meta(sb,group); 1901 return ext3_bg_num_gdb_meta(sb,group);
1883 1902
1884} 1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for
1915 * free blocks. When a free block is found, it tries to allocate this block and
1916 * consecutive free blocks to get the biggest free extent possible, until it
1917 * reaches any used block. It then issues a TRIM command on this extent and
1918 * frees the extent in the block bitmap. This is done until the whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921 ext3_grpblk_t start, ext3_grpblk_t max,
1922 ext3_grpblk_t minblocks)
1923{
1924 handle_t *handle;
1925 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926 ext3_fsblk_t discard_block;
1927 struct ext3_sb_info *sbi;
1928 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929 struct ext3_group_desc *gdp;
1930 int err = 0, ret = 0;
1931
1932 /*
1933 * We will update one block bitmap, and one group descriptor
1934 */
1935 handle = ext3_journal_start_sb(sb, 2);
1936 if (IS_ERR(handle))
1937 return PTR_ERR(handle);
1938
1939 bitmap_bh = read_block_bitmap(sb, group);
1940 if (!bitmap_bh) {
1941 err = -EIO;
1942 goto err_out;
1943 }
1944
1945 BUFFER_TRACE(bitmap_bh, "getting undo access");
1946 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947 if (err)
1948 goto err_out;
1949
1950 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951 if (!gdp) {
1952 err = -EIO;
1953 goto err_out;
1954 }
1955
1956 BUFFER_TRACE(gdp_bh, "get_write_access");
1957 err = ext3_journal_get_write_access(handle, gdp_bh);
1958 if (err)
1959 goto err_out;
1960
1961 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962 sbi = EXT3_SB(sb);
1963
1964 /* Walk through the whole group */
1965 while (start < max) {
1966 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967 if (start < 0)
1968 break;
1969 next = start;
1970
1971 /*
1972 * Allocate contiguous free extents by setting bits in the
1973 * block bitmap
1974 */
1975 while (next < max
1976 && claim_block(sb_bgl_lock(sbi, group),
1977 next, bitmap_bh)) {
1978 next++;
1979 }
1980
1981 /* We did not claim any blocks */
1982 if (next == start)
1983 continue;
1984
1985 discard_block = (ext3_fsblk_t)start +
1986 ext3_group_first_block_no(sb, group);
1987
1988 /* Update counters */
1989 spin_lock(sb_bgl_lock(sbi, group));
1990 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks)
1996 goto free_extent;
1997
1998 /* Send the TRIM command down to the device */
1999 err = sb_issue_discard(sb, discard_block, next - start,
2000 GFP_NOFS, 0);
2001 count += (next - start);
2002free_extent:
2003 freed = 0;
2004
2005 /*
2006 * Clear bits in the bitmap
2007 */
2008 for (bit = start; bit < next; bit++) {
2009 BUFFER_TRACE(bitmap_bh, "clear bit");
2010 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2011 bit, bitmap_bh->b_data)) {
2012 ext3_error(sb, __func__,
2013 "bit already cleared for block "E3FSBLK,
2014 (unsigned long)bit);
2015 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2016 } else {
2017 freed++;
2018 }
2019 }
2020
2021 /* Update counters */
2022 spin_lock(sb_bgl_lock(sbi, group));
2023 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2024 spin_unlock(sb_bgl_lock(sbi, group));
2025 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2026
2027 start = next;
2028 if (err < 0) {
2029 if (err != -EOPNOTSUPP)
2030 ext3_warning(sb, __func__, "Discard command "
2031 "returned error %d\n", err);
2032 break;
2033 }
2034
2035 if (fatal_signal_pending(current)) {
2036 err = -ERESTARTSYS;
2037 break;
2038 }
2039
2040 cond_resched();
2041
2042 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks)
2044 break;
2045 }
2046
2047 /* We dirtied the bitmap block */
2048 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2049 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2050 if (!err)
2051 err = ret;
2052
2053 /* And the group descriptor block */
2054 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2055 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2056 if (!err)
2057 err = ret;
2058
2059 ext3_debug("trimmed %d blocks in the group %d\n",
2060 count, group);
2061
2062err_out:
2063 if (err)
2064 count = err;
2065 ext3_journal_stop(handle);
2066 brelse(bitmap_bh);
2067
2068 return count;
2069}
2070
2071/**
2072 * ext3_trim_fs() -- trim ioctl handle function
2073 * @sb: superblock for filesystem
2074 * @range: fstrim_range structure describing the region to trim:
2075 * start is the first Byte to trim, len the number of Bytes to
2076 * trim from start, and minlen the minimum extent length in Bytes
2077 *
2078 * ext3_trim_fs goes through all allocation groups containing Bytes from
2079 * start to start+len. For each such group the ext3_trim_all_free function
2080 * is invoked to trim all free space.
2081 */
2082int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2083{
2084 ext3_grpblk_t last_block, first_block, free_blocks;
2085 unsigned long first_group, last_group;
2086 unsigned long group, ngroups;
2087 struct ext3_group_desc *gdp;
2088 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2089 uint64_t start, len, minlen, trimmed;
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0;
2092
2093 start = range->start >> sb->s_blocksize_bits;
2094 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0;
2097
2098 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2099 return -EINVAL;
2100 if (start >= max_blks)
2101 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks)
2107 len = max_blks - start;
2108
2109 ngroups = EXT3_SB(sb)->s_groups_count;
2110 smp_rmb();
2111
2112 /* Determine first and last group to examine based on start and len */
2113 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2114 &first_group, &first_block);
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2116 &last_group, &last_block);
2117 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2118 last_block = EXT3_BLOCKS_PER_GROUP(sb);
2119
2120 if (first_group > last_group)
2121 return -EINVAL;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2129 if (free_blocks < minlen)
2130 continue;
2131
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb))
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
2134 else
2135 last_block = first_block + len;
2136
2137 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen);
2139 if (ret < 0)
2140 break;
2141
2142 trimmed += ret;
2143 first_block = 0;
2144 }
2145
2146 if (ret >= 0)
2147 ret = 0;
2148
2149out:
2150 range->len = trimmed * sb->s_blocksize;
2151
2152 return ret;
2153}
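ext3_get_group_no_and_offset() above is plain div/mod arithmetic relative to s_first_data_block. A runnable userspace check of the same computation (the constants model a 1 KiB-block filesystem, where the first data block is 1 and there are 8192 blocks per group):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t first_data_block = 1;	/* 1 KiB blocks */
	const uint32_t blocks_per_group = 8192;
	uint64_t blocknr = 123456;

	uint64_t rel = blocknr - first_data_block;
	unsigned long group = rel / blocks_per_group;
	uint32_t offset = rel % blocks_per_group;

	printf("block %llu -> group %lu, offset %u\n",
	       (unsigned long long)blocknr, group, offset);
	/* prints: block 123456 -> group 15, offset 575 */
	return 0;
}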
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf6..34f0a072b935 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = ext3_rec_len_from_disk(de->rec_len); 70 const int rlen = ext3_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0) 74 else if (unlikely(rlen % 4 != 0))
75 error_msg = "rec_len % 4 != 0"; 75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) 76 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
77 error_msg = "rec_len is too small for name_len"; 77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 78 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
79 error_msg = "directory entry across blocks"; 79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) > 80 else if (unlikely(le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) 81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (unlikely(error_msg != NULL))
85 ext3_error (dir->i_sb, function, 85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 (unsigned long) le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91
91 return error_msg == NULL ? 1 : 0; 92 return error_msg == NULL ? 1 : 0;
92} 93}
93 94
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
94 BLKDEV_IFL_WAIT);
95 return ret; 94 return ret;
96} 95}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4ab72db3559e..9724aef22460 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -570,9 +570,14 @@ got:
570 ei->i_state_flags = 0; 570 ei->i_state_flags = 0;
571 ext3_set_inode_state(inode, EXT3_STATE_NEW); 571 ext3_set_inode_state(inode, EXT3_STATE_NEW);
572 572
573 ei->i_extra_isize = 573 /* See comment in ext3_iget for explanation */
574 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 574 if (ino >= EXT3_FIRST_INO(sb) + 1 &&
575 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 575 EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
576 ei->i_extra_isize =
577 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
578 } else {
579 ei->i_extra_isize = 0;
580 }
576 581
577 ret = inode; 582 ret = inode;
578 dquot_initialize(inode); 583 dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..ae94f6d949f5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
498} 498}
499 499
500/** 500/**
501 * ext3_blks_to_allocate: Look up the block map and count the number 501 * ext3_blks_to_allocate - Look up the block map and count the number
502 * of direct blocks need to be allocated for the given branch. 502 * of direct blocks need to be allocated for the given branch.
503 * 503 *
504 * @branch: chain of indirect blocks 504 * @branch: chain of indirect blocks
@@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
536} 536}
537 537
538/** 538/**
539 * ext3_alloc_blocks: multiple allocate blocks needed for a branch 539 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
540 * @handle: handle for this transaction
541 * @inode: owner
542 * @goal: preferred place for allocation
540 * @indirect_blks: the number of blocks need to allocate for indirect 543 * @indirect_blks: the number of blocks need to allocate for indirect
541 * blocks 544 * blocks
542 * 545 * @blks: number of blocks need to allocated for direct blocks
543 * @new_blocks: on return it will store the new block numbers for 546 * @new_blocks: on return it will store the new block numbers for
544 * the indirect blocks(if needed) and the first direct block, 547 * the indirect blocks(if needed) and the first direct block,
545 * @blks: on return it will store the total number of allocated 548 * @err: here we store the error value
546 * direct blocks 549 *
550 * return the number of direct blocks allocated
547 */ 551 */
548static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, 552static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
549 ext3_fsblk_t goal, int indirect_blks, int blks, 553 ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -598,9 +602,11 @@ failed_out:
598 602
599/** 603/**
600 * ext3_alloc_branch - allocate and set up a chain of blocks. 604 * ext3_alloc_branch - allocate and set up a chain of blocks.
605 * @handle: handle for this transaction
601 * @inode: owner 606 * @inode: owner
602 * @indirect_blks: number of allocated indirect blocks 607 * @indirect_blks: number of allocated indirect blocks
603 * @blks: number of allocated direct blocks 608 * @blks: number of allocated direct blocks
609 * @goal: preferred place for allocation
604 * @offsets: offsets (in the blocks) to store the pointers to next. 610 * @offsets: offsets (in the blocks) to store the pointers to next.
605 * @branch: place to store the chain in. 611 * @branch: place to store the chain in.
606 * 612 *
@@ -700,10 +706,9 @@ failed:
700 706
701/** 707/**
702 * ext3_splice_branch - splice the allocated branch onto inode. 708 * ext3_splice_branch - splice the allocated branch onto inode.
709 * @handle: handle for this transaction
703 * @inode: owner 710 * @inode: owner
704 * @block: (logical) number of block we are adding 711 * @block: (logical) number of block we are adding
705 * @chain: chain of indirect blocks (with a missing link - see
706 * ext3_alloc_branch)
707 * @where: location of missing link 712 * @where: location of missing link
708 * @num: number of indirect blocks we are adding 713 * @num: number of indirect blocks we are adding
709 * @blks: number of direct blocks we are adding 714 * @blks: number of direct blocks we are adding
@@ -1696,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
1696 * doesn't seem much point in redirtying the page here. 1701 * doesn't seem much point in redirtying the page here.
1697 */ 1702 */
1698 ClearPageChecked(page); 1703 ClearPageChecked(page);
1699 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1700 ext3_get_block); 1705 ext3_get_block);
1701 if (ret != 0) { 1706 if (ret != 0) {
1702 ext3_journal_stop(handle); 1707 ext3_journal_stop(handle);
1703 goto out_unlock; 1708 goto out_unlock;
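block_prepare_write() is gone; __block_write_begin() is its replacement from the write_begin/write_end consolidation in fs/buffer.c, and it takes the same (page, offset, length, get_block) arguments, so this is a rename at the call site. The declaration assumed here is roughly:

int __block_write_begin(struct page *page, loff_t pos, unsigned len,
			get_block_t *get_block);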
@@ -2140,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2140 if (try_to_extend_transaction(handle, inode)) { 2145 if (try_to_extend_transaction(handle, inode)) {
2141 if (bh) { 2146 if (bh) {
2142 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2143 ext3_journal_dirty_metadata(handle, bh); 2148 if (ext3_journal_dirty_metadata(handle, bh))
2149 return;
2144 } 2150 }
2145 ext3_mark_inode_dirty(handle, inode); 2151 ext3_mark_inode_dirty(handle, inode);
2146 truncate_restart_transaction(handle, inode); 2152 truncate_restart_transaction(handle, inode);
2147 if (bh) { 2153 if (bh) {
2148 BUFFER_TRACE(bh, "retaking write access"); 2154 BUFFER_TRACE(bh, "retaking write access");
2149 ext3_journal_get_write_access(handle, bh); 2155 if (ext3_journal_get_write_access(handle, bh))
2156 return;
2150 } 2157 }
2151 } 2158 }
2152 2159
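This hunk introduces the pattern most of the remaining ext3 changes repeat: the return values of ext3_journal_get_write_access() and ext3_journal_dirty_metadata() are checked instead of ignored, because a failure there usually means the journal has been aborted and further metadata updates can never commit. Distilled, for a hypothetical buffer bh:

err = ext3_journal_get_write_access(handle, bh);
if (err)
	return err;			/* journal likely aborted, stop here */
/* ... modify bh->b_data under the running handle ... */
err = ext3_journal_dirty_metadata(handle, bh);
if (err)
	ext3_std_error(sb, err);	/* log it and honor the errors= policy */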
@@ -2530,7 +2537,6 @@ void ext3_truncate(struct inode *inode)
2530 */ 2537 */
2531 } else { 2538 } else {
2532 /* Shared branch grows from an indirect block */ 2539 /* Shared branch grows from an indirect block */
2533 BUFFER_TRACE(partial->bh, "get_write_access");
2534 ext3_free_branches(handle, inode, partial->bh, 2540 ext3_free_branches(handle, inode, partial->bh,
2535 partial->p, 2541 partial->p,
2536 partial->p+1, (chain+n-1) - partial); 2542 partial->p+1, (chain+n-1) - partial);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783a..fc080dd561f7 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
276 mnt_drop_write(filp->f_path.mnt); 276 mnt_drop_write(filp->f_path.mnt);
277 return err; 277 return err;
278 } 278 }
279 case FITRIM: {
279 280
281 struct super_block *sb = inode->i_sb;
282 struct fstrim_range range;
283 int ret = 0;
284
285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM;
287
288 if (copy_from_user(&range, (struct fstrim_range *)arg,
289 sizeof(range)))
290 return -EFAULT;
291
292 ret = ext3_trim_fs(sb, &range);
293 if (ret < 0)
294 return ret;
295
296 if (copy_to_user((struct fstrim_range *)arg, &range,
297 sizeof(range)))
298 return -EFAULT;
299
300 return 0;
301 }
280 302
281 default: 303 default:
282 return -ENOTTY; 304 return -ENOTTY;
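FITRIM is the new batched-discard entry point: userspace hands in an fstrim_range and the filesystem discards its free extents in that range, writing the number of trimmed bytes back into range.len; ext3_trim_fs() itself is implemented elsewhere in this series. A minimal caller, assuming a filesystem mounted at /mnt and CAP_SYS_ADMIN:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = UINT64_MAX,	/* whole filesystem */
		.minlen = 0,		/* no minimum extent size */
	};
	int fd = open("/mnt", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}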
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..b27ba71810ec 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 858 struct buffer_head * bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 859 struct buffer_head * bh, *ret = NULL;
860 unsigned long start, block, b; 860 unsigned long start, block, b;
861 const u8 *name = entry->name;
861 int ra_max = 0; /* Number of bh's in the readahead 862 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 863 buffer, bh_use[] */
863 int ra_ptr = 0; /* Current index into readahead 864 int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
871 namelen = entry->len; 872 namelen = entry->len;
872 if (namelen > EXT3_NAME_LEN) 873 if (namelen > EXT3_NAME_LEN)
873 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == 0)) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
874 if (is_dx(dir)) { 885 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err); 886 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /* 887 /*
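The fast path works because every ext3 directory is created with "." and ".." as the first two entries of block 0 (see the ext3_mkdir() hunks later in this diff), so neither name can live anywhere else; and NFS really does send ".." lookups that would otherwise go through the htree probe. A hypothetical helper equivalent to the added test:

static int is_dot_or_dotdot(const struct qstr *entry)
{
	const u8 *name = entry->name;

	return entry->len <= 2 && name[0] == '.' &&
	       (name[1] == '.' || name[1] == 0);
}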
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
961 struct qstr *entry, struct ext3_dir_entry_2 **res_dir, 972 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
962 int *err) 973 int *err)
963{ 974{
964 struct super_block * sb; 975 struct super_block *sb = dir->i_sb;
965 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
966 u32 hash;
967 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
968 struct ext3_dir_entry_2 *de, *top;
969 struct buffer_head *bh; 978 struct buffer_head *bh;
970 unsigned long block; 979 unsigned long block;
971 int retval; 980 int retval;
972 int namelen = entry->len;
973 const u8 *name = entry->name;
974 981
975 sb = dir->i_sb; 982 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
977 if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
978 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
979 return NULL;
980 } else {
981 frame = frames;
982 frame->bh = NULL; /* for dx_release() */
983 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
984 dx_set_block(frame->at, 0); /* dx_root block is 0 */
985 }
986 hash = hinfo.hash;
987 do { 984 do {
988 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
989 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
990 goto errout; 987 goto errout;
991 de = (struct ext3_dir_entry_2 *) bh->b_data;
992 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
993 EXT3_DIR_REC_LEN(0));
994 for (; de < top; de = ext3_next_entry(de)) {
995 int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
996 + ((char *) de - bh->b_data);
997
998 if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
999 brelse(bh);
1000 *err = ERR_BAD_DX_DIR;
1001 goto errout;
1002 }
1003 988
1004 if (ext3_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, entry,
1005 *res_dir = de; 990 block << EXT3_BLOCK_SIZE_BITS(sb),
1006 dx_release(frames); 991 res_dir);
1007 return bh; 992 if (retval == 1) {
1008 } 993 dx_release(frames);
994 return bh;
1009 } 995 }
1010 brelse (bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1011 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1012 retval = ext3_htree_next_block(dir, hash, frame, 1003 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1013 frames, NULL); 1004 frames, NULL);
1014 if (retval < 0) { 1005 if (retval < 0) {
1015 ext3_warning(sb, __func__, 1006 ext3_warning(sb, __func__,
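The open-coded entry scan is replaced by search_dirblock(), the helper ext3_find_entry() already uses for the linear case. From this call site its contract appears to be: 1 means found (*res_dir is set and the caller keeps the pinned bh), 0 means not in this block, and -1 means the block failed ext3_check_dir_entry(). A sketch of the loop body, with dir_offset standing in for the computed block offset:

retval = search_dirblock(bh, dir, entry, dir_offset, res_dir);
if (retval == 1)		/* found: return the pinned buffer */
	return bh;
brelse(bh);
if (retval == -1) {		/* corrupt leaf block */
	*err = ERR_BAD_DX_DIR;
	return NULL;
}
/* retval == 0: keep walking via ext3_htree_next_block() */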
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1038 return ERR_PTR(-EIO);
1048 } 1039 }
1049 inode = ext3_iget(dir->i_sb, ino); 1040 inode = ext3_iget(dir->i_sb, ino);
1050 if (unlikely(IS_ERR(inode))) { 1041 if (IS_ERR(inode)) {
1051 if (PTR_ERR(inode) == -ESTALE) { 1042 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__, 1043 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu", 1044 "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1607 if (err) 1598 if (err)
1608 goto journal_error; 1599 goto journal_error;
1609 } 1600 }
1610 ext3_journal_dirty_metadata(handle, frames[0].bh); 1601 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1602 if (err)
1603 goto journal_error;
1611 } 1604 }
1612 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1613 if (!de) 1606 if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
1644 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) 1637 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1645 return -EIO; 1638 return -EIO;
1646 if (de == de_del) { 1639 if (de == de_del) {
1640 int err;
1641
1647 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
1648 ext3_journal_get_write_access(handle, bh); 1643 err = ext3_journal_get_write_access(handle, bh);
1644 if (err)
1645 goto journal_error;
1646
1649 if (pde) 1647 if (pde)
1650 pde->rec_len = ext3_rec_len_to_disk( 1648 pde->rec_len = ext3_rec_len_to_disk(
1651 ext3_rec_len_from_disk(pde->rec_len) + 1649 ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
1654 de->inode = 0; 1652 de->inode = 0;
1655 dir->i_version++; 1653 dir->i_version++;
1656 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1654 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657 ext3_journal_dirty_metadata(handle, bh); 1655 err = ext3_journal_dirty_metadata(handle, bh);
1656 if (err) {
1657journal_error:
1658 ext3_std_error(dir->i_sb, err);
1659 return err;
1660 }
1658 return 0; 1661 return 0;
1659 } 1662 }
1660 i += ext3_rec_len_from_disk(de->rec_len); 1663 i += ext3_rec_len_from_disk(de->rec_len);
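Note the placement of the new journal_error label: it sits inside the if (err) body, and the goto after a failed ext3_journal_get_write_access() jumps into that block, which is legal if unusual C. Distilled:

err = ext3_journal_get_write_access(handle, bh);
if (err)
	goto journal_error;
/* ... clear the entry and bump dir->i_version ... */
err = ext3_journal_dirty_metadata(handle, bh);
if (err) {
journal_error:
	ext3_std_error(dir->i_sb, err);
	return err;
}
return 0;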
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1762{ 1765{
1763 handle_t *handle; 1766 handle_t *handle;
1764 struct inode * inode; 1767 struct inode * inode;
1765 struct buffer_head * dir_block; 1768 struct buffer_head * dir_block = NULL;
1766 struct ext3_dir_entry_2 * de; 1769 struct ext3_dir_entry_2 * de;
1767 int err, retries = 0; 1770 int err, retries = 0;
1768 1771
@@ -1790,15 +1793,14 @@ retry:
1790 inode->i_fop = &ext3_dir_operations; 1793 inode->i_fop = &ext3_dir_operations;
1791 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1794 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1792 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1795 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1793 if (!dir_block) { 1796 if (!dir_block)
1794 drop_nlink(inode); /* is this nlink == 0? */ 1797 goto out_clear_inode;
1795 unlock_new_inode(inode); 1798
1796 ext3_mark_inode_dirty(handle, inode);
1797 iput (inode);
1798 goto out_stop;
1799 }
1800 BUFFER_TRACE(dir_block, "get_write_access"); 1799 BUFFER_TRACE(dir_block, "get_write_access");
1801 ext3_journal_get_write_access(handle, dir_block); 1800 err = ext3_journal_get_write_access(handle, dir_block);
1801 if (err)
1802 goto out_clear_inode;
1803
1802 de = (struct ext3_dir_entry_2 *) dir_block->b_data; 1804 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1803 de->inode = cpu_to_le32(inode->i_ino); 1805 de->inode = cpu_to_le32(inode->i_ino);
1804 de->name_len = 1; 1806 de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
1814 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1816 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1815 inode->i_nlink = 2; 1817 inode->i_nlink = 2;
1816 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1818 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1817 ext3_journal_dirty_metadata(handle, dir_block); 1819 err = ext3_journal_dirty_metadata(handle, dir_block);
1818 brelse (dir_block); 1820 if (err)
1819 ext3_mark_inode_dirty(handle, inode); 1821 goto out_clear_inode;
1820 err = ext3_add_entry (handle, dentry, inode); 1822
1823 err = ext3_mark_inode_dirty(handle, inode);
1824 if (!err)
1825 err = ext3_add_entry (handle, dentry, inode);
1826
1821 if (err) { 1827 if (err) {
1828out_clear_inode:
1822 inode->i_nlink = 0; 1829 inode->i_nlink = 0;
1823 unlock_new_inode(inode); 1830 unlock_new_inode(inode);
1824 ext3_mark_inode_dirty(handle, inode); 1831 ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
1827 } 1834 }
1828 inc_nlink(dir); 1835 inc_nlink(dir);
1829 ext3_update_dx_flag(dir); 1836 ext3_update_dx_flag(dir);
1830 ext3_mark_inode_dirty(handle, dir); 1837 err = ext3_mark_inode_dirty(handle, dir);
1838 if (err)
1839 goto out_clear_inode;
1840
1831 d_instantiate(dentry, inode); 1841 d_instantiate(dentry, inode);
1832 unlock_new_inode(inode); 1842 unlock_new_inode(inode);
1833out_stop: 1843out_stop:
1844 brelse(dir_block);
1834 ext3_journal_stop(handle); 1845 ext3_journal_stop(handle);
1835 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1846 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1836 goto retry; 1847 goto retry;
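Initializing dir_block to NULL is what makes the single brelse() at out_stop safe on every path: brelse() is a no-op when handed NULL, so failure paths taken before block 0 is read fall through cleanly. The resulting exit shape, sketched:

struct buffer_head *dir_block = NULL;	/* stays NULL until block 0 is read */

out_stop:
	brelse(dir_block);		/* brelse(NULL) does nothing */
	ext3_journal_stop(handle);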
@@ -2260,7 +2271,7 @@ retry:
2260 2271
2261 inode->i_ctime = CURRENT_TIME_SEC; 2272 inode->i_ctime = CURRENT_TIME_SEC;
2262 inc_nlink(inode); 2273 inc_nlink(inode);
2263 atomic_inc(&inode->i_count); 2274 ihold(inode);
2264 2275
2265 err = ext3_add_entry(handle, dentry, inode); 2276 err = ext3_add_entry(handle, dentry, inode);
2266 if (!err) { 2277 if (!err) {
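atomic_inc(&inode->i_count) becomes ihold(), the helper introduced alongside the inode-lock scalability work; callers must already hold a reference when they take another. Its definition in fs/inode.c is roughly:

void ihold(struct inode *inode)
{
	/* taking a new reference is only legal if one is already held */
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}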
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2353 goto end_rename; 2364 goto end_rename;
2354 } else { 2365 } else {
2355 BUFFER_TRACE(new_bh, "get write access"); 2366 BUFFER_TRACE(new_bh, "get write access");
2356 ext3_journal_get_write_access(handle, new_bh); 2367 retval = ext3_journal_get_write_access(handle, new_bh);
2368 if (retval)
2369 goto journal_error;
2357 new_de->inode = cpu_to_le32(old_inode->i_ino); 2370 new_de->inode = cpu_to_le32(old_inode->i_ino);
2358 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2371 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2359 EXT3_FEATURE_INCOMPAT_FILETYPE)) 2372 EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2362 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; 2375 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2363 ext3_mark_inode_dirty(handle, new_dir); 2376 ext3_mark_inode_dirty(handle, new_dir);
2364 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); 2377 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2365 ext3_journal_dirty_metadata(handle, new_bh); 2378 retval = ext3_journal_dirty_metadata(handle, new_bh);
2379 if (retval)
2380 goto journal_error;
2366 brelse(new_bh); 2381 brelse(new_bh);
2367 new_bh = NULL; 2382 new_bh = NULL;
2368 } 2383 }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2411 ext3_update_dx_flag(old_dir); 2426 ext3_update_dx_flag(old_dir);
2412 if (dir_bh) { 2427 if (dir_bh) {
2413 BUFFER_TRACE(dir_bh, "get_write_access"); 2428 BUFFER_TRACE(dir_bh, "get_write_access");
2414 ext3_journal_get_write_access(handle, dir_bh); 2429 retval = ext3_journal_get_write_access(handle, dir_bh);
2430 if (retval)
2431 goto journal_error;
2415 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2432 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2416 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); 2433 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2417 ext3_journal_dirty_metadata(handle, dir_bh); 2434 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2435 if (retval) {
2436journal_error:
2437 ext3_std_error(new_dir->i_sb, retval);
2438 goto end_rename;
2439 }
2418 drop_nlink(old_dir); 2440 drop_nlink(old_dir);
2419 if (new_inode) { 2441 if (new_inode) {
2420 drop_nlink(new_inode); 2442 drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0ccd7b12b73c..108b142e11ed 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
250 set_buffer_uptodate(gdb); 250 set_buffer_uptodate(gdb);
251 unlock_buffer(gdb); 251 unlock_buffer(gdb);
252 ext3_journal_dirty_metadata(handle, gdb); 252 err = ext3_journal_dirty_metadata(handle, gdb);
253 if (err) {
254 brelse(gdb);
255 goto exit_bh;
256 }
253 ext3_set_bit(bit, bh->b_data); 257 ext3_set_bit(bit, bh->b_data);
254 brelse(gdb); 258 brelse(gdb);
255 } 259 }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
269 err = PTR_ERR(gdb); 273 err = PTR_ERR(gdb);
270 goto exit_bh; 274 goto exit_bh;
271 } 275 }
272 ext3_journal_dirty_metadata(handle, gdb); 276 err = ext3_journal_dirty_metadata(handle, gdb);
277 if (err) {
278 brelse(gdb);
279 goto exit_bh;
280 }
273 ext3_set_bit(bit, bh->b_data); 281 ext3_set_bit(bit, bh->b_data);
274 brelse(gdb); 282 brelse(gdb);
275 } 283 }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
295 err = PTR_ERR(it); 303 err = PTR_ERR(it);
296 goto exit_bh; 304 goto exit_bh;
297 } 305 }
298 ext3_journal_dirty_metadata(handle, it); 306 err = ext3_journal_dirty_metadata(handle, it);
307 if (err) {
308 brelse(it);
309 goto exit_bh;
310 }
299 brelse(it); 311 brelse(it);
300 ext3_set_bit(bit, bh->b_data); 312 ext3_set_bit(bit, bh->b_data);
301 } 313 }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
306 318
307 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), 319 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
308 bh->b_data); 320 bh->b_data);
309 ext3_journal_dirty_metadata(handle, bh); 321 err = ext3_journal_dirty_metadata(handle, bh);
322 if (err)
323 goto exit_bh;
310 brelse(bh); 324 brelse(bh);
311 325
312 /* Mark unused entries in inode bitmap used */ 326 /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
319 333
320 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), 334 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
321 bh->b_data); 335 bh->b_data);
322 ext3_journal_dirty_metadata(handle, bh); 336 err = ext3_journal_dirty_metadata(handle, bh);
323exit_bh: 337exit_bh:
324 brelse(bh); 338 brelse(bh);
325 339
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
503 * reserved inode, and will become GDT blocks (primary and backup). 517 * reserved inode, and will become GDT blocks (primary and backup).
504 */ 518 */
505 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; 519 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
506 ext3_journal_dirty_metadata(handle, dind); 520 err = ext3_journal_dirty_metadata(handle, dind);
521 if (err)
522 goto exit_group_desc;
507 brelse(dind); 523 brelse(dind);
524 dind = NULL;
508 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 525 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
509 ext3_mark_iloc_dirty(handle, inode, &iloc); 526 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
527 if (err)
528 goto exit_group_desc;
510 memset((*primary)->b_data, 0, sb->s_blocksize); 529 memset((*primary)->b_data, 0, sb->s_blocksize);
511 ext3_journal_dirty_metadata(handle, *primary); 530 err = ext3_journal_dirty_metadata(handle, *primary);
531 if (err)
532 goto exit_group_desc;
512 533
513 o_group_desc = EXT3_SB(sb)->s_group_desc; 534 o_group_desc = EXT3_SB(sb)->s_group_desc;
514 memcpy(n_group_desc, o_group_desc, 535 memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
519 kfree(o_group_desc); 540 kfree(o_group_desc);
520 541
521 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 542 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
522 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 543 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
544 if (err)
545 goto exit_inode;
523 546
524 return 0; 547 return 0;
525 548
549exit_group_desc:
550 kfree(n_group_desc);
526exit_inode: 551exit_inode:
527 //ext3_journal_release_buffer(handle, iloc.bh); 552 //ext3_journal_release_buffer(handle, iloc.bh);
528 brelse(iloc.bh); 553 brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
706 } 731 }
707 ext3_debug("update metadata backup %#04lx\n", 732 ext3_debug("update metadata backup %#04lx\n",
708 (unsigned long)bh->b_blocknr); 733 (unsigned long)bh->b_blocknr);
709 if ((err = ext3_journal_get_write_access(handle, bh))) 734 if ((err = ext3_journal_get_write_access(handle, bh))) {
735 brelse(bh);
710 break; 736 break;
737 }
711 lock_buffer(bh); 738 lock_buffer(bh);
712 memcpy(bh->b_data, data, size); 739 memcpy(bh->b_data, data, size);
713 if (rest) 740 if (rest)
714 memset(bh->b_data + size, 0, rest); 741 memset(bh->b_data + size, 0, rest);
715 set_buffer_uptodate(bh); 742 set_buffer_uptodate(bh);
716 unlock_buffer(bh); 743 unlock_buffer(bh);
717 ext3_journal_dirty_metadata(handle, bh); 744 err = ext3_journal_dirty_metadata(handle, bh);
718 brelse(bh); 745 brelse(bh);
746 if (err)
747 break;
719 } 748 }
720 if ((err2 = ext3_journal_stop(handle)) && !err) 749 if ((err2 = ext3_journal_stop(handle)) && !err)
721 err = err2; 750 err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
922 /* Update the global fs size fields */ 951 /* Update the global fs size fields */
923 sbi->s_groups_count++; 952 sbi->s_groups_count++;
924 953
925 ext3_journal_dirty_metadata(handle, primary); 954 err = ext3_journal_dirty_metadata(handle, primary);
955 if (err)
956 goto exit_journal;
926 957
927 /* Update the reserved block counts only once the new group is 958 /* Update the reserved block counts only once the new group is
928 * active. */ 959 * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
934 percpu_counter_add(&sbi->s_freeinodes_counter, 965 percpu_counter_add(&sbi->s_freeinodes_counter,
935 EXT3_INODES_PER_GROUP(sb)); 966 EXT3_INODES_PER_GROUP(sb));
936 967
937 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 968 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
938 969
939exit_journal: 970exit_journal:
940 mutex_unlock(&sbi->s_resize_lock); 971 mutex_unlock(&sbi->s_resize_lock);
@@ -977,7 +1008,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
977 o_blocks_count = le32_to_cpu(es->s_blocks_count); 1008 o_blocks_count = le32_to_cpu(es->s_blocks_count);
978 1009
979 if (test_opt(sb, DEBUG)) 1010 if (test_opt(sb, DEBUG))
980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", 1011 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
1012 " upto "E3FSBLK" blocks\n",
981 o_blocks_count, n_blocks_count); 1013 o_blocks_count, n_blocks_count);
982 1014
983 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1015 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -985,7 +1017,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
985 1017
986 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1018 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
987 printk(KERN_ERR "EXT3-fs: filesystem on %s:" 1019 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
988 " too large to resize to %lu blocks safely\n", 1020 " too large to resize to "E3FSBLK" blocks safely\n",
989 sb->s_id, n_blocks_count); 1021 sb->s_id, n_blocks_count);
990 if (sizeof(sector_t) < 8) 1022 if (sizeof(sector_t) < 8)
991 ext3_warning(sb, __func__, 1023 ext3_warning(sb, __func__,
@@ -1063,13 +1095,19 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1063 goto exit_put; 1095 goto exit_put;
1064 } 1096 }
1065 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1097 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1066 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1098 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1067 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1099 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1068 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1100 if (err) {
1069 o_blocks_count + add); 1101 ext3_warning(sb, __func__,
1102 "error %d on journal dirty metadata", err);
1103 ext3_journal_stop(handle);
1104 goto exit_put;
1105 }
1106 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1107 o_blocks_count, o_blocks_count + add);
1070 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1108 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1071 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, 1109 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
1072 o_blocks_count + add); 1110 o_blocks_count, o_blocks_count + add);
1073 if ((err = ext3_journal_stop(handle))) 1111 if ((err = ext3_journal_stop(handle)))
1074 goto exit_put; 1112 goto exit_put;
1075 if (test_opt(sb, DEBUG)) 1113 if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4dba03c4..85c8cc8f2473 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/exportfs.h> 31#include <linux/exportfs.h>
33#include <linux/vfs.h> 32#include <linux/vfs.h>
@@ -144,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
144void ext3_msg(struct super_block *sb, const char *prefix, 143void ext3_msg(struct super_block *sb, const char *prefix,
145 const char *fmt, ...) 144 const char *fmt, ...)
146{ 145{
146 struct va_format vaf;
147 va_list args; 147 va_list args;
148 148
149 va_start(args, fmt); 149 va_start(args, fmt);
150 printk("%sEXT3-fs (%s): ", prefix, sb->s_id); 150
151 vprintk(fmt, args); 151 vaf.fmt = fmt;
152 printk("\n"); 152 vaf.va = &args;
153
154 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
155
153 va_end(args); 156 va_end(args);
154} 157}
155 158
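The three separate printk() calls (prefix, vprintk() of the caller's format, trailing newline) could interleave with messages from other CPUs; %pV prints a caller-supplied format plus va_list as one atomic line. struct va_format comes from linux/kernel.h, and the same conversion is applied to ext3_error(), ext3_abort() and ext3_warning() below. The idiom, with id standing in for sb->s_id:

struct va_format vaf;
va_list args;

va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;			/* %pV expands fmt against *vaf.va */
printk(KERN_INFO "prefix (%s): %pV\n", id, &vaf);
va_end(args);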
@@ -196,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
196 sb->s_id); 199 sb->s_id);
197} 200}
198 201
199void ext3_error (struct super_block * sb, const char * function, 202void ext3_error(struct super_block *sb, const char *function,
200 const char * fmt, ...) 203 const char *fmt, ...)
201{ 204{
205 struct va_format vaf;
202 va_list args; 206 va_list args;
203 207
204 va_start(args, fmt); 208 va_start(args, fmt);
205 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 209
206 vprintk(fmt, args); 210 vaf.fmt = fmt;
207 printk("\n"); 211 vaf.va = &args;
212
213 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
214 sb->s_id, function, &vaf);
215
208 va_end(args); 216 va_end(args);
209 217
210 ext3_handle_error(sb); 218 ext3_handle_error(sb);
@@ -275,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
275 * case we take the easy way out and panic immediately. 283 * case we take the easy way out and panic immediately.
276 */ 284 */
277 285
278void ext3_abort (struct super_block * sb, const char * function, 286void ext3_abort(struct super_block *sb, const char *function,
279 const char * fmt, ...) 287 const char *fmt, ...)
280{ 288{
289 struct va_format vaf;
281 va_list args; 290 va_list args;
282 291
283 va_start(args, fmt); 292 va_start(args, fmt);
284 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); 293
285 vprintk(fmt, args); 294 vaf.fmt = fmt;
286 printk("\n"); 295 vaf.va = &args;
296
297 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
298 sb->s_id, function, &vaf);
299
287 va_end(args); 300 va_end(args);
288 301
289 if (test_opt(sb, ERRORS_PANIC)) 302 if (test_opt(sb, ERRORS_PANIC))
@@ -301,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
301 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 314 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
302} 315}
303 316
304void ext3_warning (struct super_block * sb, const char * function, 317void ext3_warning(struct super_block *sb, const char *function,
305 const char * fmt, ...) 318 const char *fmt, ...)
306{ 319{
320 struct va_format vaf;
307 va_list args; 321 va_list args;
308 322
309 va_start(args, fmt); 323 va_start(args, fmt);
310 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", 324
311 sb->s_id, function); 325 vaf.fmt = fmt;
312 vprintk(fmt, args); 326 vaf.va = &args;
313 printk("\n"); 327
328 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
329 sb->s_id, function, &vaf);
330
314 va_end(args); 331 va_end(args);
315} 332}
316 333
@@ -347,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
347 struct block_device *bdev; 364 struct block_device *bdev;
348 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
349 366
350 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 367 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
351 if (IS_ERR(bdev)) 368 if (IS_ERR(bdev))
352 goto fail; 369 goto fail;
353 return bdev; 370 return bdev;
@@ -364,8 +381,7 @@ fail:
364 */ 381 */
365static int ext3_blkdev_put(struct block_device *bdev) 382static int ext3_blkdev_put(struct block_device *bdev)
366{ 383{
367 bd_release(bdev); 384 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
368 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
369} 385}
370 386
371static int ext3_blkdev_remove(struct ext3_sb_info *sbi) 387static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
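open_by_devnum() plus a separate bd_claim() is replaced by blkdev_get_by_dev() with FMODE_EXCL, which opens and claims the device in one step; the last argument is the holder cookie (here the super_block) that identifies the claim. blkdev_put() with FMODE_EXCL drops the claim as well, which is why bd_release() disappears, both here and in the ext3_get_dev_journal() hunk further down. Assumed signature:

struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
				       void *holder);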
@@ -411,9 +427,6 @@ static void ext3_put_super (struct super_block * sb)
411 int i, err; 427 int i, err;
412 428
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 429 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
415 lock_kernel();
416
417 ext3_xattr_put_super(sb); 430 ext3_xattr_put_super(sb);
418 err = journal_destroy(sbi->s_journal); 431 err = journal_destroy(sbi->s_journal);
419 sbi->s_journal = NULL; 432 sbi->s_journal = NULL;
@@ -462,8 +475,6 @@ static void ext3_put_super (struct super_block * sb)
462 sb->s_fs_info = NULL; 475 sb->s_fs_info = NULL;
463 kfree(sbi->s_blockgroup_lock); 476 kfree(sbi->s_blockgroup_lock);
464 kfree(sbi); 477 kfree(sbi);
465
466 unlock_kernel();
467} 478}
468 479
469static struct kmem_cache *ext3_inode_cachep; 480static struct kmem_cache *ext3_inode_cachep;
@@ -485,6 +496,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
485 return &ei->vfs_inode; 496 return &ei->vfs_inode;
486} 497}
487 498
499static void ext3_i_callback(struct rcu_head *head)
500{
501 struct inode *inode = container_of(head, struct inode, i_rcu);
502 INIT_LIST_HEAD(&inode->i_dentry);
503 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
504}
505
488static void ext3_destroy_inode(struct inode *inode) 506static void ext3_destroy_inode(struct inode *inode)
489{ 507{
490 if (!list_empty(&(EXT3_I(inode)->i_orphan))) { 508 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -495,7 +513,7 @@ static void ext3_destroy_inode(struct inode *inode)
495 false); 513 false);
496 dump_stack(); 514 dump_stack();
497 } 515 }
498 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 516 call_rcu(&inode->i_rcu, ext3_i_callback);
499} 517}
500 518
501static void init_once(void *foo) 519static void init_once(void *foo)
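Freeing the inode through call_rcu() is required by the RCU path-walk series elsewhere in this tree: lockless lookups may still dereference an inode after its last reference drops, so the memory must survive a grace period. Every filesystem in this diff grows the same callback; a sketch with placeholder foo_* names:

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* lockless walkers may
						   still traverse the list */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, foo_i_callback);
}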
@@ -736,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
736static int ext3_mark_dquot_dirty(struct dquot *dquot); 754static int ext3_mark_dquot_dirty(struct dquot *dquot);
737static int ext3_write_info(struct super_block *sb, int type); 755static int ext3_write_info(struct super_block *sb, int type);
738static int ext3_quota_on(struct super_block *sb, int type, int format_id, 756static int ext3_quota_on(struct super_block *sb, int type, int format_id,
739 char *path); 757 struct path *path);
740static int ext3_quota_on_mount(struct super_block *sb, int type); 758static int ext3_quota_on_mount(struct super_block *sb, int type);
741static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 759static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
742 size_t len, loff_t off); 760 size_t len, loff_t off);
@@ -1306,9 +1324,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1306 ext3_msg(sb, KERN_WARNING, 1324 ext3_msg(sb, KERN_WARNING,
1307 "warning: mounting fs with errors, " 1325 "warning: mounting fs with errors, "
1308 "running e2fsck is recommended"); 1326 "running e2fsck is recommended");
1309 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1327 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1310 le16_to_cpu(es->s_mnt_count) >= 1328 le16_to_cpu(es->s_mnt_count) >=
1311 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1329 le16_to_cpu(es->s_max_mnt_count))
1312 ext3_msg(sb, KERN_WARNING, 1330 ext3_msg(sb, KERN_WARNING,
1313 "warning: maximal mount count reached, " 1331 "warning: maximal mount count reached, "
1314 "running e2fsck is recommended"); 1332 "running e2fsck is recommended");
@@ -1325,7 +1343,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1325 valid forever! :) */ 1343 valid forever! :) */
1326 es->s_state &= cpu_to_le16(~EXT3_VALID_FS); 1344 es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
1327#endif 1345#endif
1328 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1346 if (!le16_to_cpu(es->s_max_mnt_count))
1329 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); 1347 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1330 le16_add_cpu(&es->s_mnt_count, 1); 1348 le16_add_cpu(&es->s_mnt_count, 1);
1331 es->s_mtime = cpu_to_le32(get_seconds()); 1349 es->s_mtime = cpu_to_le32(get_seconds());
@@ -1627,8 +1645,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1627 sbi->s_resgid = EXT3_DEF_RESGID; 1645 sbi->s_resgid = EXT3_DEF_RESGID;
1628 sbi->s_sb_block = sb_block; 1646 sbi->s_sb_block = sb_block;
1629 1647
1630 unlock_kernel();
1631
1632 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1648 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1633 if (!blocksize) { 1649 if (!blocksize) {
1634 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); 1650 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1654,7 +1670,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1654 * Note: s_es must be initialized as soon as possible because 1670 * Note: s_es must be initialized as soon as possible because
1655 * some ext3 macro-instructions depend on its value 1671 * some ext3 macro-instructions depend on its value
1656 */ 1672 */
1657 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 1673 es = (struct ext3_super_block *) (bh->b_data + offset);
1658 sbi->s_es = es; 1674 sbi->s_es = es;
1659 sb->s_magic = le16_to_cpu(es->s_magic); 1675 sb->s_magic = le16_to_cpu(es->s_magic);
1660 if (sb->s_magic != EXT3_SUPER_MAGIC) 1676 if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1765,7 +1781,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1765 "error: can't read superblock on 2nd try"); 1781 "error: can't read superblock on 2nd try");
1766 goto failed_mount; 1782 goto failed_mount;
1767 } 1783 }
1768 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1784 es = (struct ext3_super_block *)(bh->b_data + offset);
1769 sbi->s_es = es; 1785 sbi->s_es = es;
1770 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1786 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1771 ext3_msg(sb, KERN_ERR, 1787 ext3_msg(sb, KERN_ERR,
@@ -1849,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1849 goto failed_mount; 1865 goto failed_mount;
1850 } 1866 }
1851 1867
1852 if (le32_to_cpu(es->s_blocks_count) > 1868 err = generic_check_addressable(sb->s_blocksize_bits,
1853 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1869 le32_to_cpu(es->s_blocks_count));
1870 if (err) {
1854 ext3_msg(sb, KERN_ERR, 1871 ext3_msg(sb, KERN_ERR,
1855 "error: filesystem is too large to mount safely"); 1872 "error: filesystem is too large to mount safely");
1856 if (sizeof(sector_t) < 8) 1873 if (sizeof(sector_t) < 8)
1857 ext3_msg(sb, KERN_ERR, 1874 ext3_msg(sb, KERN_ERR,
1858 "error: CONFIG_LBDAF not enabled"); 1875 "error: CONFIG_LBDAF not enabled");
1876 ret = err;
1859 goto failed_mount; 1877 goto failed_mount;
1860 } 1878 }
1861 1879
@@ -1864,13 +1882,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1864 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - 1882 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1865 le32_to_cpu(es->s_first_data_block) - 1) 1883 le32_to_cpu(es->s_first_data_block) - 1)
1866 / EXT3_BLOCKS_PER_GROUP(sb)) + 1; 1884 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1867 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / 1885 db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
1868 EXT3_DESC_PER_BLOCK(sb);
1869 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1886 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1870 GFP_KERNEL); 1887 GFP_KERNEL);
1871 if (sbi->s_group_desc == NULL) { 1888 if (sbi->s_group_desc == NULL) {
1872 ext3_msg(sb, KERN_ERR, 1889 ext3_msg(sb, KERN_ERR,
1873 "error: not enough memory"); 1890 "error: not enough memory");
1891 ret = -ENOMEM;
1874 goto failed_mount; 1892 goto failed_mount;
1875 } 1893 }
1876 1894
@@ -1958,6 +1976,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1958 } 1976 }
1959 if (err) { 1977 if (err) {
1960 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 1978 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1979 ret = err;
1961 goto failed_mount3; 1980 goto failed_mount3;
1962 } 1981 }
1963 1982
@@ -2025,7 +2044,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2025 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2044 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2026 "writeback"); 2045 "writeback");
2027 2046
2028 lock_kernel();
2029 return 0; 2047 return 0;
2030 2048
2031cantfind_ext3: 2049cantfind_ext3:
@@ -2055,7 +2073,6 @@ out_fail:
2055 sb->s_fs_info = NULL; 2073 sb->s_fs_info = NULL;
2056 kfree(sbi->s_blockgroup_lock); 2074 kfree(sbi->s_blockgroup_lock);
2057 kfree(sbi); 2075 kfree(sbi);
2058 lock_kernel();
2059 return ret; 2076 return ret;
2060} 2077}
2061 2078
@@ -2144,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2144 if (bdev == NULL) 2161 if (bdev == NULL)
2145 return NULL; 2162 return NULL;
2146 2163
2147 if (bd_claim(bdev, sb)) {
2148 ext3_msg(sb, KERN_ERR,
2149 "error: failed to claim external journal device");
2150 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2151 return NULL;
2152 }
2153
2154 blocksize = sb->s_blocksize; 2164 blocksize = sb->s_blocksize;
2155 hblock = bdev_logical_block_size(bdev); 2165 hblock = bdev_logical_block_size(bdev);
2156 if (blocksize < hblock) { 2166 if (blocksize < hblock) {
@@ -2168,7 +2178,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2168 goto out_bdev; 2178 goto out_bdev;
2169 } 2179 }
2170 2180
2171 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 2181 es = (struct ext3_super_block *) (bh->b_data + offset);
2172 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || 2182 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2173 !(le32_to_cpu(es->s_feature_incompat) & 2183 !(le32_to_cpu(es->s_feature_incompat) &
2174 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { 2184 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2299,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb,
2299 EXT3_SB(sb)->s_journal = journal; 2309 EXT3_SB(sb)->s_journal = journal;
2300 ext3_clear_journal_err(sb, es); 2310 ext3_clear_journal_err(sb, es);
2301 2311
2302 if (journal_devnum && 2312 if (!really_read_only && journal_devnum &&
2303 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2313 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2304 es->s_journal_dev = cpu_to_le32(journal_devnum); 2314 es->s_journal_dev = cpu_to_le32(journal_devnum);
2305 2315
@@ -2361,6 +2371,21 @@ static int ext3_commit_super(struct super_block *sb,
2361 2371
2362 if (!sbh) 2372 if (!sbh)
2363 return error; 2373 return error;
2374
2375 if (buffer_write_io_error(sbh)) {
2376 /*
2377 * Oh, dear. A previous attempt to write the
2378 * superblock failed. This could happen because the
2379 * USB device was yanked out. Or it could happen to
2380 * be a transient write error and maybe the block will
2381 * be remapped. Nothing we can do but to retry the
2382 * write and hope for the best.
2383 */
2384 ext3_msg(sb, KERN_ERR, "previous I/O error to "
2385 "superblock detected");
2386 clear_buffer_write_io_error(sbh);
2387 set_buffer_uptodate(sbh);
2388 }
2364 /* 2389 /*
2365 * If the file system is mounted read-only, don't update the 2390 * If the file system is mounted read-only, don't update the
2366 * superblock write time. This avoids updating the superblock 2391 * superblock write time. This avoids updating the superblock
@@ -2377,8 +2402,15 @@ static int ext3_commit_super(struct super_block *sb,
2377 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2402 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2378 BUFFER_TRACE(sbh, "marking dirty"); 2403 BUFFER_TRACE(sbh, "marking dirty");
2379 mark_buffer_dirty(sbh); 2404 mark_buffer_dirty(sbh);
2380 if (sync) 2405 if (sync) {
2381 error = sync_dirty_buffer(sbh); 2406 error = sync_dirty_buffer(sbh);
2407 if (buffer_write_io_error(sbh)) {
2408 ext3_msg(sb, KERN_ERR, "I/O error while writing "
2409 "superblock");
2410 clear_buffer_write_io_error(sbh);
2411 set_buffer_uptodate(sbh);
2412 }
2413 }
2382 return error; 2414 return error;
2383} 2415}
2384 2416
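A failed write leaves the buffer with the write_io_error bit set and the uptodate bit cleared, and both are sticky until something clears them; without this dance a later write of the superblock buffer could be refused or could re-read stale data. The recovery step, distilled:

if (buffer_write_io_error(sbh)) {
	/* a previous write failed; reset the buffer so we can retry */
	clear_buffer_write_io_error(sbh);
	set_buffer_uptodate(sbh);
}
mark_buffer_dirty(sbh);
if (sync)
	error = sync_dirty_buffer(sbh);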
@@ -2538,8 +2570,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2538 int i; 2570 int i;
2539#endif 2571#endif
2540 2572
2541 lock_kernel();
2542
2543 /* Store the original options */ 2573 /* Store the original options */
2544 lock_super(sb); 2574 lock_super(sb);
2545 old_sb_flags = sb->s_flags; 2575 old_sb_flags = sb->s_flags;
@@ -2648,7 +2678,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2648 kfree(old_opts.s_qf_names[i]); 2678 kfree(old_opts.s_qf_names[i]);
2649#endif 2679#endif
2650 unlock_super(sb); 2680 unlock_super(sb);
2651 unlock_kernel();
2652 2681
2653 if (enable_quota) 2682 if (enable_quota)
2654 dquot_resume(sb, -1); 2683 dquot_resume(sb, -1);
@@ -2669,7 +2698,6 @@ restore_opts:
2669 } 2698 }
2670#endif 2699#endif
2671 unlock_super(sb); 2700 unlock_super(sb);
2672 unlock_kernel();
2673 return err; 2701 return err;
2674} 2702}
2675 2703
@@ -2849,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
2849 * Standard function to be called on quota_on 2877 * Standard function to be called on quota_on
2850 */ 2878 */
2851static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2879static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2852 char *name) 2880 struct path *path)
2853{ 2881{
2854 int err; 2882 int err;
2855 struct path path;
2856 2883
2857 if (!test_opt(sb, QUOTA)) 2884 if (!test_opt(sb, QUOTA))
2858 return -EINVAL; 2885 return -EINVAL;
2859 2886
2860 err = kern_path(name, LOOKUP_FOLLOW, &path);
2861 if (err)
2862 return err;
2863
2864 /* Quotafile not on the same filesystem? */ 2887 /* Quotafile not on the same filesystem? */
2865 if (path.mnt->mnt_sb != sb) { 2888 if (path->mnt->mnt_sb != sb)
2866 path_put(&path);
2867 return -EXDEV; 2889 return -EXDEV;
2868 }
2869 /* Journaling quota? */ 2890 /* Journaling quota? */
2870 if (EXT3_SB(sb)->s_qf_names[type]) { 2891 if (EXT3_SB(sb)->s_qf_names[type]) {
2871 /* Quotafile not of fs root? */ 2892 /* Quotafile not of fs root? */
2872 if (path.dentry->d_parent != sb->s_root) 2893 if (path->dentry->d_parent != sb->s_root)
2873 ext3_msg(sb, KERN_WARNING, 2894 ext3_msg(sb, KERN_WARNING,
2874 "warning: Quota file not on filesystem root. " 2895 "warning: Quota file not on filesystem root. "
2875 "Journaled quota will not work."); 2896 "Journaled quota will not work.");
@@ -2879,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2879 * When we journal data on quota file, we have to flush journal to see 2900 * When we journal data on quota file, we have to flush journal to see
2880 * all updates to the file when we bypass pagecache... 2901 * all updates to the file when we bypass pagecache...
2881 */ 2902 */
2882 if (ext3_should_journal_data(path.dentry->d_inode)) { 2903 if (ext3_should_journal_data(path->dentry->d_inode)) {
2883 /* 2904 /*
2884 * We don't need to lock updates but journal_flush() could 2905 * We don't need to lock updates but journal_flush() could
2885 * otherwise be livelocked... 2906 * otherwise be livelocked...
@@ -2887,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2887 journal_lock_updates(EXT3_SB(sb)->s_journal); 2908 journal_lock_updates(EXT3_SB(sb)->s_journal);
2888 err = journal_flush(EXT3_SB(sb)->s_journal); 2909 err = journal_flush(EXT3_SB(sb)->s_journal);
2889 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2910 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2890 if (err) { 2911 if (err)
2891 path_put(&path);
2892 return err; 2912 return err;
2893 }
2894 } 2913 }
2895 2914
2896 err = dquot_quota_on_path(sb, type, format_id, &path); 2915 return dquot_quota_on(sb, type, format_id, path);
2897 path_put(&path);
2898 return err;
2899} 2916}
2900 2917
2901/* Read data from quotafile - avoid pagecache and such because we cannot afford 2918/* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -3010,16 +3027,16 @@ out:
3010 3027
3011#endif 3028#endif
3012 3029
3013static int ext3_get_sb(struct file_system_type *fs_type, 3030static struct dentry *ext3_mount(struct file_system_type *fs_type,
3014 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3031 int flags, const char *dev_name, void *data)
3015{ 3032{
3016 return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); 3033 return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
3017} 3034}
3018 3035
3019static struct file_system_type ext3_fs_type = { 3036static struct file_system_type ext3_fs_type = {
3020 .owner = THIS_MODULE, 3037 .owner = THIS_MODULE,
3021 .name = "ext3", 3038 .name = "ext3",
3022 .get_sb = ext3_get_sb, 3039 .mount = ext3_mount,
3023 .kill_sb = kill_block_super, 3040 .kill_sb = kill_block_super,
3024 .fs_flags = FS_REQUIRES_DEV, 3041 .fs_flags = FS_REQUIRES_DEV,
3025}; 3042};
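.get_sb/get_sb_bdev() is converted to the new .mount/mount_bdev() API, which returns the root dentry directly instead of filling in a caller-supplied vfsmount. The helper signature assumed by this conversion (fs/super.c):

struct dentry *mount_bdev(struct file_system_type *fs_type, int flags,
			  const char *dev_name, void *data,
			  int (*fill_super)(struct super_block *,
					    void *, int));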
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa89..32e6cc23bd9a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
925/* 925/*
926 * ext3_xattr_set_handle() 926 * ext3_xattr_set_handle()
927 * 927 *
928 * Create, replace or remove an extended attribute for this inode. Buffer 928 * Create, replace or remove an extended attribute for this inode. Value
929 * is NULL to remove an existing extended attribute, and non-NULL to 929 * is NULL to remove an existing extended attribute, and non-NULL to
930 * either replace an existing extended attribute, or create a new extended 930 * either replace an existing extended attribute, or create a new extended
931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..c947e36eda6c 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_EXT4_FS) += ext4.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
10 10
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ead..e0270d1f8d82 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
238} 238}
239 239
240int 240int
241ext4_check_acl(struct inode *inode, int mask) 241ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
242{ 242{
243 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 243 struct posix_acl *acl;
244
245 if (flags & IPERM_FLAG_RCU) {
246 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
247 return -ECHILD;
248 return -EAGAIN;
249 }
244 250
251 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
245 if (IS_ERR(acl)) 252 if (IS_ERR(acl))
246 return PTR_ERR(acl); 253 return PTR_ERR(acl);
247 if (acl) { 254 if (acl) {
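With IPERM_FLAG_RCU set, the permission check runs under rcu_read_lock() and may not block, so ext4_get_acl(), which can read from disk, is off limits. Reading the two early returns: if the ACL cache cannot prove the inode has no ACL, -ECHILD tells the VFS to retry in reference-walk mode; if it can, -EAGAIN tells generic_permission() to fall back to the ordinary mode bits. As inferred from the call sites:

if (flags & IPERM_FLAG_RCU) {
	if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
		return -ECHILD;	/* may need I/O: drop out of RCU walk */
	return -EAGAIN;		/* provably no ACL: use i_mode bits */
}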
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac4..dec821168fd4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_check_acl(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int, unsigned int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..adf96b822781 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
171 * less than the blocksize * 8 ( which is the size 171 * less than the blocksize * 8 ( which is the size
172 * of bitmap ), set rest of the block bitmap to 1 172 * of bitmap ), set rest of the block bitmap to 1
173 */ 173 */
174 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); 174 ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
175 bh->b_data);
175 } 176 }
176 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); 177 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
177} 178}
@@ -489,7 +490,7 @@ error_return:
489 * Check if filesystem has nblocks free & available for allocation. 490 * Check if filesystem has nblocks free & available for allocation.
490 * On success return 1, return 0 on failure. 491 * On success return 1, return 0 on failure.
491 */ 492 */
492int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) 493static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
493{ 494{
494 s64 free_blocks, dirty_blocks, root_blocks; 495 s64 free_blocks, dirty_blocks, root_blocks;
495 struct percpu_counter *fbc = &sbi->s_freeblocks_counter; 496 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -591,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 * Account for the allocated meta blocks. We will never 592 * Account for the allocated meta blocks. We will never
592 * fail EDQUOT for metdata, but we do account for it. 593 * fail EDQUOT for metdata, but we do account for it.
593 */ 594 */
594 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 595 if (!(*errp) &&
596 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
596 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 598 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
29 29
30static struct kmem_cache *ext4_system_zone_cachep; 30static struct kmem_cache *ext4_system_zone_cachep;
31 31
32int __init init_ext4_system_zone(void) 32int __init ext4_init_system_zone(void)
33{ 33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL) 35 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM; 36 return -ENOMEM;
38 return 0; 37 return 0;
39} 38}
40 39
41void exit_ext4_system_zone(void) 40void ext4_exit_system_zone(void)
42{ 41{
43 kmem_cache_destroy(ext4_system_zone_cachep); 42 kmem_cache_destroy(ext4_system_zone_cachep);
44} 43}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
39 struct file *filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = ext4_llseek,
43 .read = generic_read_dir, 43 .read = generic_read_dir,
44 .readdir = ext4_readdir, /* we take BKL. needed?*/ 44 .readdir = ext4_readdir, /* we take BKL. needed?*/
45 .unlocked_ioctl = ext4_ioctl, 45 .unlocked_ioctl = ext4_ioctl,
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
60 return (ext4_filetype_table[filetype]); 60 return (ext4_filetype_table[filetype]);
61} 61}
62 62
63 63/*
64 * Return 0 if the directory entry is OK, and 1 if there is a problem
65 *
66 * Note: this is the opposite of what ext2 and ext3 historically returned...
67 */
64int __ext4_check_dir_entry(const char *function, unsigned int line, 68int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct inode *dir, 69 struct inode *dir, struct file *filp,
66 struct ext4_dir_entry_2 *de, 70 struct ext4_dir_entry_2 *de,
67 struct buffer_head *bh, 71 struct buffer_head *bh,
68 unsigned int offset) 72 unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
71 const int rlen = ext4_rec_len_from_disk(de->rec_len, 75 const int rlen = ext4_rec_len_from_disk(de->rec_len,
72 dir->i_sb->s_blocksize); 76 dir->i_sb->s_blocksize);
73 77
74 if (rlen < EXT4_DIR_REC_LEN(1)) 78 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
75 error_msg = "rec_len is smaller than minimal"; 79 error_msg = "rec_len is smaller than minimal";
76 else if (rlen % 4 != 0) 80 else if (unlikely(rlen % 4 != 0))
77 error_msg = "rec_len % 4 != 0"; 81 error_msg = "rec_len % 4 != 0";
78 else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) 82 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
79 error_msg = "rec_len is too small for name_len"; 83 error_msg = "rec_len is too small for name_len";
80 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 84 else if (unlikely(((char *) de - bh->b_data) + rlen >
85 dir->i_sb->s_blocksize))
81 error_msg = "directory entry across blocks"; 86 error_msg = "directory entry across blocks";
82 else if (le32_to_cpu(de->inode) > 87 else if (unlikely(le32_to_cpu(de->inode) >
83 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) 88 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
84 error_msg = "inode out of bounds"; 89 error_msg = "inode out of bounds";
90 else
91 return 0;
85 92
86 if (error_msg != NULL) 93 if (filp)
87 ext4_error_inode(dir, function, line, bh->b_blocknr, 94 ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
88 "bad entry in directory: %s - " 95 "bad entry in directory: %s - offset=%u(%u), "
89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 96 "inode=%u, rec_len=%d, name_len=%d",
90 error_msg, (unsigned) (offset%bh->b_size), offset, 97 error_msg, (unsigned) (offset%bh->b_size),
91 le32_to_cpu(de->inode), 98 offset, le32_to_cpu(de->inode),
92 rlen, de->name_len); 99 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 100 else
101 ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
102 "bad entry in directory: %s - offset=%u(%u), "
103 "inode=%u, rec_len=%d, name_len=%d",
104 error_msg, (unsigned) (offset%bh->b_size),
105 offset, le32_to_cpu(de->inode),
106 rlen, de->name_len);
107
108 return 1;
94} 109}
95 110
96static int ext4_readdir(struct file *filp, 111static int ext4_readdir(struct file *filp,
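__ext4_check_dir_entry() now returns 0 for a good entry and 1 for a bad one, the opposite of the historical ext2/ext3 convention (the comment added above says as much), so every caller flips its test; the ext4_readdir() hunk below is the first example. The new caller shape, with the recovery label left hypothetical:

if (ext4_check_dir_entry(inode, filp, de, bh, offset)) {
	/* bad entry: already logged via ext4_error_file() or
	 * ext4_error_inode(); do not trust its rec_len */
	goto skip_to_next_block;	/* hypothetical recovery label */
}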
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
152 */ 167 */
153 if (!bh) { 168 if (!bh) {
154 if (!dir_has_error) { 169 if (!dir_has_error) {
155 EXT4_ERROR_INODE(inode, "directory " 170 EXT4_ERROR_FILE(filp, 0,
156 "contains a hole at offset %Lu", 171 "directory contains a "
172 "hole at offset %llu",
157 (unsigned long long) filp->f_pos); 173 (unsigned long long) filp->f_pos);
158 dir_has_error = 1; 174 dir_has_error = 1;
159 } 175 }
@@ -194,8 +210,8 @@ revalidate:
194 while (!error && filp->f_pos < inode->i_size 210 while (!error && filp->f_pos < inode->i_size
195 && offset < sb->s_blocksize) { 211 && offset < sb->s_blocksize) {
196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 212 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
197 if (!ext4_check_dir_entry(inode, de, 213 if (ext4_check_dir_entry(inode, filp, de,
198 bh, offset)) { 214 bh, offset)) {
199 /* 215 /*
200 * On error, skip the f_pos to the next block 216 * On error, skip the f_pos to the next block
201 */ 217 */
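
The hunk above inverts the return convention of __ext4_check_dir_entry(): zero now means the entry is sane, nonzero means corruption, and the wrapper marks the failure path unlikely(). A minimal sketch of the resulting caller pattern (condensed from the readdir hunk above; a fragment, not a complete function):

	/* Walk the entries of one directory block; bail out at the
	 * first corrupt entry.  Nonzero now means "bad entry" -- the
	 * opposite of what ext2/ext3 returned. */
	while (offset < sb->s_blocksize) {
		struct ext4_dir_entry_2 *de =
			(struct ext4_dir_entry_2 *)(bh->b_data + offset);

		if (ext4_check_dir_entry(inode, filp, de, bh, offset))
			break;	/* readdir then skips f_pos to the next block */
		offset += ext4_rec_len_from_disk(de->rec_len,
						 sb->s_blocksize);
	}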
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..0c8d97b56f34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
 	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 
-#define EXT4_ERROR_FILE(file, fmt, a...) \
-	ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
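
With the extra block argument, EXT4_ERROR_FILE() lines up with EXT4_ERROR_INODE_BLOCK(), and ext4_error_file() can name a specific block (the printf format attribute shifts from 4,5 to 5,6 to match). Callers pass 0 when no particular block is implicated, as the readdir hunk above does:

	EXT4_ERROR_FILE(filp, 0,
			"directory contains a hole at offset %llu",
			(unsigned long long) filp->f_pos);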
@@ -168,7 +168,20 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define EXT4_IO_UNWRITTEN 0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_ERROR	0x0002
+
+struct ext4_io_page {
+	struct page	*p_page;
+	atomic_t	p_count;
+};
+
+#define MAX_IO_PAGES	128
+
 typedef struct ext4_io_end {
 	struct list_head	list;	/* per-file finished IO list */
 	struct inode		*inode;	/* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
 	struct work_struct	work;	/* data work queue */
 	struct kiocb		*iocb;	/* iocb struct for AIO */
 	int			result;	/* error value for AIO */
+	int			num_io_pages;
+	struct ext4_io_page	*pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	struct ext4_io_page	*io_page;
+	sector_t		io_next_block;
+};
+
 /*
  * Special inodes numbers
 */
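
ext4_io_page pins a page while any bio still references part of it, and ext4_io_submit carries the bio being assembled across pages so writeback can build multi-block bios. A hedged sketch of the pairing these fields suggest (io_page_cachep and the exact refcount protocol are assumptions inferred from the struct, not shown in this diff):

	/* One wrapper per page under IO; each bio segment covering the
	 * page would take a reference, and the last put ends page
	 * writeback (illustrative only). */
	struct ext4_io_page *io_page;

	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);	/* assumed cache */
	io_page->p_page = page;
	atomic_set(&io_page->p_count, 1);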
@@ -205,6 +228,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE		1024
 #define EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
@@ -537,23 +561,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
-
-/*
- * Mount options
- */
-struct ext4_mount_options {
-	unsigned long s_mount_opt;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned long s_commit_interval;
-	u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-	int s_jquota_fmt;
-	char *s_qf_names[MAXQUOTAS];
-#endif
-};
-
-/* Max physical block we can addres w/o extents */
+/* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
 /*
@@ -685,6 +693,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
 		ext4_decode_extra_time(&(inode)->xtime, \
 				       raw_inode->xtime ## _extra); \
+	else \
+		(inode)->xtime.tv_nsec = 0; \
 } while (0)
 
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
@@ -695,6 +705,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
 		ext4_decode_extra_time(&(einode)->xtime, \
 				       raw_inode->xtime ## _extra); \
+	else \
+		(einode)->xtime.tv_nsec = 0; \
 } while (0)
 
 #define i_disk_version osd1.linux1.l_i_version
@@ -726,12 +738,13 @@ do { \
 
 /*
  * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
  */
 struct ext4_ext_cache {
 	ext4_fsblk_t	ec_start;
 	ext4_lblk_t	ec_block;
 	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
 };
 
 /*
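
With ec_type gone, the cache state is encoded in the fields that remain, exactly as the new comment says: ec_len == 0 means the cache is invalid, and ec_start == 0 means it caches a gap. Hypothetical helpers spelling out that encoding (the patch open-codes these tests in extents.c further down; these names are illustrative, not from the patch):

	static inline int ext4_ext_cache_valid(const struct ext4_ext_cache *cex)
	{
		return cex->ec_len != 0;	/* zero length => no cached data */
	}

	static inline int ext4_ext_cache_is_gap(const struct ext4_ext_cache *cex)
	{
		return cex->ec_start == 0;	/* null mapping => cached gap */
	}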
@@ -750,10 +763,12 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
+	ext4_lblk_t	i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
 	unsigned long	i_state_flags;		/* Dynamic state flags */
+#endif
 	unsigned long	i_flags;
 
-	ext4_lblk_t	i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
@@ -796,7 +811,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
-	struct jbd2_inode jinode;
+	struct jbd2_inode *jinode;
 
 	struct ext4_ext_cache i_cached_extent;
 	/*
@@ -816,14 +831,12 @@ struct ext4_inode_info {
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-	sector_t i_da_metadata_calc_last_lblock;
+	ext4_lblk_t i_da_metadata_calc_last_lblock;
 	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
-	spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -832,9 +845,12 @@ struct ext4_inode_info {
 	/* completed IOs that might need unwritten extents handling */
 	struct list_head i_completed_io_list;
 	spinlock_t i_completed_io_lock;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
+	spinlock_t i_block_reservation_lock;
+
 	/*
 	 * Transactions that contain inode's metadata needed to complete
 	 * fsync and fdatasync, respectively.
@@ -885,16 +901,27 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
-#define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
-#define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
+						~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
+						EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
 					 EXT4_MOUNT_##opt)
 
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
 #define ext4_set_bit			ext2_set_bit
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
 #define ext4_clear_bit			ext2_clear_bit
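
set_opt()/clear_opt() now take the superblock rather than a bare flags word, so call sites no longer spell out the s_mount_opt member themselves, and the new *_opt2 variants address the second flags word added to ext4_sb_info below. An illustrative call site (DELALLOC is from the flag table above; the call site itself is hypothetical):

	/* before: the caller named the flags word */
	set_opt(sbi->s_mount_opt, DELALLOC);

	/* after: the macro reaches through EXT4_SB(sb) itself */
	set_opt(sb, DELALLOC);
	if (test_opt(sb, DELALLOC))
		clear_opt(sb, DELALLOC);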
@@ -1060,6 +1087,7 @@ struct ext4_sb_info {
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head **s_group_desc;
 	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -1087,7 +1115,6 @@ struct ext4_sb_info {
 	struct completion s_kobj_unregister;
 
 	/* Journaling */
-	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
@@ -1120,10 +1147,7 @@ struct ext4_sb_info {
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
 	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 
@@ -1141,7 +1165,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 
 	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
 	atomic_t s_bal_allocated;	/* in blocks */
@@ -1172,6 +1195,11 @@ struct ext4_sb_info {
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1210,24 +1238,39 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 };
 
-#define EXT4_INODE_BIT_FNS(name, field) \
+#define EXT4_INODE_BIT_FNS(name, field, offset) \
 static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
 { \
-	return test_bit(bit, &EXT4_I(inode)->i_##field); \
+	return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
 } \
 static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
 { \
-	set_bit(bit, &EXT4_I(inode)->i_##field); \
+	set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
 } \
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 { \
-	clear_bit(bit, &EXT4_I(inode)->i_##field); \
+	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
 }
 
-EXT4_INODE_BIT_FNS(flag, flags)
-EXT4_INODE_BIT_FNS(state, state_flags)
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	(ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	/* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
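
On 64-bit kernels the dynamic state bits now live in the upper half of i_flags, saving one unsigned long per inode; the new offset parameter is how the same accessors address either layout. What the 64-bit expansion of EXT4_INODE_BIT_FNS(state, flags, 32) boils down to, written out by hand for illustration:

	static inline void ext4_set_inode_state(struct inode *inode, int bit)
	{
		/* state bits occupy bits 32..63 of i_flags; the
		 * persistent EXT4_INODE_* flags keep bits 0..31 */
		set_bit(bit + 32, &EXT4_I(inode)->i_flags);
	}

This is also why the 64-bit ext4_clear_state_flags() can be a no-op: callers are expected to (re)initialize the whole i_flags word anyway.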
@@ -1533,7 +1576,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT		10
+#define EXT4_DEF_LI_MAX_START_DELAY	5
+#define EXT4_LAZYINIT_QUIT		0x0001
+#define EXT4_LAZYINIT_RUNNING		0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+
+	wait_queue_head_t	li_wait_daemon;
+	wait_queue_head_t	li_wait_task;
+	struct timer_list	li_timer;
+	struct task_struct	*li_task;
+
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
 
 /*
  * Function prototypes
@@ -1561,7 +1639,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 			ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1581,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct file *,
 				  struct ext4_dir_entry_2 *,
 				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
-	__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+					(de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1592,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1605,11 +1685,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1698,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-						ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
@@ -1657,13 +1734,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1696,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
 			     ext4_fsblk_t, const char *, ...)
 	__attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-			    const char *, ...)
-	__attribute__ ((format (printf, 4, 5)));
+			    ext4_fsblk_t, const char *, ...)
+	__attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -1960,6 +2035,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2049,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
 				 ext4_fsblk_t start_blk,
 				 unsigned int count);
@@ -1989,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			  ssize_t len);
@@ -2002,6 +2078,18 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+			       struct page *page,
+			       int len,
+			       struct writeback_control *wbc);
 
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
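
The page-io.c prototypes complete the picture: writeback queues pages into an ext4_io_submit context and flushes once at the end, so a single bio can span many blocks (gated by the EXT4_MOUNT_MBLK_IO_SUBMIT option above). A hedged sketch of the call sequence these signatures imply (the surrounding writepages loop and error handling are elided; the io_op value is illustrative):

	struct ext4_io_submit io;
	int ret;

	memset(&io, 0, sizeof(io));	/* io_bio == NULL: nothing queued yet */
	io.io_op = WRITE;		/* or WRITE_SYNC for synchronous writeback */

	/* queue each dirty page; contiguous blocks share io.io_bio */
	ret = ext4_bio_write_page(&io, page, len, wbc);

	/* flush whatever bio is still being assembled */
	ext4_io_submit(&io);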
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..2e29abb30f76 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
  * structure for external API
  */
 
-#define EXT4_EXT_CACHE_NO	0
-#define EXT4_EXT_CACHE_GAP	1
-#define EXT4_EXT_CACHE_EXTENT	2
-
 /*
  * to be called by ext4_ext_walk_space()
  * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+	EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -225,11 +221,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
 	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
 
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ex->ee_start_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ix->ei_leaf_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+					 ext4_fsblk_t pb)
+{
+	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				      0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+					 ext4_fsblk_t pb)
+{
+	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				     0xffff);
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-					 sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
+					 ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 						   int num,
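
Moving these helpers inline also makes the 48-bit split easy to see: the low 32 bits of the physical block live in ee_start_lo, the high 16 bits in ee_start_hi, and the double shift << 31 << 1 is simply << 32 performed in two steps on the widened 64-bit value. A worked example with an arbitrary block number (values illustrative):

	struct ext4_extent ex;

	ext4_ext_store_pblock(&ex, 0x123456789abULL);
	/* ex.ee_start_lo == cpu_to_le32(0x456789ab)   (pb & 0xffffffff)  */
	/* ex.ee_start_hi == cpu_to_le16(0x0123)       ((pb >> 31) >> 1)  */

	/* ext4_ext_pblock(&ex) reassembles the pieces:
	 *   0x456789ab | (((ext4_fsblk_t) 0x0123 << 31) << 1)
	 *   == 0x123456789ab */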
@@ -237,19 +282,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
 				      struct ext4_extent *ex1,
 				      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-				 struct ext4_ext_path *path,
-				 struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..d8b992e658c1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
 	if (ext4_handle_valid(handle))
-		return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+		return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
 	return 0;
 }
 
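
Since EXT4_I(inode)->jinode is now a pointer rather than an embedded struct, the jbd2_inode only needs to exist for inodes that actually order or journal data; this one-character change just passes the pointer through. A defensive variant of the same helper, with a NULL check the patch itself does not add (the allocation site lives elsewhere in this series and is not shown here):

	static inline int ext4_jbd2_file_inode(handle_t *handle,
					       struct inode *inode)
	{
		/* assumption: jinode is allocated before the first
		 * journaled data write; tolerate NULL anyway */
		if (ext4_handle_valid(handle) && EXT4_I(inode)->jinode)
			return jbd2_journal_file_inode(handle,
						       EXT4_I(inode)->jinode);
		return 0;
	}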
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..63a75810b7c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
-
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ex->ee_start_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ix->ei_leaf_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -166,10 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		struct ext4_extent *ex;
 		depth = path->p_depth;
 
-		/* try to predict block placement */
+		/*
+		 * Try to predict block placement assuming that we are
+		 * filling in a file which will eventually be
+		 * non-sparse --- i.e., in the case of libbfd writing
+		 * an ELF object sections out-of-order but in a way
+		 * that eventually results in a contiguous object or
+		 * executable file, or some database extending a table
+		 * space file.  However, this is actually somewhat
+		 * non-ideal if we are writing a sparse file such as
+		 * qemu or KVM writing a raw image file that is going
+		 * to stay fairly sparse, since it will end up
+		 * fragmenting the file system's free space.  Maybe we
+		 * should have some heuristics or some way to allow
+		 * userspace to pass a hint to file system,
+		 * especially if the latter case turns out to be
+		 * common.
+		 */
 		ex = path[depth].p_ext;
-		if (ex)
-			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+		if (ex) {
+			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+			if (block > ext_block)
+				return ext_pblk + (block - ext_block);
+			else
+				return ext_pblk - (ext_block - block);
+		}
 
 		/* it looks like index is empty;
 		 * try to find starting block from index itself */
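
Besides the longer comment, this hunk fixes a real arithmetic bug: the old one-liner computed ext_pblock(ex) + (block - ext_block) even when block sat to the left of the nearest extent, and with an unsigned ext4_lblk_t that difference wraps. Worked numbers (illustrative values):

	/* nearest extent: logical 100 -> physical 5000; goal wanted
	 * for logical block 40, which lies left of the extent */
	ext4_lblk_t block = 40, ext_block = 100;
	ext4_fsblk_t ext_pblk = 5000;

	/* old: 5000 + (ext4_lblk_t)(40 - 100) == 5000 + 4294967236 */
	/* new: block <= ext_block, so 5000 - (100 - 40) == 4940    */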
@@ -292,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
  * to allocate @blocks
  * Worse case is one block per extent
  */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int idxs, num = 0;
@@ -354,7 +328,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext);
+	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +337,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx);
+	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -463,13 +437,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 	for (k = 0; k <= l; k++, path++) {
 		if (path->p_idx) {
 			ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-				  idx_pblock(path->p_idx));
+				  ext4_idx_pblock(path->p_idx));
 		} else if (path->p_ext) {
 			ext_debug("  %d:[%d]%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
 				  ext4_ext_is_uninitialized(path->p_ext),
 				  ext4_ext_get_actual_len(path->p_ext),
-				  ext_pblock(path->p_ext));
+				  ext4_ext_pblock(path->p_ext));
 		} else
 			ext_debug("  []");
 	}
@@ -494,7 +468,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 	}
 	ext_debug("\n");
 }
@@ -545,7 +519,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
 
 	path->p_idx = l - 1;
 	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-		  idx_pblock(path->p_idx));
+		  ext4_idx_pblock(path->p_idx));
 
 #ifdef CHECK_BINSEARCH
 	{
@@ -614,7 +588,7 @@ ext4_ext_binsearch(struct inode *inode,
 	path->p_ext = l - 1;
 	ext_debug("  -> %d:%llu:[%d]%d ",
 		  le32_to_cpu(path->p_ext->ee_block),
-		  ext_pblock(path->p_ext),
+		  ext4_ext_pblock(path->p_ext),
 		  ext4_ext_is_uninitialized(path->p_ext),
 		  ext4_ext_get_actual_len(path->p_ext));
 
@@ -682,7 +656,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
 		ext4_ext_binsearch_idx(inode, path + ppos, block);
-		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
@@ -721,7 +695,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	ext4_ext_binsearch(inode, path + ppos, block);
 	/* if not an empty leaf */
 	if (path[ppos].p_ext)
-		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -739,9 +713,9 @@ err:
  * insert new index [@logical;@ptr] into the block at @curp;
  * check where to insert: before @curp or after @curp
  */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *curp,
 				int logical, ext4_fsblk_t ptr)
 {
 	struct ext4_extent_idx *ix;
 	int len, err;
@@ -917,7 +891,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext_pblock(path[depth].p_ext),
+				ext4_ext_pblock(path[depth].p_ext),
 				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
@@ -1007,7 +981,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 		ext_debug("%d: move %d:%llu in new index %llu\n", i,
 				le32_to_cpu(path[i].p_idx->ei_block),
-				idx_pblock(path[i].p_idx),
+				ext4_idx_pblock(path[i].p_idx),
 				newblock);
 		/*memmove(++fidx, path[i].p_idx++,
 			sizeof(struct ext4_extent_idx));
@@ -1146,7 +1120,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-		  idx_pblock(EXT_FIRST_INDEX(neh)));
+		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
 	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
 	err = ext4_ext_dirty(handle, inode, curp);
@@ -1232,9 +1206,9 @@ out:
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_search_left(struct inode *inode,
+				struct ext4_ext_path *path,
 			ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
@@ -1286,7 +1260,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-	*phys = ext_pblock(ex) + ee_len - 1;
+	*phys = ext4_ext_pblock(ex) + ee_len - 1;
 	return 0;
 }
 
@@ -1297,9 +1271,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
  * returns 0 at @phys
  * return value contains 0 (success) or error code
 */
-int
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_search_right(struct inode *inode,
+				 struct ext4_ext_path *path,
 			ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_extent_header *eh;
@@ -1342,7 +1316,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 		}
 	}
 	*logical = le32_to_cpu(ex->ee_block);
-	*phys = ext_pblock(ex);
+	*phys = ext4_ext_pblock(ex);
 	return 0;
 	}
 
@@ -1357,7 +1331,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 		/* next allocated block in this leaf */
 		ex++;
 		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext_pblock(ex);
+		*phys = ext4_ext_pblock(ex);
 		return 0;
 	}
 
@@ -1376,7 +1350,7 @@ got_index:
 	 * follow it and find the closest allocated
 	 * block to the right */
 	ix++;
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	while (++depth < path->p_depth) {
 		bh = sb_bread(inode->i_sb, block);
 		if (bh == NULL)
@@ -1388,7 +1362,7 @@ got_index:
 			return -EIO;
 		}
 		ix = EXT_FIRST_INDEX(eh);
-		block = idx_pblock(ix);
+		block = ext4_idx_pblock(ix);
 		put_bh(bh);
 	}
 
@@ -1402,7 +1376,7 @@ got_index:
 	}
 	ex = EXT_FIRST_EXTENT(eh);
 	*logical = le32_to_cpu(ex->ee_block);
-	*phys = ext_pblock(ex);
+	*phys = ext4_ext_pblock(ex);
 	put_bh(bh);
 	return 0;
 }
@@ -1573,7 +1547,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 		return 0;
 #endif
 
-	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
 		return 1;
 	return 0;
 }
@@ -1585,9 +1559,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
-int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge(struct inode *inode,
 			  struct ext4_ext_path *path,
 			  struct ext4_extent *ex)
 {
 	struct ext4_extent_header *eh;
 	unsigned int depth, len;
@@ -1632,9 +1606,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
 				    struct ext4_extent *newext,
 				    struct ext4_ext_path *path)
 {
 	ext4_lblk_t b1, b2;
 	unsigned int depth, len1;
@@ -1706,11 +1680,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 			  ext4_ext_is_uninitialized(newext),
 			  ext4_ext_get_actual_len(newext),
 			  le32_to_cpu(ex->ee_block),
 			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex),
+			  ext4_ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			return err;
@@ -1780,7 +1755,7 @@ has_space:
 		/* there is no extent in this leaf, create first one */
 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1769,7 @@ has_space:
 		ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1783,7 @@ has_space:
 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1794,7 @@ has_space:
 	le16_add_cpu(&eh->eh_entries, 1);
 	nearex = path[depth].p_ext;
 	nearex->ee_block = newext->ee_block;
-	ext4_ext_store_pblock(nearex, ext_pblock(newext));
+	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
 	nearex->ee_len = newext->ee_len;
 
 merge:
@@ -1845,9 +1820,9 @@ cleanup:
 	return err;
 }
 
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			ext4_lblk_t num, ext_prepare_callback func,
 			void *cbdata)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_ext_cache cbex;
@@ -1919,12 +1894,10 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			cbex.ec_block = start;
 			cbex.ec_len = end - start;
 			cbex.ec_start = 0;
-			cbex.ec_type = EXT4_EXT_CACHE_GAP;
 		} else {
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
 			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext_pblock(ex);
-			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+			cbex.ec_start = ext4_ext_pblock(ex);
 		}
 
 		if (unlikely(cbex.ec_len == 0)) {
@@ -1964,13 +1937,12 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-			__u32 len, ext4_fsblk_t start, int type)
+			__u32 len, ext4_fsblk_t start)
 {
 	struct ext4_ext_cache *cex;
 	BUG_ON(len == 0);
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
-	cex->ec_type = type;
 	cex->ec_block = block;
 	cex->ec_len = len;
 	cex->ec_start = start;
@@ -2023,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 	}
 
 	ext_debug(" -> %u:%lu\n", lblock, len);
-	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+	ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
 
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 			struct ext4_extent *ex)
 {
 	struct ext4_ext_cache *cex;
-	int ret = EXT4_EXT_CACHE_NO;
+	int ret = 0;
 
 	/*
 	 * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -2040,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2040 cex = &EXT4_I(inode)->i_cached_extent; 2015 cex = &EXT4_I(inode)->i_cached_extent;
2041 2016
2042 /* has cache valid data? */ 2017 /* has cache valid data? */
2043 if (cex->ec_type == EXT4_EXT_CACHE_NO) 2018 if (cex->ec_len == 0)
2044 goto errout; 2019 goto errout;
2045 2020
2046 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2047 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2048 if (in_range(block, cex->ec_block, cex->ec_len)) { 2021 if (in_range(block, cex->ec_block, cex->ec_len)) {
2049 ex->ee_block = cpu_to_le32(cex->ec_block); 2022 ex->ee_block = cpu_to_le32(cex->ec_block);
2050 ext4_ext_store_pblock(ex, cex->ec_start); 2023 ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2052,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2052 ext_debug("%u cached by %u:%u:%llu\n", 2025 ext_debug("%u cached by %u:%u:%llu\n",
2053 block, 2026 block,
2054 cex->ec_block, cex->ec_len, cex->ec_start); 2027 cex->ec_block, cex->ec_len, cex->ec_start);
2055 ret = cex->ec_type; 2028 ret = 1;
2056 } 2029 }
2057errout: 2030errout:
2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2031 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
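
With ec_type gone, callers decode the single-entry cache purely from the remaining fields: ec_len == 0 means the cache holds nothing, and ec_start == 0 marks a cached gap (hole) rather than a mapped extent. A minimal sketch of the decode logic, assuming a hypothetical cache_lookup() helper and a struct that mirrors ext4_ext_cache without the removed field:

	/* Sketch only: how a caller distinguishes hole/extent/miss after
	 * the ec_type removal. cache_lookup() is hypothetical. */
	struct ext_cache {
		unsigned int		ec_block;	/* first logical block */
		unsigned int		ec_len;		/* 0 => nothing cached */
		unsigned long long	ec_start;	/* 0 => cached gap */
	};

	static int cache_lookup(struct ext_cache *cex, unsigned int lblk)
	{
		if (cex->ec_len == 0)
			return -1;	/* cache invalid: fall back to a tree walk */
		if (lblk < cex->ec_block || lblk - cex->ec_block >= cex->ec_len)
			return -1;	/* block not covered by the cached range */
		if (cex->ec_start == 0)
			return 0;	/* cached gap: the block is a hole */
		return 1;		/* cached extent: the block is mapped */
	}
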
@@ -2073,7 +2046,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2073 2046
2074 /* free index block */ 2047 /* free index block */
2075 path--; 2048 path--;
2076 leaf = idx_pblock(path->p_idx); 2049 leaf = ext4_idx_pblock(path->p_idx);
2077 if (unlikely(path->p_hdr->eh_entries == 0)) { 2050 if (unlikely(path->p_hdr->eh_entries == 0)) {
2078 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2051 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2079 return -EIO; 2052 return -EIO;
@@ -2181,7 +2154,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2181 ext4_fsblk_t start; 2154 ext4_fsblk_t start;
2182 2155
2183 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2156 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2184 start = ext_pblock(ex) + ee_len - num; 2157 start = ext4_ext_pblock(ex) + ee_len - num;
2185 ext_debug("free last %u blocks starting %llu\n", num, start); 2158 ext_debug("free last %u blocks starting %llu\n", num, start);
2186 ext4_free_blocks(handle, inode, 0, start, num, flags); 2159 ext4_free_blocks(handle, inode, 0, start, num, flags);
2187 } else if (from == le32_to_cpu(ex->ee_block) 2160 } else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2283,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2310 goto out; 2283 goto out;
2311 2284
2312 ext_debug("new extent: %u:%u:%llu\n", block, num, 2285 ext_debug("new extent: %u:%u:%llu\n", block, num,
2313 ext_pblock(ex)); 2286 ext4_ext_pblock(ex));
2314 ex--; 2287 ex--;
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2288 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2289 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2394,9 @@ again:
2421 struct buffer_head *bh; 2394 struct buffer_head *bh;
2422 /* go to the next level */ 2395 /* go to the next level */
2423 ext_debug("move to level %d (block %llu)\n", 2396 ext_debug("move to level %d (block %llu)\n",
2424 i + 1, idx_pblock(path[i].p_idx)); 2397 i + 1, ext4_idx_pblock(path[i].p_idx));
2425 memset(path + i + 1, 0, sizeof(*path)); 2398 memset(path + i + 1, 0, sizeof(*path));
2426 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2399 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2427 if (!bh) { 2400 if (!bh) {
2428 /* should we reset i_size? */ 2401 /* should we reset i_size? */
2429 err = -EIO; 2402 err = -EIO;
@@ -2535,77 +2508,21 @@ void ext4_ext_release(struct super_block *sb)
2535#endif 2508#endif
2536} 2509}
2537 2510
2538static void bi_complete(struct bio *bio, int error)
2539{
2540 complete((struct completion *)bio->bi_private);
2541}
2542
2543/* FIXME!! we need to try to merge to left or right after zero-out */ 2511/* FIXME!! we need to try to merge to left or right after zero-out */
2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2512static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2545{ 2513{
2514 ext4_fsblk_t ee_pblock;
2515 unsigned int ee_len;
2546 int ret; 2516 int ret;
2547 struct bio *bio;
2548 int blkbits, blocksize;
2549 sector_t ee_pblock;
2550 struct completion event;
2551 unsigned int ee_len, len, done, offset;
2552
2553 2517
2554 blkbits = inode->i_blkbits;
2555 blocksize = inode->i_sb->s_blocksize;
2556 ee_len = ext4_ext_get_actual_len(ex); 2518 ee_len = ext4_ext_get_actual_len(ex);
2557 ee_pblock = ext_pblock(ex); 2519 ee_pblock = ext4_ext_pblock(ex);
2558
2559 /* convert ee_pblock to 512 byte sectors */
2560 ee_pblock = ee_pblock << (blkbits - 9);
2561
2562 while (ee_len > 0) {
2563
2564 if (ee_len > BIO_MAX_PAGES)
2565 len = BIO_MAX_PAGES;
2566 else
2567 len = ee_len;
2568
2569 bio = bio_alloc(GFP_NOIO, len);
2570 if (!bio)
2571 return -ENOMEM;
2572
2573 bio->bi_sector = ee_pblock;
2574 bio->bi_bdev = inode->i_sb->s_bdev;
2575
2576 done = 0;
2577 offset = 0;
2578 while (done < len) {
2579 ret = bio_add_page(bio, ZERO_PAGE(0),
2580 blocksize, offset);
2581 if (ret != blocksize) {
2582 /*
2583 * We can't add any more pages because of
2584 * hardware limitations. Start a new bio.
2585 */
2586 break;
2587 }
2588 done++;
2589 offset += blocksize;
2590 if (offset >= PAGE_CACHE_SIZE)
2591 offset = 0;
2592 }
2593 2520
2594 init_completion(&event); 2521 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2595 bio->bi_private = &event; 2522 if (ret > 0)
2596 bio->bi_end_io = bi_complete; 2523 ret = 0;
2597 submit_bio(WRITE, bio);
2598 wait_for_completion(&event);
2599 2524
2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2525 return ret;
2601 bio_put(bio);
2602 return -EIO;
2603 }
2604 bio_put(bio);
2605 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9);
2607 }
2608 return 0;
2609} 2526}
2610 2527
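
The open-coded bio loop above is replaced by a single call to the block layer's sb_issue_zeroout() helper, which zeroes a run of filesystem blocks and waits for completion. A hedged sketch of the resulting pattern (kernel context assumed; the positive-return normalization mirrors what the patch does):

	/* Sketch: zero an extent's blocks via sb_issue_zeroout() instead of
	 * building bios by hand. pblk/len would come from ext4_ext_pblock()
	 * and ext4_ext_get_actual_len() as in the patched function. */
	static int zeroout_range(struct super_block *sb,
				 ext4_fsblk_t pblk, unsigned int len)
	{
		int ret = sb_issue_zeroout(sb, pblk, len, GFP_NOFS);

		if (ret > 0)	/* normalize any positive return to success */
			ret = 0;
		return ret;
	}

Besides being shorter, this drops the manual sector conversion and BIO_MAX_PAGES chunking, since the helper handles request sizing itself.
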
2611#define EXT4_EXT_ZERO_LEN 7 2528#define EXT4_EXT_ZERO_LEN 7
@@ -2651,12 +2568,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2651 ee_block = le32_to_cpu(ex->ee_block); 2568 ee_block = le32_to_cpu(ex->ee_block);
2652 ee_len = ext4_ext_get_actual_len(ex); 2569 ee_len = ext4_ext_get_actual_len(ex);
2653 allocated = ee_len - (map->m_lblk - ee_block); 2570 allocated = ee_len - (map->m_lblk - ee_block);
2654 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2571 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2655 2572
2656 ex2 = ex; 2573 ex2 = ex;
2657 orig_ex.ee_block = ex->ee_block; 2574 orig_ex.ee_block = ex->ee_block;
2658 orig_ex.ee_len = cpu_to_le16(ee_len); 2575 orig_ex.ee_len = cpu_to_le16(ee_len);
2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2576 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2660 2577
2661 /* 2578 /*
2662 * It is safe to convert extent to initialized via explicit 2579 * It is safe to convert extent to initialized via explicit
@@ -2675,7 +2592,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2675 /* update the extent length and mark as initialized */ 2592 /* update the extent length and mark as initialized */
2676 ex->ee_block = orig_ex.ee_block; 2593 ex->ee_block = orig_ex.ee_block;
2677 ex->ee_len = orig_ex.ee_len; 2594 ex->ee_len = orig_ex.ee_len;
2678 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2595 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2679 ext4_ext_dirty(handle, inode, path + depth); 2596 ext4_ext_dirty(handle, inode, path + depth);
2680 /* zeroed the full extent */ 2597 /* zeroed the full extent */
2681 return allocated; 2598 return allocated;
@@ -2710,7 +2627,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2710 ex->ee_block = orig_ex.ee_block; 2627 ex->ee_block = orig_ex.ee_block;
2711 ex->ee_len = cpu_to_le16(ee_len - allocated); 2628 ex->ee_len = cpu_to_le16(ee_len - allocated);
2712 ext4_ext_mark_uninitialized(ex); 2629 ext4_ext_mark_uninitialized(ex);
2713 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2630 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2714 ext4_ext_dirty(handle, inode, path + depth); 2631 ext4_ext_dirty(handle, inode, path + depth);
2715 2632
2716 ex3 = &newex; 2633 ex3 = &newex;
@@ -2725,7 +2642,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2725 goto fix_extent_len; 2642 goto fix_extent_len;
2726 ex->ee_block = orig_ex.ee_block; 2643 ex->ee_block = orig_ex.ee_block;
2727 ex->ee_len = orig_ex.ee_len; 2644 ex->ee_len = orig_ex.ee_len;
2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2645 ext4_ext_store_pblock(ex,
2646 ext4_ext_pblock(&orig_ex));
2729 ext4_ext_dirty(handle, inode, path + depth); 2647 ext4_ext_dirty(handle, inode, path + depth);
2730 /* blocks available from map->m_lblk */ 2648 /* blocks available from map->m_lblk */
2731 return allocated; 2649 return allocated;
@@ -2782,7 +2700,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2782 /* update the extent length and mark as initialized */ 2700 /* update the extent length and mark as initialized */
2783 ex->ee_block = orig_ex.ee_block; 2701 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len; 2702 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2703 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2786 ext4_ext_dirty(handle, inode, path + depth); 2704 ext4_ext_dirty(handle, inode, path + depth);
2787 /* zeroed the full extent */ 2705 /* zeroed the full extent */
2788 /* blocks available from map->m_lblk */ 2706 /* blocks available from map->m_lblk */
@@ -2833,7 +2751,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2833 /* update the extent length and mark as initialized */ 2751 /* update the extent length and mark as initialized */
2834 ex->ee_block = orig_ex.ee_block; 2752 ex->ee_block = orig_ex.ee_block;
2835 ex->ee_len = orig_ex.ee_len; 2753 ex->ee_len = orig_ex.ee_len;
2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2754 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2837 ext4_ext_dirty(handle, inode, path + depth); 2755 ext4_ext_dirty(handle, inode, path + depth);
2838 /* zero out the first half */ 2756 /* zero out the first half */
2839 /* blocks available from map->m_lblk */ 2757 /* blocks available from map->m_lblk */
@@ -2902,7 +2820,7 @@ insert:
2902 /* update the extent length and mark as initialized */ 2820 /* update the extent length and mark as initialized */
2903 ex->ee_block = orig_ex.ee_block; 2821 ex->ee_block = orig_ex.ee_block;
2904 ex->ee_len = orig_ex.ee_len; 2822 ex->ee_len = orig_ex.ee_len;
2905 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2823 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2906 ext4_ext_dirty(handle, inode, path + depth); 2824 ext4_ext_dirty(handle, inode, path + depth);
2907 /* zero out the first half */ 2825 /* zero out the first half */
2908 return allocated; 2826 return allocated;
@@ -2915,7 +2833,7 @@ out:
2915fix_extent_len: 2833fix_extent_len:
2916 ex->ee_block = orig_ex.ee_block; 2834 ex->ee_block = orig_ex.ee_block;
2917 ex->ee_len = orig_ex.ee_len; 2835 ex->ee_len = orig_ex.ee_len;
2918 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2836 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2919 ext4_ext_mark_uninitialized(ex); 2837 ext4_ext_mark_uninitialized(ex);
2920 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2921 return err; 2839 return err;
@@ -2927,14 +2845,14 @@ fix_extent_len:
2927 * to an uninitialized extent. 2845 * to an uninitialized extent.
2928 * 2846 *
 2929 * Writing to an uninitialized extent may result in splitting the uninitialized 2847
 2930 * extent into multiple /intialized unintialized extents (up to three) 2848 * extent into multiple initialized/uninitialized extents (up to three)
2931 * There are three possibilities: 2849 * There are three possibilities:
2932 * a> There is no split required: Entire extent should be uninitialized 2850 * a> There is no split required: Entire extent should be uninitialized
2933 * b> Splits in two extents: Write is happening at either end of the extent 2851 * b> Splits in two extents: Write is happening at either end of the extent
 2934 * c> Splits in three extents: Someone is writing in the middle of the extent 2852
2935 * 2853 *
 2936 * One or more index blocks may be needed if the extent tree grows after 2854
 2937 * the unintialized extent split. To prevent ENOSPC from occurring at IO 2855 * the uninitialized extent split. To prevent ENOSPC from occurring at IO
 2938 * completion, we need to split the uninitialized extent before DIO submits 2856
 2939 * the IO. The uninitialized extent handled at this time will be split 2857
 2940 * into three uninitialized extents (at most). After IO completes, the part 2858
@@ -2973,12 +2891,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2973 ee_block = le32_to_cpu(ex->ee_block); 2891 ee_block = le32_to_cpu(ex->ee_block);
2974 ee_len = ext4_ext_get_actual_len(ex); 2892 ee_len = ext4_ext_get_actual_len(ex);
2975 allocated = ee_len - (map->m_lblk - ee_block); 2893 allocated = ee_len - (map->m_lblk - ee_block);
2976 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2894 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2977 2895
2978 ex2 = ex; 2896 ex2 = ex;
2979 orig_ex.ee_block = ex->ee_block; 2897 orig_ex.ee_block = ex->ee_block;
2980 orig_ex.ee_len = cpu_to_le16(ee_len); 2898 orig_ex.ee_len = cpu_to_le16(ee_len);
2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2899 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2982 2900
2983 /* 2901 /*
2984 * It is safe to convert extent to initialized via explicit 2902 * It is safe to convert extent to initialized via explicit
@@ -3027,7 +2945,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3027 /* update the extent length and mark as initialized */ 2945 /* update the extent length and mark as initialized */
3028 ex->ee_block = orig_ex.ee_block; 2946 ex->ee_block = orig_ex.ee_block;
3029 ex->ee_len = orig_ex.ee_len; 2947 ex->ee_len = orig_ex.ee_len;
3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2948 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3031 ext4_ext_dirty(handle, inode, path + depth); 2949 ext4_ext_dirty(handle, inode, path + depth);
3032 /* zeroed the full extent */ 2950 /* zeroed the full extent */
3033 /* blocks available from map->m_lblk */ 2951 /* blocks available from map->m_lblk */
@@ -3099,7 +3017,7 @@ insert:
3099 /* update the extent length and mark as initialized */ 3017 /* update the extent length and mark as initialized */
3100 ex->ee_block = orig_ex.ee_block; 3018 ex->ee_block = orig_ex.ee_block;
3101 ex->ee_len = orig_ex.ee_len; 3019 ex->ee_len = orig_ex.ee_len;
3102 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3020 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3103 ext4_ext_dirty(handle, inode, path + depth); 3021 ext4_ext_dirty(handle, inode, path + depth);
3104 /* zero out the first half */ 3022 /* zero out the first half */
3105 return allocated; 3023 return allocated;
@@ -3112,7 +3030,7 @@ out:
3112fix_extent_len: 3030fix_extent_len:
3113 ex->ee_block = orig_ex.ee_block; 3031 ex->ee_block = orig_ex.ee_block;
3114 ex->ee_len = orig_ex.ee_len; 3032 ex->ee_len = orig_ex.ee_len;
3115 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3116 ext4_ext_mark_uninitialized(ex); 3034 ext4_ext_mark_uninitialized(ex);
3117 ext4_ext_dirty(handle, inode, path + depth); 3035 ext4_ext_dirty(handle, inode, path + depth);
3118 return err; 3036 return err;
@@ -3180,6 +3098,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3180 unmap_underlying_metadata(bdev, block + i); 3098 unmap_underlying_metadata(bdev, block + i);
3181} 3099}
3182 3100
3101/*
3102 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3103 */
3104static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3105 ext4_lblk_t lblk,
3106 struct ext4_ext_path *path,
3107 unsigned int len)
3108{
3109 int i, depth;
3110 struct ext4_extent_header *eh;
3111 struct ext4_extent *ex, *last_ex;
3112
3113 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3114 return 0;
3115
3116 depth = ext_depth(inode);
3117 eh = path[depth].p_hdr;
3118 ex = path[depth].p_ext;
3119
3120 if (unlikely(!eh->eh_entries)) {
3121 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3122 "EOFBLOCKS_FL set");
3123 return -EIO;
3124 }
3125 last_ex = EXT_LAST_EXTENT(eh);
3126 /*
3127 * We should clear the EOFBLOCKS_FL flag if we are writing the
3128 * last block in the last extent in the file. We test this by
 3129 * first checking to see if the caller of
3130 * ext4_ext_get_blocks() was interested in the last block (or
3131 * a block beyond the last block) in the current extent. If
3132 * this turns out to be false, we can bail out from this
3133 * function immediately.
3134 */
3135 if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3136 ext4_ext_get_actual_len(last_ex))
3137 return 0;
3138 /*
3139 * If the caller does appear to be planning to write at or
3140 * beyond the end of the current extent, we then test to see
3141 * if the current extent is the last extent in the file, by
3142 * checking to make sure it was reached via the rightmost node
3143 * at each level of the tree.
3144 */
3145 for (i = depth-1; i >= 0; i--)
3146 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3147 return 0;
3148 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3149 return ext4_mark_inode_dirty(handle, inode);
3150}
3151
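
To make the early bail-out concrete: if the file's last extent has ee_block = 100 and an actual length of 8 (covering blocks 100..107), a map request with lblk = 90, len = 4 satisfies lblk + len < 108 and returns 0 immediately, while lblk = 104, len = 4 reaches block 108 and falls through to the rightmost-path check. A tiny standalone sketch of that arithmetic, with made-up numbers:

	#include <stdio.h>

	/* Sketch: the check_eofblocks_fl() bail-out test, hypothetical values. */
	int main(void)
	{
		unsigned int last_block = 100, last_len = 8;	/* extent 100..107 */
		unsigned int lblk = 104, len = 4;		/* write reaching 108 */

		if (lblk + len < last_block + last_len)
			printf("write ends inside the last extent: flag stays\n");
		else
			printf("write reaches the extent end: walk the path\n");
		return 0;
	}
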
3183static int 3152static int
3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3153ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3185 struct ext4_map_blocks *map, 3154 struct ext4_map_blocks *map,
@@ -3206,7 +3175,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3206 * completed 3175 * completed
3207 */ 3176 */
3208 if (io) 3177 if (io)
3209 io->flag = EXT4_IO_UNWRITTEN; 3178 io->flag = EXT4_IO_END_UNWRITTEN;
3210 else 3179 else
3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3212 if (ext4_should_dioread_nolock(inode)) 3181 if (ext4_should_dioread_nolock(inode))
@@ -3217,8 +3186,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3217 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3186 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3218 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3187 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3219 path); 3188 path);
3220 if (ret >= 0) 3189 if (ret >= 0) {
3221 ext4_update_inode_fsync_trans(handle, inode, 1); 3190 ext4_update_inode_fsync_trans(handle, inode, 1);
3191 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3192 path, map->m_len);
3193 } else
3194 err = ret;
3222 goto out2; 3195 goto out2;
3223 } 3196 }
3224 /* buffered IO case */ 3197 /* buffered IO case */
@@ -3244,8 +3217,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3244 3217
3245 /* buffered write, writepage time, convert*/ 3218 /* buffered write, writepage time, convert*/
3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3219 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3247 if (ret >= 0) 3220 if (ret >= 0) {
3248 ext4_update_inode_fsync_trans(handle, inode, 1); 3221 ext4_update_inode_fsync_trans(handle, inode, 1);
3222 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3223 map->m_len);
3224 if (err < 0)
3225 goto out2;
3226 }
3227
3249out: 3228out:
3250 if (ret <= 0) { 3229 if (ret <= 0) {
3251 err = ret; 3230 err = ret;
@@ -3292,6 +3271,7 @@ out2:
3292 } 3271 }
3293 return err ? err : allocated; 3272 return err ? err : allocated;
3294} 3273}
3274
3295/* 3275/*
3296 * Block allocation/map/preallocation routine for extents based files 3276 * Block allocation/map/preallocation routine for extents based files
3297 * 3277 *
@@ -3315,9 +3295,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3315{ 3295{
3316 struct ext4_ext_path *path = NULL; 3296 struct ext4_ext_path *path = NULL;
3317 struct ext4_extent_header *eh; 3297 struct ext4_extent_header *eh;
3318 struct ext4_extent newex, *ex, *last_ex; 3298 struct ext4_extent newex, *ex;
3319 ext4_fsblk_t newblock; 3299 ext4_fsblk_t newblock;
3320 int i, err = 0, depth, ret, cache_type; 3300 int err = 0, depth, ret;
3321 unsigned int allocated = 0; 3301 unsigned int allocated = 0;
3322 struct ext4_allocation_request ar; 3302 struct ext4_allocation_request ar;
3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3303 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3326,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3326 map->m_lblk, map->m_len, inode->i_ino); 3306 map->m_lblk, map->m_len, inode->i_ino);
3327 3307
3328 /* check in cache */ 3308 /* check in cache */
3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3309 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3330 if (cache_type) { 3310 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3331 if (cache_type == EXT4_EXT_CACHE_GAP) {
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3311 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3312 /*
3334 * block isn't allocated yet and 3313 * block isn't allocated yet and
@@ -3337,17 +3316,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3337 goto out2; 3316 goto out2;
3338 } 3317 }
3339 /* we should allocate requested block */ 3318 /* we should allocate requested block */
3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3319 } else {
3341 /* block is already allocated */ 3320 /* block is already allocated */
3342 newblock = map->m_lblk 3321 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3322 - le32_to_cpu(newex.ee_block)
3344 + ext_pblock(&newex); 3323 + ext4_ext_pblock(&newex);
3345 /* number of remaining blocks in the extent */ 3324 /* number of remaining blocks in the extent */
3346 allocated = ext4_ext_get_actual_len(&newex) - 3325 allocated = ext4_ext_get_actual_len(&newex) -
3347 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3326 (map->m_lblk - le32_to_cpu(newex.ee_block));
3348 goto out; 3327 goto out;
3349 } else {
3350 BUG();
3351 } 3328 }
3352 } 3329 }
3353 3330
@@ -3379,7 +3356,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3379 ex = path[depth].p_ext; 3356 ex = path[depth].p_ext;
3380 if (ex) { 3357 if (ex) {
3381 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3358 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3382 ext4_fsblk_t ee_start = ext_pblock(ex); 3359 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3383 unsigned short ee_len; 3360 unsigned short ee_len;
3384 3361
3385 /* 3362 /*
@@ -3398,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3398 /* Do not put uninitialized extent in the cache */ 3375 /* Do not put uninitialized extent in the cache */
3399 if (!ext4_ext_is_uninitialized(ex)) { 3376 if (!ext4_ext_is_uninitialized(ex)) {
3400 ext4_ext_put_in_cache(inode, ee_block, 3377 ext4_ext_put_in_cache(inode, ee_block,
3401 ee_len, ee_start, 3378 ee_len, ee_start);
3402 EXT4_EXT_CACHE_EXTENT);
3403 goto out; 3379 goto out;
3404 } 3380 }
3405 ret = ext4_ext_handle_uninitialized_extents(handle, 3381 ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3488,7 +3464,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3488 */ 3464 */
3489 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3490 if (io) 3466 if (io)
3491 io->flag = EXT4_IO_UNWRITTEN; 3467 io->flag = EXT4_IO_END_UNWRITTEN;
3492 else 3468 else
3493 ext4_set_inode_state(inode, 3469 ext4_set_inode_state(inode,
3494 EXT4_STATE_DIO_UNWRITTEN); 3470 EXT4_STATE_DIO_UNWRITTEN);
@@ -3497,44 +3473,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3497 map->m_flags |= EXT4_MAP_UNINIT; 3473 map->m_flags |= EXT4_MAP_UNINIT;
3498 } 3474 }
3499 3475
3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3476 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3501 if (unlikely(!eh->eh_entries)) { 3477 if (err)
3502 EXT4_ERROR_INODE(inode, 3478 goto out2;
3503 "eh->eh_entries == 0 and " 3479
3504 "EOFBLOCKS_FL set");
3505 err = -EIO;
3506 goto out2;
3507 }
3508 last_ex = EXT_LAST_EXTENT(eh);
3509 /*
3510 * If the current leaf block was reached by looking at
3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3524 }
3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3480 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3526 if (err) { 3481 if (err) {
3527 /* free data blocks we just allocated */ 3482 /* free data blocks we just allocated */
3528 /* not a good idea to call discard here directly, 3483 /* not a good idea to call discard here directly,
3529 * but otherwise we'd need to call it every free() */ 3484 * but otherwise we'd need to call it every free() */
3530 ext4_discard_preallocations(inode); 3485 ext4_discard_preallocations(inode);
3531 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3486 ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
3532 ext4_ext_get_actual_len(&newex), 0); 3487 ext4_ext_get_actual_len(&newex), 0);
3533 goto out2; 3488 goto out2;
3534 } 3489 }
3535 3490
3536 /* previous routine could use block we allocated */ 3491 /* previous routine could use block we allocated */
3537 newblock = ext_pblock(&newex); 3492 newblock = ext4_ext_pblock(&newex);
3538 allocated = ext4_ext_get_actual_len(&newex); 3493 allocated = ext4_ext_get_actual_len(&newex);
3539 if (allocated > map->m_len) 3494 if (allocated > map->m_len)
3540 allocated = map->m_len; 3495 allocated = map->m_len;
@@ -3552,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3552 * when it is _not_ an uninitialized extent. 3507 * when it is _not_ an uninitialized extent.
3553 */ 3508 */
3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3509 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3510 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3556 EXT4_EXT_CACHE_EXTENT);
3557 ext4_update_inode_fsync_trans(handle, inode, 1); 3511 ext4_update_inode_fsync_trans(handle, inode, 1);
3558 } else 3512 } else
3559 ext4_update_inode_fsync_trans(handle, inode, 0); 3513 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3581,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
3581 int err = 0; 3535 int err = 0;
3582 3536
3583 /* 3537 /*
3538 * finish any pending end_io work so we won't run the risk of
3539 * converting any truncated blocks to initialized later
3540 */
3541 ext4_flush_completed_IO(inode);
3542
3543 /*
 3584 * probably the first extent we're going to free will be the last in the block 3544
3585 */ 3545 */
3586 err = ext4_writepage_trans_blocks(inode); 3546 err = ext4_writepage_trans_blocks(inode);
@@ -3667,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
3667} 3627}
3668 3628
3669/* 3629/*
3670 * preallocate space for a file. This implements ext4's fallocate inode 3630 * preallocate space for a file. This implements ext4's fallocate file
 3671 * operation, which gets called from the sys_fallocate system call. 3631
3672 * For block-mapped files, posix_fallocate should fall back to the method 3632 * For block-mapped files, posix_fallocate should fall back to the method
3673 * of writing zeroes to the required new blocks (the same behavior which is 3633 * of writing zeroes to the required new blocks (the same behavior which is
3674 * expected for file systems which do not support fallocate() system call). 3634 * expected for file systems which do not support fallocate() system call).
3675 */ 3635 */
3676long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3636long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3677{ 3637{
3638 struct inode *inode = file->f_path.dentry->d_inode;
3678 handle_t *handle; 3639 handle_t *handle;
3679 loff_t new_size; 3640 loff_t new_size;
3680 unsigned int max_blocks; 3641 unsigned int max_blocks;
@@ -3684,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3684 struct ext4_map_blocks map; 3645 struct ext4_map_blocks map;
3685 unsigned int credits, blkbits = inode->i_blkbits; 3646 unsigned int credits, blkbits = inode->i_blkbits;
3686 3647
3648 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3649 if (mode & ~FALLOC_FL_KEEP_SIZE)
3650 return -EOPNOTSUPP;
3651
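
Moving the mode check to the top means any mode bits other than FALLOC_FL_KEEP_SIZE fail fast with -EOPNOTSUPP before a journal handle is started. From userspace, the accepted form is plain preallocation with or without the keep-size bit; a hedged sketch of a caller:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	/* Sketch: preallocate len bytes at offset without growing i_size.
	 * Assumes fd refers to an extent-mapped ext4 file opened for write. */
	static int prealloc_keep_size(int fd, off_t offset, off_t len)
	{
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, offset, len) < 0) {
			fprintf(stderr, "fallocate: %s\n", strerror(errno));
			return -1;
		}
		return 0;	/* blocks reserved; file size unchanged */
	}
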
3687 /* 3652 /*
3688 * currently supporting (pre)allocate mode for extent-based 3653 * currently supporting (pre)allocate mode for extent-based
3689 * files _only_ 3654 * files _only_
@@ -3691,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3656 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3692 return -EOPNOTSUPP; 3657 return -EOPNOTSUPP;
3693 3658
3694 /* preallocation to directories is currently not supported */
3695 if (S_ISDIR(inode->i_mode))
3696 return -ENODEV;
3697
3698 map.m_lblk = offset >> blkbits; 3659 map.m_lblk = offset >> blkbits;
3699 /* 3660 /*
3700 * We can't just convert len to max_blocks because 3661 * We can't just convert len to max_blocks because
@@ -3729,7 +3690,7 @@ retry:
3729 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3690 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3730 "returned error inode#%lu, block=%u, " 3691 "returned error inode#%lu, block=%u, "
3731 "max_blocks=%u", __func__, 3692 "max_blocks=%u", __func__,
3732 inode->i_ino, block, max_blocks); 3693 inode->i_ino, map.m_lblk, max_blocks);
3733#endif 3694#endif
3734 ext4_mark_inode_dirty(handle, inode); 3695 ext4_mark_inode_dirty(handle, inode);
3735 ret2 = ext4_journal_stop(handle); 3696 ret2 = ext4_journal_stop(handle);
@@ -3829,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3829 3790
3830 logical = (__u64)newex->ec_block << blksize_bits; 3791 logical = (__u64)newex->ec_block << blksize_bits;
3831 3792
3832 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3793 if (newex->ec_start == 0) {
3833 pgoff_t offset; 3794 pgoff_t offset;
3834 struct page *page; 3795 struct page *page;
3835 struct buffer_head *bh = NULL; 3796 struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..2e8322c8aa88 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
104{ 104{
105 struct super_block *sb = inode->i_sb; 105 struct super_block *sb = inode->i_sb;
106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
107 struct ext4_inode_info *ei = EXT4_I(inode);
107 struct vfsmount *mnt = filp->f_path.mnt; 108 struct vfsmount *mnt = filp->f_path.mnt;
108 struct path path; 109 struct path path;
109 char buf[64], *cp; 110 char buf[64], *cp;
@@ -127,11 +128,74 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
127 ext4_mark_super_dirty(sb); 128 ext4_mark_super_dirty(sb);
128 } 129 }
129 } 130 }
131 /*
132 * Set up the jbd2_inode if we are opening the inode for
133 * writing and the journal is present
134 */
135 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
136 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
137
138 spin_lock(&inode->i_lock);
139 if (!ei->jinode) {
140 if (!jinode) {
141 spin_unlock(&inode->i_lock);
142 return -ENOMEM;
143 }
144 ei->jinode = jinode;
145 jbd2_journal_init_jbd_inode(ei->jinode, inode);
146 jinode = NULL;
147 }
148 spin_unlock(&inode->i_lock);
149 if (unlikely(jinode != NULL))
150 jbd2_free_inode(jinode);
151 }
130 return dquot_file_open(inode, filp); 152 return dquot_file_open(inode, filp);
131} 153}
132 154
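
The jinode setup above is an optimistic allocate-then-install pattern: the possibly-sleeping allocation happens before i_lock is taken, the lock covers only the pointer check and install, and the loser of the race frees its unused object after unlocking. A generic sketch of the shape, with hypothetical alloc_obj()/free_obj() standing in for jbd2_alloc_inode()/jbd2_free_inode():

	/* Sketch: race-safe lazy installation of a per-inode object. */
	static int ensure_obj(struct obj **slot, spinlock_t *lock)
	{
		struct obj *new = alloc_obj(GFP_KERNEL);	/* may sleep */

		spin_lock(lock);
		if (!*slot) {
			if (!new) {
				spin_unlock(lock);
				return -ENOMEM;
			}
			*slot = new;	/* we won: publish under the lock */
			new = NULL;
		}
		spin_unlock(lock);
		if (new)
			free_obj(new);	/* another opener beat us to it */
		return 0;
	}
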
155/*
156 * ext4_llseek() copied from generic_file_llseek() to handle both
157 * block-mapped and extent-mapped maxbytes values. This should
 158 * otherwise be identical to generic_file_llseek().
159 */
160loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
161{
162 struct inode *inode = file->f_mapping->host;
163 loff_t maxbytes;
164
165 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
166 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
167 else
168 maxbytes = inode->i_sb->s_maxbytes;
169 mutex_lock(&inode->i_mutex);
170 switch (origin) {
171 case SEEK_END:
172 offset += inode->i_size;
173 break;
174 case SEEK_CUR:
175 if (offset == 0) {
176 mutex_unlock(&inode->i_mutex);
177 return file->f_pos;
178 }
179 offset += file->f_pos;
180 break;
181 }
182
183 if (offset < 0 || offset > maxbytes) {
184 mutex_unlock(&inode->i_mutex);
185 return -EINVAL;
186 }
187
188 if (offset != file->f_pos) {
189 file->f_pos = offset;
190 file->f_version = 0;
191 }
192 mutex_unlock(&inode->i_mutex);
193
194 return offset;
195}
196
133const struct file_operations ext4_file_operations = { 197const struct file_operations ext4_file_operations = {
134 .llseek = generic_file_llseek, 198 .llseek = ext4_llseek,
135 .read = do_sync_read, 199 .read = do_sync_read,
136 .write = do_sync_write, 200 .write = do_sync_write,
137 .aio_read = generic_file_aio_read, 201 .aio_read = generic_file_aio_read,
@@ -146,6 +210,7 @@ const struct file_operations ext4_file_operations = {
146 .fsync = ext4_sync_file, 210 .fsync = ext4_sync_file,
147 .splice_read = generic_file_splice_read, 211 .splice_read = generic_file_splice_read,
148 .splice_write = generic_file_splice_write, 212 .splice_write = generic_file_splice_write,
213 .fallocate = ext4_fallocate,
149}; 214};
150 215
151const struct inode_operations ext4_file_inode_operations = { 216const struct inode_operations ext4_file_inode_operations = {
@@ -159,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
159 .removexattr = generic_removexattr, 224 .removexattr = generic_removexattr,
160#endif 225#endif
161 .check_acl = ext4_check_acl, 226 .check_acl = ext4_check_acl,
162 .fallocate = ext4_fallocate,
163 .fiemap = ext4_fiemap, 227 .fiemap = ext4_fiemap,
164}; 228};
165 229
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..7829b287822a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
 69 * written is queued on a workqueue but may not get immediately
 70 * scheduled. When fsync is called, we need to ensure the
 71 * conversion is complete before fsync returns.
 72 * The inode keeps track of a list of pending/completed IO that
 73 * might need the conversion. This function walks through
 74 * the list and converts the related unwritten extents of completed IO
 75 * to written.
 76 * The function returns 0 on success, or a negative error code.
77 */
78extern int ext4_flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list);
94 /*
95 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written.
97 *
 98 * When ext4_sync_file() is called, run_queue() may already be
 99 * about to flush the work corresponding to this io structure.
 100 * It will be upset if it finds that the io structure related
 101 * to the work to be scheduled has been freed.
 102 *
 103 * Thus we need to keep the io structure valid here even after
 104 * the conversion has finished. The io structure has a flag to
 105 * avoid double conversion, from both fsync and the background
 106 * workqueue.
107 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0)
112 ret2 = ret;
113 else
114 list_del_init(&io->list);
115 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0;
118}
119
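
The loop above drops i_completed_io_lock around each conversion because ext4_end_io_nolock() can block, and it unlinks an entry only after the conversion succeeds. A compact sketch of that drop-relock idiom with a hypothetical convert() callback (bailing out on the first error rather than recording it, for brevity):

	/* Sketch: drain a spinlock-protected list when per-entry work sleeps. */
	spin_lock_irqsave(&lock, flags);
	while (!list_empty(&head)) {
		struct io_entry *e = list_entry(head.next, struct io_entry, list);

		spin_unlock_irqrestore(&lock, flags);	/* convert() may sleep */
		err = convert(e);
		spin_lock_irqsave(&lock, flags);
		if (err < 0)
			break;			/* leave the rest for the next flush */
		list_del_init(&e->list);	/* converted: unlink it */
	}
	spin_unlock_irqrestore(&lock, flags);
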
37/* 120/*
38 * If we're not journaling and this is a just-created file, we have to 121 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since 122 * sync our parent directory (if it was freshly created) since
@@ -86,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
86 if (inode->i_sb->s_flags & MS_RDONLY) 169 if (inode->i_sb->s_flags & MS_RDONLY)
87 return 0; 170 return 0;
88 171
89 ret = flush_completed_IO(inode); 172 ret = ext4_flush_completed_IO(inode);
90 if (ret < 0) 173 if (ret < 0)
91 return ret; 174 return ret;
92 175
@@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync)
128 (journal->j_fs_dev != journal->j_dev) && 211 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER)) 212 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 213 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT); 214 NULL);
132 ret = jbd2_log_wait_commit(journal, commit_tid); 215 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER) 216 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 217 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
135 BLKDEV_IFL_WAIT);
136 return ret; 218 return ret;
137} 219}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..eb9097aec6f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
50 * need to use it within a single byte (to ensure we get endianness right). 50 * need to use it within a single byte (to ensure we get endianness right).
51 * We can use memset for the rest of the bitmap as there are no other users. 51 * We can use memset for the rest of the bitmap as there are no other users.
52 */ 52 */
53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) 53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
54{ 54{
55 int i; 55 int i;
56 56
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
65} 65}
66 66
67/* Initializes an uninitialized inode bitmap */ 67/* Initializes an uninitialized inode bitmap */
68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, 68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
69 ext4_group_t block_group, 69 struct buffer_head *bh,
70 struct ext4_group_desc *gdp) 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp)
71{ 72{
72 struct ext4_sb_info *sbi = EXT4_SB(sb); 73 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 74
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
85 } 86 }
86 87
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
89 bh->b_data); 90 bh->b_data);
90 91
91 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 108 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 109 if (!desc)
109 return NULL; 110 return NULL;
111
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 112 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 113 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 114 if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
128
126 ext4_lock_group(sb, block_group); 129 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 130 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 131 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 136 return bh;
134 } 137 }
135 ext4_unlock_group(sb, block_group); 138 ext4_unlock_group(sb, block_group);
139
136 if (buffer_uptodate(bh)) { 140 if (buffer_uptodate(bh)) {
137 /* 141 /*
138 * if not uninit if bh is uptodate, 142 * if not uninit if bh is uptodate,
@@ -411,8 +415,8 @@ struct orlov_stats {
411 * for a particular block group or flex_bg. If flex_size is 1, then g 415 * for a particular block group or flex_bg. If flex_size is 1, then g
412 * is a block group number; otherwise it is flex_bg number. 416 * is a block group number; otherwise it is flex_bg number.
413 */ 417 */
414void get_orlov_stats(struct super_block *sb, ext4_group_t g, 418static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
415 int flex_size, struct orlov_stats *stats) 419 int flex_size, struct orlov_stats *stats)
416{ 420{
417 struct ext4_group_desc *desc; 421 struct ext4_group_desc *desc;
418 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; 422 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 716{
713 int free = 0, retval = 0, count; 717 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 718 struct ext4_sb_info *sbi = EXT4_SB(sb);
719 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 720 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 721
722 /*
723 * We have to be sure that new inode allocation does not race with
724 * inode table initialization, because otherwise we may end up
 725 * allocating and writing a new inode right before sb_issue_zeroout
 726 * takes place, overwriting our new inode with zeroes. So we
727 * take alloc_sem to prevent it.
728 */
729 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 730 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 731 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 732 /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 737 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 738 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 739 ext4_unlock_group(sb, group);
740 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 741 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 742 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 743 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 787err_ret:
774 ext4_unlock_group(sb, group); 788 ext4_unlock_group(sb, group);
789 up_read(&grp->alloc_sem);
775 return retval; 790 return retval;
776} 791}
777 792
@@ -1012,7 +1027,7 @@ got:
1012 inode->i_generation = sbi->s_next_generation++; 1027 inode->i_generation = sbi->s_next_generation++;
1013 spin_unlock(&sbi->s_next_gen_lock); 1028 spin_unlock(&sbi->s_next_gen_lock);
1014 1029
1015 ei->i_state_flags = 0; 1030 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
1016 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1031 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1017 1032
1018 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1033 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1220 }
1206 return count; 1221 return count;
1207} 1222}
1223
1224/*
 1225 * Zeroes the not-yet-zeroed inode table by writing zeroes through the whole
 1226 * inode table. Must be called without any spinlock held. The only place
 1227 * it is called from on an active filesystem is the ext4lazyinit
 1228 * thread, so we do not need any special locks; however, we have to prevent
 1229 * inode allocation from the current group, so we take the alloc_sem lock
 1230 * to block ext4_claim_inode until we are finished.
1231 */
1232extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1233 int barrier)
1234{
1235 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1236 struct ext4_sb_info *sbi = EXT4_SB(sb);
1237 struct ext4_group_desc *gdp = NULL;
1238 struct buffer_head *group_desc_bh;
1239 handle_t *handle;
1240 ext4_fsblk_t blk;
1241 int num, ret = 0, used_blks = 0;
1242
 1243 /* This should not happen, but check it just to be sure */
1244 if (sb->s_flags & MS_RDONLY) {
1245 ret = 1;
1246 goto out;
1247 }
1248
1249 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1250 if (!gdp)
1251 goto out;
1252
1253 /*
1254 * We do not need to lock this, because we are the only one
1255 * handling this flag.
1256 */
1257 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1258 goto out;
1259
1260 handle = ext4_journal_start_sb(sb, 1);
1261 if (IS_ERR(handle)) {
1262 ret = PTR_ERR(handle);
1263 goto out;
1264 }
1265
1266 down_write(&grp->alloc_sem);
1267 /*
 1268 * If the inode bitmap was already initialized, there may be some
 1269 * used inodes, so we need to skip the blocks with used inodes in the
1270 * inode table.
1271 */
1272 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1273 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1274 ext4_itable_unused_count(sb, gdp)),
1275 sbi->s_inodes_per_block);
1276
1277 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1278 ext4_error(sb, "Something is wrong with group %u\n"
1279 "Used itable blocks: %d"
1280 "itable unused count: %u\n",
1281 group, used_blks,
1282 ext4_itable_unused_count(sb, gdp));
1283 ret = 1;
1284 goto out;
1285 }
1286
1287 blk = ext4_inode_table(sb, gdp) + used_blks;
1288 num = sbi->s_itb_per_group - used_blks;
1289
1290 BUFFER_TRACE(group_desc_bh, "get_write_access");
1291 ret = ext4_journal_get_write_access(handle,
1292 group_desc_bh);
1293 if (ret)
1294 goto err_out;
1295
1296 /*
1297 * Skip zeroout if the inode table is full. But we set the ZEROED
1298 * flag anyway, because obviously, when it is full it does not need
1299 * further zeroing.
1300 */
1301 if (unlikely(num == 0))
1302 goto skip_zeroout;
1303
1304 ext4_debug("going to zero out inode table in group %d\n",
1305 group);
1306 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1307 if (ret < 0)
1308 goto err_out;
1309 if (barrier)
1310 blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
1311
1312skip_zeroout:
1313 ext4_lock_group(sb, group);
1314 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1315 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1316 ext4_unlock_group(sb, group);
1317
1318 BUFFER_TRACE(group_desc_bh,
1319 "call ext4_handle_dirty_metadata");
1320 ret = ext4_handle_dirty_metadata(handle, NULL,
1321 group_desc_bh);
1322
1323err_out:
1324 up_write(&grp->alloc_sem);
1325 ext4_journal_stop(handle);
1326out:
1327 return ret;
1328}
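
This function is the writer half of the alloc_sem contract introduced in ext4_claim_inode() above: zeroing takes the group's alloc_sem exclusively around sb_issue_zeroout(), while inode allocation takes it shared, so a fresh inode can never be written into a table block that is about to be overwritten with zeroes. A minimal sketch of the pairing (function bodies are hypothetical stand-ins for the real ext4 paths):

	/* Sketch: rw_semaphore pairing between lazy zeroing and allocation. */
	static DECLARE_RWSEM(alloc_sem);

	static void lazy_zero_group(void)	/* ext4lazyinit thread side */
	{
		down_write(&alloc_sem);		/* exclude all allocators */
		/* ... sb_issue_zeroout() over the unused table blocks ... */
		up_write(&alloc_sem);
	}

	static void claim_inode(void)		/* allocation side */
	{
		down_read(&alloc_sem);		/* allocators may run concurrently */
		/* ... set a bitmap bit, initialize the new inode ... */
		up_read(&alloc_sem);
	}
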
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..9f7f9e49914f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,13 +55,27 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create);
75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 79
64/* 80/*
65 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
@@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 188 handle_t *handle;
173 int err; 189 int err;
174 190
191 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 192 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 194 goto no_delete;
@@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
544} 561}
545 562
546/** 563/**
547 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
548 * of direct blocks need to be allocated for the given branch. 565 * of direct blocks need to be allocated for the given branch.
549 * 566 *
550 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
583 600
584/** 601/**
585 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
 605 * @iblock: the logical block to start allocating at
606 * @goal: preferred physical block of allocation
 586 * @indirect_blks: the number of blocks needed to allocate for indirect 607
587 * blocks 608 * blocks
588 * 609 * @blks: number of desired blocks
589 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
590 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
591 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
592 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
593 */ 616 */
594static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -703,9 +726,11 @@ failed_out:
703 726
704/** 727/**
705 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
706 * @inode: owner 730 * @inode: owner
707 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
708 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
709 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
710 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
711 * 736 *
@@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 780 * parent to disk.
756 */ 781 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
758 branch[n].bh = bh; 788 branch[n].bh = bh;
759 lock_buffer(bh); 789 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
@@ -813,6 +843,7 @@ failed:
813 843
814/** 844/**
815 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
816 * @inode: owner 847 * @inode: owner
817 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
818 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -1068,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1068 * Calculate the number of metadata blocks need to reserve 1099 * Calculate the number of metadata blocks need to reserve
1069 * to allocate a block located at @lblock 1100 * to allocate a block located at @lblock
1070 */ 1101 */
1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1102static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1072{ 1103{
1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1104 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1074 return ext4_ext_calc_metadata_amount(inode, lblock); 1105 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1207,8 +1238,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1238 break;
1208 idx++; 1239 idx++;
1209 num++; 1240 num++;
1210 if (num >= max_pages) 1241 if (num >= max_pages) {
1242 done = 1;
1211 break; 1243 break;
1244 }
1212 } 1245 }
1213 pagevec_release(&pvec); 1246 pagevec_release(&pvec);
1214 } 1247 }
@@ -1305,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1305 * avoid double accounting 1338 * avoid double accounting
1306 */ 1339 */
1307 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1340 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1308 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1341 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1309 /* 1342 /*
1310 * We need to check for EXT4 here because migrate 1343 * We need to check for EXT4 here because migrate
1311 * could have changed the inode type in between 1344 * could have changed the inode type in between
@@ -1335,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1335 ext4_da_update_reserve_space(inode, retval, 1); 1368 ext4_da_update_reserve_space(inode, retval, 1);
1336 } 1369 }
1337 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1370 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1371 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1339 1372
1340 up_write((&EXT4_I(inode)->i_data_sem)); 1373 up_write((&EXT4_I(inode)->i_data_sem));
1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1374 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
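The two hunks above replace the i_delalloc_reserved_flag field with the EXT4_STATE_DELALLOC_RESERVED bit manipulated through the inode-state helpers. A condensed sketch of the discipline, assuming only what the hunks themselves show: the bit is set and cleared strictly inside the i_data_sem write section, so allocators lower in the call path can test it safely.

down_write(&EXT4_I(inode)->i_data_sem);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
	ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
/* ... the extent or indirect block allocation runs here ... */
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
	ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
up_write(&EXT4_I(inode)->i_data_sem);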
@@ -1538,10 +1571,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1571 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1572 return 0;
1540 /* 1573 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1574 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1575 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1576 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1577 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1578 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1579 * ever write the buffer.
1547 */ 1580 */
@@ -1863,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
1863/* 1896/*
1864 * Reserve a single block located at lblock 1897 * Reserve a single block located at lblock
1865 */ 1898 */
1866static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1899static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1867{ 1900{
1868 int retries = 0; 1901 int retries = 0;
1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1902 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1995,16 +2028,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2028 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2029 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2030 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2031static int mpage_da_submit_io(struct mpage_da_data *mpd,
2032 struct ext4_map_blocks *map)
1999{ 2033{
2000 long pages_skipped;
2001 struct pagevec pvec; 2034 struct pagevec pvec;
2002 unsigned long index, end; 2035 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2036 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2037 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2038 struct address_space *mapping = inode->i_mapping;
2039 loff_t size = i_size_read(inode);
2040 unsigned int len, block_start;
2041 struct buffer_head *bh, *page_bufs = NULL;
2042 int journal_data = ext4_should_journal_data(inode);
2043 sector_t pblock = 0, cur_logical = 0;
2044 struct ext4_io_submit io_submit;
2006 2045
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2046 BUG_ON(mpd->next_page <= mpd->first_page);
2047 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2048 /*
2009 * We need to start from the first_page to the next_page - 1 2049 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2050 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2060,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2060 if (nr_pages == 0)
2021 break; 2061 break;
2022 for (i = 0; i < nr_pages; i++) { 2062 for (i = 0; i < nr_pages; i++) {
2063 int commit_write = 0, redirty_page = 0;
2023 struct page *page = pvec.pages[i]; 2064 struct page *page = pvec.pages[i];
2024 2065
2025 index = page->index; 2066 index = page->index;
2026 if (index > end) 2067 if (index > end)
2027 break; 2068 break;
2069
2070 if (index == size >> PAGE_CACHE_SHIFT)
2071 len = size & ~PAGE_CACHE_MASK;
2072 else
2073 len = PAGE_CACHE_SIZE;
2074 if (map) {
2075 cur_logical = index << (PAGE_CACHE_SHIFT -
2076 inode->i_blkbits);
2077 pblock = map->m_pblk + (cur_logical -
2078 map->m_lblk);
2079 }
2028 index++; 2080 index++;
2029 2081
2030 BUG_ON(!PageLocked(page)); 2082 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2083 BUG_ON(PageWriteback(page));
2032 2084
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2085 /*
2042 * In error case, we have to continue because 2086 * If the page does not have buffers (for
2043 * remaining pages are still locked 2087 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2088 * __block_write_begin. If this fails,
2089 * redirty the page and move on.
2045 */ 2090 */
2046 if (ret == 0) 2091 if (!page_has_buffers(page)) {
2047 ret = err; 2092 if (__block_write_begin(page, 0, len,
2048 } 2093 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2094 redirty_page:
2050 } 2095 redirty_page_for_writepage(mpd->wbc,
2051 return ret; 2096 page);
2052} 2097 unlock_page(page);
2053 2098 continue;
2054/* 2099 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2100 commit_write = 1;
2056 * 2101 }
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2102
2103 bh = page_bufs = page_buffers(page);
2104 block_start = 0;
2105 do { 2105 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2106 if (!bh)
2107 break; 2107 goto redirty_page;
2108 2108 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2109 (cur_logical <= (map->m_lblk +
2110 2110 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2111 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2112 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2113 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2114 }
2115 if (buffer_unwritten(bh) ||
2116 buffer_mapped(bh))
2117 BUG_ON(bh->b_blocknr != pblock);
2118 if (map->m_flags & EXT4_MAP_UNINIT)
2119 set_buffer_uninit(bh);
2120 clear_buffer_unwritten(bh);
2121 }
2124 2122
2125 } else if (buffer_mapped(bh)) 2123 /* redirty page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2124 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2125 redirty_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2126 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2127 block_start += bh->b_size;
2130 cur_logical++; 2128 cur_logical++;
2131 pblock++; 2129 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2130 } while (bh != page_bufs);
2131
2132 if (redirty_page)
2133 goto redirty_page;
2134
2135 if (commit_write)
2136 /* mark the buffer_heads as dirty & uptodate */
2137 block_commit_write(page, 0, len);
2138
2139 /*
2140 * Delalloc doesn't support data journalling,
2141 * but eventually maybe we'll lift this
2142 * restriction.
2143 */
2144 if (unlikely(journal_data && PageChecked(page)))
2145 err = __ext4_journalled_writepage(page, len);
2146 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2147 err = ext4_bio_write_page(&io_submit, page,
2148 len, mpd->wbc);
2149 else
2150 err = block_write_full_page(page,
2151 noalloc_get_block_write, mpd->wbc);
2152
2153 if (!err)
2154 mpd->pages_written++;
2155 /*
2156 * In error case, we have to continue because
2157 * remaining pages are still locked
2158 */
2159 if (ret == 0)
2160 ret = err;
2133 } 2161 }
2134 pagevec_release(&pvec); 2162 pagevec_release(&pvec);
2135 } 2163 }
2164 ext4_io_submit(&io_submit);
2165 return ret;
2136} 2166}
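The rewritten mpage_da_submit_io() above folds the old mpage_put_bnr_to_bhs() walk into the submission loop: while a page's buffers are traversed, any buffer that falls inside the mapped extent gets its physical block installed and its delayed/unwritten state dropped before the page is written. A reduced sketch of just that buffer walk (variable names as in the hunk; not a complete function):

struct buffer_head *head = page_buffers(page), *bh = head;

do {
	if (map && cur_logical >= map->m_lblk &&
	    cur_logical <= map->m_lblk + map->m_len - 1) {
		if (buffer_delay(bh)) {
			clear_buffer_delay(bh);
			bh->b_blocknr = pblock;	/* install the real block */
		}
		clear_buffer_unwritten(bh);
	}
	cur_logical++;
	pblock++;
} while ((bh = bh->b_this_page) != head);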
2137 2167
2138
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2168static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt) 2169 sector_t logical, long blk_cnt)
2141{ 2170{
@@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2216}
2188 2217
2189/* 2218/*
2190 * mpage_da_map_blocks - go through given space 2219 * mpage_da_map_and_submit - go through given space, map them
2220 * if necessary, and then submit them for I/O
2191 * 2221 *
2192 * @mpd - bh describing space 2222 * @mpd - bh describing space
2193 * 2223 *
2194 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2195 * 2225 *
2196 */ 2226 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2228{
2199 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2230 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2205 2235
2206 /* 2236 /*
2207 * We consider only non-mapped and non-allocated blocks 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage.
2208 */ 2239 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2240 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2244 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2245
2220 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2247 BUG_ON(!handle);
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2232 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2233 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2234 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2235 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2236 * 2262 *
2237 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
@@ -2252,17 +2278,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2278
2253 err = blks; 2279 err = blks;
2254 /* 2280 /*
2255 * If get block returns with error we simply 2281 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2282 * appears to be free blocks we will call
2257 * writepages will find the dirty page again 2283 * ext4_writepage() for all of the pages which will
2284 * just redirty the pages.
2258 */ 2285 */
2259 if (err == -EAGAIN) 2286 if (err == -EAGAIN)
2260 return 0; 2287 goto submit_io;
2261 2288
2262 if (err == -ENOSPC && 2289 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2290 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2291 mpd->retval = err;
2265 return 0; 2292 goto submit_io;
2266 } 2293 }
2267 2294
2268 /* 2295 /*
@@ -2287,10 +2314,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2287 /* invalidate all the pages */ 2314 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2315 ext4_da_block_invalidatepages(mpd, next,
2289 mpd->b_size >> mpd->inode->i_blkbits); 2316 mpd->b_size >> mpd->inode->i_blkbits);
2290 return err; 2317 return;
2291 } 2318 }
2292 BUG_ON(blks == 0); 2319 BUG_ON(blks == 0);
2293 2320
2321 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2322 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2323 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2324 int i;
@@ -2299,18 +2327,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2327 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2328 }
2301 2329
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2330 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2331 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2332 if (err)
2313 return err; 2333 /* This only happens if the journal is aborted */
2334 return;
2314 } 2335 }
2315 2336
2316 /* 2337 /*
@@ -2321,10 +2342,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2342 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2343 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2344 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2345 err = ext4_mark_inode_dirty(handle, mpd->inode);
2346 if (err)
2347 ext4_error(mpd->inode->i_sb,
2348 "Failed to mark inode %lu dirty",
2349 mpd->inode->i_ino);
2325 } 2350 }
2326 2351
2327 return 0; 2352submit_io:
2353 mpage_da_submit_io(mpd, mapp);
2354 mpd->io_done = 1;
2328} 2355}
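Structurally, the change above turns every early "return 0" of the old mpage_da_map_blocks() into a jump to a single submission point, which is why the function can become void: whatever happens during block allocation, the accumulated pages are handed to mpage_da_submit_io() (which redirties anything still unmapped) and io_done is set. A control-flow skeleton, with hypothetical predicates standing in for the real checks:

static void map_and_submit_sketch(struct mpage_da_data *mpd)
{
	struct ext4_map_blocks map, *mapp = NULL;

	if (already_mapped_or_empty(mpd))	/* hypothetical check */
		goto submit_io;
	if (allocate_blocks(mpd, &map) <= 0)	/* hypothetical; EAGAIN/ENOSPC land here */
		goto submit_io;
	mapp = &map;				/* pass the mapping down */
submit_io:
	mpage_da_submit_io(mpd, mapp);		/* writes or redirties the pages */
	mpd->io_done = 1;
}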
2329 2356
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2357#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2428,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2428 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2429 * need to flush current extent and start new one
2403 */ 2430 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2431 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2432 return;
2408} 2433}
2409 2434
@@ -2422,9 +2447,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2422 * The function finds extents of pages and scans them for all blocks. 2447 * The function finds extents of pages and scans them for all blocks.
2423 */ 2448 */
2424static int __mpage_da_writepage(struct page *page, 2449static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data) 2450 struct writeback_control *wbc,
2451 struct mpage_da_data *mpd)
2426{ 2452{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode; 2453 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head; 2454 struct buffer_head *bh, *head;
2430 sector_t logical; 2455 sector_t logical;
@@ -2435,15 +2460,13 @@ static int __mpage_da_writepage(struct page *page,
2435 if (mpd->next_page != page->index) { 2460 if (mpd->next_page != page->index) {
2436 /* 2461 /*
2437 * Nope, we can't. So, we map non-allocated blocks 2462 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage() 2463 * and start IO on them
2439 */ 2464 */
2440 if (mpd->next_page != mpd->first_page) { 2465 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0) 2466 mpage_da_map_and_submit(mpd);
2442 mpage_da_submit_io(mpd);
2443 /* 2467 /*
2444 * skip rest of the page in the page_vec 2468 * skip rest of the page in the page_vec
2445 */ 2469 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page); 2470 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page); 2471 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL; 2472 return MPAGE_DA_EXTENT_TAIL;
@@ -2550,8 +2573,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2573 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2574 return 0; /* Not sure this could or should happen */
2552 /* 2575 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2576 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2577 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2578 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2579 if (ret)
@@ -2583,7 +2605,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2605/*
2584 * This function is used as a standard get_block_t callback function 2606 * This function is used as a standard get_block_t callback function
2585 * when there is no desire to allocate any blocks. It is used as a 2607 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2608 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2609 * These functions should only try to map a single block at a time.
2588 * 2610 *
2589 * Since this function doesn't do block allocations even if the caller 2611 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2645,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2645 int ret = 0;
2624 int err; 2646 int err;
2625 2647
2648 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2649 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2650 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2651 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2700,7 +2723,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2723static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2724 struct writeback_control *wbc)
2702{ 2725{
2703 int ret = 0; 2726 int ret = 0, commit_write = 0;
2704 loff_t size; 2727 loff_t size;
2705 unsigned int len; 2728 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2729 struct buffer_head *page_bufs = NULL;
@@ -2713,71 +2736,44 @@ static int ext4_writepage(struct page *page,
2713 else 2736 else
2714 len = PAGE_CACHE_SIZE; 2737 len = PAGE_CACHE_SIZE;
2715 2738
2716 if (page_has_buffers(page)) { 2739 /*
2717 page_bufs = page_buffers(page); 2740 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2741 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2742 * fails, redirty the page and move on.
2720 /* 2743 */
2721 * We don't want to do block allocation 2744 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2745 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2746 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2747 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2748 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2749 unlock_page(page);
2730 return 0; 2750 return 0;
2731 } 2751 }
2732 } else { 2752 commit_write = 1;
2753 }
2754 page_bufs = page_buffers(page);
2755 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2756 ext4_bh_delay_or_unwritten)) {
2733 /* 2757 /*
2734 * The test for page_has_buffers() is subtle: 2758 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2759 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2760 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2761 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2762 */
2746 ret = block_prepare_write(page, 0, len, 2763 goto redirty_page;
2747 noalloc_get_block_write); 2764 }
2748 if (!ret) { 2765 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redity the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2766 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2767 block_commit_write(page, 0, len);
2769 }
2770 2768
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2769 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2770 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2771 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2772 * doesn't seem much point in redirtying the page here.
2775 */ 2773 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2774 return __ext4_journalled_writepage(page, len);
2778 }
2779 2775
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2776 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2777 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2778 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2779 wbc, ext4_end_io_buffer_write);
@@ -2824,25 +2820,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2824 */ 2820 */
2825static int write_cache_pages_da(struct address_space *mapping, 2821static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2822 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2823 struct mpage_da_data *mpd,
2824 pgoff_t *done_index)
2828{ 2825{
2829 int ret = 0; 2826 int ret = 0;
2830 int done = 0; 2827 int done = 0;
2831 struct pagevec pvec; 2828 struct pagevec pvec;
2832 int nr_pages; 2829 unsigned nr_pages;
2833 pgoff_t index; 2830 pgoff_t index;
2834 pgoff_t end; /* Inclusive */ 2831 pgoff_t end; /* Inclusive */
2835 long nr_to_write = wbc->nr_to_write; 2832 long nr_to_write = wbc->nr_to_write;
2833 int tag;
2836 2834
2837 pagevec_init(&pvec, 0); 2835 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2836 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2837 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2838
2839 if (wbc->sync_mode == WB_SYNC_ALL)
2840 tag = PAGECACHE_TAG_TOWRITE;
2841 else
2842 tag = PAGECACHE_TAG_DIRTY;
2843
2844 *done_index = index;
2841 while (!done && (index <= end)) { 2845 while (!done && (index <= end)) {
2842 int i; 2846 int i;
2843 2847
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2848 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2845 PAGECACHE_TAG_DIRTY,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2849 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2850 if (nr_pages == 0)
2848 break; 2851 break;
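The tag selection above, together with the tag_pages_for_writeback() call added at the retry: label further down, is the standard livelock-avoidance pattern for data-integrity writeback: WB_SYNC_ALL first snapshots the currently dirty pages as TOWRITE, then walks only that tag, so pages dirtied while the sync runs are left for the next pass. In outline (calls exactly as in the hunks):

int tag = (wbc->sync_mode == WB_SYNC_ALL) ? PAGECACHE_TAG_TOWRITE
					  : PAGECACHE_TAG_DIRTY;

if (wbc->sync_mode == WB_SYNC_ALL)
	tag_pages_for_writeback(mapping, index, end);	/* snapshot the dirty set */

nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);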
@@ -2862,6 +2865,8 @@ static int write_cache_pages_da(struct address_space *mapping,
2862 break; 2865 break;
2863 } 2866 }
2864 2867
2868 *done_index = page->index + 1;
2869
2865 lock_page(page); 2870 lock_page(page);
2866 2871
2867 /* 2872 /*
@@ -2947,6 +2952,8 @@ static int ext4_da_writepages(struct address_space *mapping,
2947 long desired_nr_to_write, nr_to_writebump = 0; 2952 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2953 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2954 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2955 pgoff_t done_index = 0;
2956 pgoff_t end;
2950 2957
2951 trace_ext4_da_writepages(inode, wbc); 2958 trace_ext4_da_writepages(inode, wbc);
2952 2959
@@ -2982,8 +2989,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2989 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2990 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2991 wbc->range_cyclic = 0;
2985 } else 2992 end = -1;
2993 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2994 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2995 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2996 }
2987 2997
2988 /* 2998 /*
2989 * This works around two forms of stupidity. The first is in 2999 * This works around two forms of stupidity. The first is in
@@ -3002,9 +3012,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 3012 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 3013 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 3014 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 3015 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 3016 if (wbc->nr_to_write == LONG_MAX)
3007 else 3017 desired_nr_to_write = wbc->nr_to_write;
3018 else
3019 desired_nr_to_write = wbc->nr_to_write * 8;
3020 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 3021 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 3022 max_pages);
3010 if (desired_nr_to_write > max_pages) 3023 if (desired_nr_to_write > max_pages)
@@ -3021,6 +3034,9 @@ static int ext4_da_writepages(struct address_space *mapping,
3021 pages_skipped = wbc->pages_skipped; 3034 pages_skipped = wbc->pages_skipped;
3022 3035
3023retry: 3036retry:
3037 if (wbc->sync_mode == WB_SYNC_ALL)
3038 tag_pages_for_writeback(mapping, index, end);
3039
3024 while (!ret && wbc->nr_to_write > 0) { 3040 while (!ret && wbc->nr_to_write > 0) {
3025 3041
3026 /* 3042 /*
@@ -3059,16 +3075,14 @@ retry:
3059 mpd.io_done = 0; 3075 mpd.io_done = 0;
3060 mpd.pages_written = 0; 3076 mpd.pages_written = 0;
3061 mpd.retval = 0; 3077 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd); 3078 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3063 /* 3079 /*
3064 * If we have a contiguous extent of pages and we 3080 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3081 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3082 * them for I/O.
3067 */ 3083 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3084 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3085 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3086 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3087 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3088 trace_ext4_da_write_pages(inode, &mpd);
@@ -3115,14 +3129,13 @@ retry:
3115 __func__, wbc->nr_to_write, ret); 3129 __func__, wbc->nr_to_write, ret);
3116 3130
3117 /* Update index */ 3131 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3132 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3133 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3134 /*
3122 * set the writeback_index so that range_cyclic 3135 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3136 * mode will write it back later
3124 */ 3137 */
3125 mapping->writeback_index = index; 3138 mapping->writeback_index = done_index;
3126 3139
3127out_writepages: 3140out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3141 wbc->nr_to_write -= nr_to_writebump;
@@ -3367,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
3367 * doing I/O at all. 3380 * doing I/O at all.
3368 * 3381 *
3369 * We could call write_cache_pages(), and then redirty all of 3382 * We could call write_cache_pages(), and then redirty all of
3370 * the pages by calling redirty_page_for_writeback() but that 3383 * the pages by calling redirty_page_for_writepage() but that
3371 * would be ugly in the extreme. So instead we would need to 3384 * would be ugly in the extreme. So instead we would need to
3372 * replicate parts of the code in the above functions, 3385 * replicate parts of the code in the above functions,
3373 * simplifying them because we wouldn't actually intend to 3386 * simplifying them because we wouldn't actually intend to
@@ -3457,15 +3470,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3470 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3471}
3459 3472
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3473static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3474{
3471 struct buffer_head *head, *bh; 3475 struct buffer_head *head, *bh;
@@ -3642,173 +3646,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3646 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3647}
3644 3648
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten"
3696 "extents to written extents, error is %d"
3697 " io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might need to do the conversion. This function walks through
3744 * the list and converts the related unwritten extents for completed IO
3745 * to written.
3746 * The function returns the number of pending IOs on success.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already be
3769 * about to flush the work corresponding to this io structure.
3770 * It will be upset if it finds that the io structure related
3771 * to the work to be scheduled has been freed.
3772 *
3773 * Thus we need to keep the io structure still valid here after
3774 * the conversion has finished. The io structure has a flag to
3775 * avoid double converting from both fsync and background work
3776 * queue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
3789
3790static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3649static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3650 ssize_t size, void *private, int ret,
3814 bool is_async) 3651 bool is_async)
@@ -3828,7 +3665,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3665 size);
3829 3666
3830 /* if not aio dio with unwritten extents, just free io and return */ 3667 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3668 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3669 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3670 iocb->private = NULL;
3834out: 3671out:
@@ -3845,14 +3682,14 @@ out:
3845 } 3682 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3683 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3684
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list*/ 3685 /* Add the io_end to per-inode completed aio dio list*/
3852 ei = EXT4_I(io_end->inode); 3686 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3687 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3688 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3689 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3690
3691 /* queue the work to convert unwritten extents to written */
3692 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3693 iocb->private = NULL;
3857} 3694}
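The reordering above is a visibility fix: the io_end must already be on the per-inode completed list when the worker runs, otherwise the queued work (or a racing fsync walking the list) can miss it. The required order, reduced to its essentials:

spin_lock_irqsave(&ei->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

/* Only queue the conversion work once the entry is visible. */
queue_work(wq, &io_end->work);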
3858 3695
@@ -3873,7 +3710,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3710 goto out;
3874 } 3711 }
3875 3712
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3713 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3714 inode = io_end->inode;
3878 3715
3879 /* Add the io_end to per-inode completed io list*/ 3716 /* Add the io_end to per-inode completed io list*/
@@ -3901,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3901retry: 3738retry:
3902 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3739 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3903 if (!io_end) { 3740 if (!io_end) {
3904 if (printk_ratelimit()) 3741 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3905 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3906 schedule(); 3742 schedule();
3907 goto retry; 3743 goto retry;
3908 } 3744 }
@@ -3926,9 +3762,9 @@ retry:
3926 * preallocated extents, and those write extend the file, no need to 3762 * preallocated extents, and those write extend the file, no need to
3927 * fall back to buffered IO. 3763 * fall back to buffered IO.
3928 * 3764 *
3929 * For holes, we fallocate those blocks, mark them as unintialized 3765 * For holes, we fallocate those blocks, mark them as uninitialized
3930 * If those blocks were preallocated, we make sure they are split, but 3766 * If those blocks were preallocated, we make sure they are split, but
3931 * still keep the range to write as unintialized. 3767 * still keep the range to write as uninitialized.
3932 * 3768 *
3933 * The unwritten extents will be converted to written when DIO is completed. 3769 * The unwritten extents will be converted to written when DIO is completed.
3934 * For async direct IO, since the IO may still be pending when we return, we 3770 * For async direct IO, since the IO may still be pending when we return, we
@@ -4226,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
4226 if (ext4_should_journal_data(inode)) { 4062 if (ext4_should_journal_data(inode)) {
4227 err = ext4_handle_dirty_metadata(handle, inode, bh); 4063 err = ext4_handle_dirty_metadata(handle, inode, bh);
4228 } else { 4064 } else {
4229 if (ext4_should_order_data(inode)) 4065 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4230 err = ext4_jbd2_file_inode(handle, inode); 4066 err = ext4_jbd2_file_inode(handle, inode);
4231 mark_buffer_dirty(bh); 4067 mark_buffer_dirty(bh);
4232 } 4068 }
@@ -4350,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4350{ 4186{
4351 __le32 *p; 4187 __le32 *p;
4352 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4188 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4189 int err;
4353 4190
4354 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4191 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4355 flags |= EXT4_FREE_BLOCKS_METADATA; 4192 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4365,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4365 if (try_to_extend_transaction(handle, inode)) { 4202 if (try_to_extend_transaction(handle, inode)) {
4366 if (bh) { 4203 if (bh) {
4367 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4204 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4368 ext4_handle_dirty_metadata(handle, inode, bh); 4205 err = ext4_handle_dirty_metadata(handle, inode, bh);
4206 if (unlikely(err)) {
4207 ext4_std_error(inode->i_sb, err);
4208 return 1;
4209 }
4210 }
4211 err = ext4_mark_inode_dirty(handle, inode);
4212 if (unlikely(err)) {
4213 ext4_std_error(inode->i_sb, err);
4214 return 1;
4215 }
4216 err = ext4_truncate_restart_trans(handle, inode,
4217 blocks_for_truncate(inode));
4218 if (unlikely(err)) {
4219 ext4_std_error(inode->i_sb, err);
4220 return 1;
4369 } 4221 }
4370 ext4_mark_inode_dirty(handle, inode);
4371 ext4_truncate_restart_trans(handle, inode,
4372 blocks_for_truncate(inode));
4373 if (bh) { 4222 if (bh) {
4374 BUFFER_TRACE(bh, "retaking write access"); 4223 BUFFER_TRACE(bh, "retaking write access");
4375 ext4_journal_get_write_access(handle, bh); 4224 ext4_journal_get_write_access(handle, bh);
@@ -4530,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4530 (__le32 *) bh->b_data, 4379 (__le32 *) bh->b_data,
4531 (__le32 *) bh->b_data + addr_per_block, 4380 (__le32 *) bh->b_data + addr_per_block,
4532 depth); 4381 depth);
4382 brelse(bh);
4533 4383
4534 /* 4384 /*
4535 * Everything below this pointer has been 4385 * Everything below this pointer has been
@@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5040 } 4890 }
5041 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
5042 4892
5043 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
5044 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
5045 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
5046 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5299 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5300 goto out_brelse; 5150 goto out_brelse;
5301 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5302 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5303 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5304 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5305 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5314{
5465 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5468 5319
5469 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
@@ -5519,8 +5370,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5370 error = PTR_ERR(handle);
5520 goto err_out; 5371 goto err_out;
5521 } 5372 }
5522 5373 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5374 error = ext4_orphan_add(handle, inode);
5375 orphan = 1;
5376 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5377 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5378 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5379 if (!error)
@@ -5538,6 +5391,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5391 goto err_out;
5539 } 5392 }
5540 ext4_orphan_del(handle, inode); 5393 ext4_orphan_del(handle, inode);
5394 orphan = 0;
5541 ext4_journal_stop(handle); 5395 ext4_journal_stop(handle);
5542 goto err_out; 5396 goto err_out;
5543 } 5397 }
@@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5414 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5415 * all, we need to clean up the in-core orphan list manually.
5562 */ 5416 */
5563 if (inode->i_nlink) 5417 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5418 ext4_orphan_del(NULL, inode);
5565 5419
5566 if (!rc && (ia_valid & ATTR_MODE)) 5420 if (!rc && (ia_valid & ATTR_MODE))
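The orphan variable threaded through ext4_setattr() above records whether the inode was actually placed on the on-disk orphan list, which can only happen under a valid (journalled) handle; the in-core cleanup at the end then fires only in that case. The bookkeeping in brief:

int orphan = 0;

if (ext4_handle_valid(handle)) {
	error = ext4_orphan_add(handle, inode);
	orphan = 1;			/* remember we are on the list */
}
/* ... the truncate below may fail part-way ... */
if (orphan && inode->i_nlink)
	ext4_orphan_del(NULL, inode);	/* in-core cleanup only if we added it */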
@@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5447 * blocks for this file.
5594 */ 5448 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5450
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5452 return 0;
@@ -5643,7 +5495,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5495 *
5644 * Also account for superblock, inode, quota and xattr blocks 5496 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5497 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5498static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5499{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5500 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5501 int gdpblocks;
@@ -5831,6 +5683,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5683 int err, ret;
5832 5684
5833 might_sleep(); 5685 might_sleep();
5686 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5687 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5688 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5689 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1bd..eb3bc2fe647e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct fstrim_range range;
338 int ret = 0;
339
340 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM;
342
343 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range)))
345 return -EFAULT;
346
347 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0)
349 return ret;
350
351 if (copy_to_user((struct fstrim_range *)arg, &range,
352 sizeof(range)))
353 return -EFAULT;
354
355 return 0;
356 }
357
334 default: 358 default:
335 return -ENOTTY; 359 return -ENOTTY;
336 } 360 }
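From userspace the new ioctl is driven with a struct fstrim_range, and the kernel writes back the number of bytes it actually discarded. A minimal caller (error handling trimmed; the mount point is an example, and since the hunk above requires CAP_SYS_ADMIN, run it as root):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* trim the whole filesystem */
		.minlen = 0,
	};
	int fd = open("/mnt", O_RDONLY);	/* example mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}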
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..851f49b2f9d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
338static struct kmem_cache *ext4_pspace_cachep; 338static struct kmem_cache *ext4_pspace_cachep;
339static struct kmem_cache *ext4_ac_cachep; 339static struct kmem_cache *ext4_ac_cachep;
340static struct kmem_cache *ext4_free_ext_cachep; 340static struct kmem_cache *ext4_free_ext_cachep;
341
342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348
341static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
342 ext4_group_t group); 350 ext4_group_t group);
343static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
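Each of these caches is sized like the kzalloc() it replaces in ext4_mb_add_groupinfo(): the group_info header plus a bb_counters[] array covering s_blocksize_bits + 2 buddy orders. A sketch of how one cache would be created for a given block size (the cache name here is illustrative, not the actual string used):

int len = offsetof(struct ext4_group_info,
		   bb_counters[blocksize_bits + 2]);

/* One cache per distinct s_blocksize_bits value, shared by all
 * filesystems mounted with that block size. */
ext4_groupinfo_caches[blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE] =
	kmem_cache_create("ext4_groupinfo", len, 0,
			  SLAB_RECLAIM_ACCOUNT, NULL);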
@@ -939,6 +947,85 @@ out:
939} 947}
940 948
941/* 949/*
950 * lock the group_info alloc_sem of all the groups
951 * belonging to the same buddy cache page. This
952 * makes sure other parallel operations on the buddy
953 * cache don't happen while holding the buddy cache
954 * lock
955 */
956static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
957 ext4_group_t group)
958{
959 int i;
960 int block, pnum;
961 int blocks_per_page;
962 int groups_per_page;
963 ext4_group_t ngroups = ext4_get_groups_count(sb);
964 ext4_group_t first_group;
965 struct ext4_group_info *grp;
966
967 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
968 /*
969 * the buddy cache inode stores the block bitmap
970 * and buddy information in consecutive blocks.
971 * So for each group we need two blocks.
972 */
973 block = group * 2;
974 pnum = block / blocks_per_page;
975 first_group = pnum * blocks_per_page / 2;
976
977 groups_per_page = blocks_per_page >> 1;
978 if (groups_per_page == 0)
979 groups_per_page = 1;
980 /* read all groups the page covers into the cache */
981 for (i = 0; i < groups_per_page; i++) {
982
983 if ((first_group + i) >= ngroups)
984 break;
985 grp = ext4_get_group_info(sb, first_group + i);
986 /* take all groups write allocation
987 * semaphore. This makes sure there is
988 * no block allocation going on in any
989 * of those groups
990 */
991 down_write_nested(&grp->alloc_sem, i);
992 }
993 return i;
994}
995
996static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
997 ext4_group_t group, int locked_group)
998{
999 int i;
1000 int block, pnum;
1001 int blocks_per_page;
1002 ext4_group_t first_group;
1003 struct ext4_group_info *grp;
1004
1005 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1006 /*
1007 * the buddy cache inode stores the block bitmap
1008 * and buddy information in consecutive blocks.
1009 * So for each group we need two blocks.
1010 */
1011 block = group * 2;
1012 pnum = block / blocks_per_page;
1013 first_group = pnum * blocks_per_page / 2;
1014 /* release locks on all the groups */
1015 for (i = 0; i < locked_group; i++) {
1016
1017 grp = ext4_get_group_info(sb, first_group + i);
1018 /* release each group's write allocation
1019 * semaphore. This re-enables
1020 * block allocation in any
1021 * of those groups
1022 */
1023 up_write(&grp->alloc_sem);
1024 }
1025
1026}
1027
1028/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1029 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when 1030 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine! 1031 * calling this routine!
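The arithmetic inside both helpers follows from the buddy-cache layout: the buddy inode stores the block bitmap and the buddy bitmap as two consecutive blocks per group, so converting a group to the page that holds it, and back to the first group on that page, is pure integer math. Extracted for clarity:

int blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
int block = group * 2;			/* two blocks per group */
int pnum = block / blocks_per_page;	/* page holding this group */
ext4_group_t first_group = pnum * blocks_per_page / 2;
int groups_per_page = blocks_per_page >> 1;

if (groups_per_page == 0)		/* block size == page size */
	groups_per_page = 1;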
@@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1915 return 0; 2002 return 0;
1916} 2003}
1917 2004
1918/*
1919 * lock the group_info alloc_sem of all the groups
1920 * belonging to the same buddy cache page. This
1921 * makes sure other parallel operations on the buddy
1922 * cache don't happen while holding the buddy cache
1923 * lock
1924 */
1925int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1926{
1927 int i;
1928 int block, pnum;
1929 int blocks_per_page;
1930 int groups_per_page;
1931 ext4_group_t ngroups = ext4_get_groups_count(sb);
1932 ext4_group_t first_group;
1933 struct ext4_group_info *grp;
1934
1935 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1936 /*
1937 * the buddy cache inode stores the block bitmap
1938 * and buddy information in consecutive blocks.
1939 * So for each group we need two blocks.
1940 */
1941 block = group * 2;
1942 pnum = block / blocks_per_page;
1943 first_group = pnum * blocks_per_page / 2;
1944
1945 groups_per_page = blocks_per_page >> 1;
1946 if (groups_per_page == 0)
1947 groups_per_page = 1;
1948 /* read all groups the page covers into the cache */
1949 for (i = 0; i < groups_per_page; i++) {
1950
1951 if ((first_group + i) >= ngroups)
1952 break;
1953 grp = ext4_get_group_info(sb, first_group + i);
1954 /* take all groups write allocation
1955 * semaphore. This makes sure there is
1956 * no block allocation going on in any
1957 * of those groups
1958 */
1959 down_write_nested(&grp->alloc_sem, i);
1960 }
1961 return i;
1962}
1963
1964void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1965 ext4_group_t group, int locked_group)
1966{
1967 int i;
1968 int block, pnum;
1969 int blocks_per_page;
1970 ext4_group_t first_group;
1971 struct ext4_group_info *grp;
1972
1973 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1974 /*
1975 * the buddy cache inode stores the block bitmap
1976 * and buddy information in consecutive blocks.
1977 * So for each group we need two blocks.
1978 */
1979 block = group * 2;
1980 pnum = block / blocks_per_page;
1981 first_group = pnum * blocks_per_page / 2;
1982 /* release locks on all the groups */
1983 for (i = 0; i < locked_group; i++) {
1984
1985 grp = ext4_get_group_info(sb, first_group + i);
1986 /* release each group's write allocation
1987 * semaphore. This re-enables
1988 * block allocation in any
1989 * of those groups
1990 */
1991 up_write(&grp->alloc_sem);
1992 }
1993
1994}
1995
1996static noinline_for_stack int 2005static noinline_for_stack int
1997ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2006ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1998{ 2007{
@@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2233 .release = seq_release, 2242 .release = seq_release,
2234}; 2243};
2235 2244
2245static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2246{
2247 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2248 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2249
2250 BUG_ON(!cachep);
2251 return cachep;
2252}
2236 2253
2237/* Create and initialize ext4_group_info data for the given group. */ 2254/* Create and initialize ext4_group_info data for the given group. */
2238int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2255int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2239 struct ext4_group_desc *desc) 2256 struct ext4_group_desc *desc)
2240{ 2257{
2241 int i, len; 2258 int i;
2242 int metalen = 0; 2259 int metalen = 0;
2243 struct ext4_sb_info *sbi = EXT4_SB(sb); 2260 struct ext4_sb_info *sbi = EXT4_SB(sb);
2244 struct ext4_group_info **meta_group_info; 2261 struct ext4_group_info **meta_group_info;
2262 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2245 2263
2246 /* 2264 /*
2247 * First check if this group is the first of a reserved block. 2265 * First check if this group is the first of a reserved block.
@@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2261 meta_group_info; 2279 meta_group_info;
2262 } 2280 }
2263 2281
2264 /*
2265 * calculate needed size. if change bb_counters size,
2266 * don't forget about ext4_mb_generate_buddy()
2267 */
2268 len = offsetof(typeof(**meta_group_info),
2269 bb_counters[sb->s_blocksize_bits + 2]);
2270
2271 meta_group_info = 2282 meta_group_info =
2272 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2283 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2273 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2284 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2274 2285
2275 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2286 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2276 if (meta_group_info[i] == NULL) { 2287 if (meta_group_info[i] == NULL) {
2277 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2288 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2278 goto exit_group_info; 2289 goto exit_group_info;
2279 } 2290 }
2291 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2280 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2292 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2281 &(meta_group_info[i]->bb_state)); 2293 &(meta_group_info[i]->bb_state));
2282 2294
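
The index math in the new get_groupinfo_cache() helper can be illustrated in isolation; a hedged userspace sketch, assuming EXT4_MIN_BLOCK_LOG_SIZE is 10 (1K blocks) and seven supported block sizes up to 64K (these constants are assumptions for the demo, not quoted from the ext4 headers):

#include <stdio.h>

#define EXT4_MIN_BLOCK_LOG_SIZE 10	/* assumed: log2(1024) */
#define NR_GRPINFO_CACHES 7		/* assumed: 1K..64K block sizes */

int main(void)
{
	int bits;

	/* One kmem_cache per supported block size; the cache index is
	 * just the log2 offset from the smallest block size. */
	for (bits = EXT4_MIN_BLOCK_LOG_SIZE;
	     bits < EXT4_MIN_BLOCK_LOG_SIZE + NR_GRPINFO_CACHES; bits++)
		printf("blocksize %6d -> cache index %d (ext4_groupinfo_%d)\n",
		       1 << bits, bits - EXT4_MIN_BLOCK_LOG_SIZE, bits);
	return 0;
}
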
@@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2331 int num_meta_group_infos_max; 2343 int num_meta_group_infos_max;
2332 int array_size; 2344 int array_size;
2333 struct ext4_group_desc *desc; 2345 struct ext4_group_desc *desc;
2346 struct kmem_cache *cachep;
2334 2347
2335 /* This is the number of blocks used by GDT */ 2348 /* This is the number of blocks used by GDT */
2336 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2349 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2373,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2386 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2374 goto err_freesgi; 2387 goto err_freesgi;
2375 } 2388 }
2389 sbi->s_buddy_cache->i_ino = get_next_ino();
2376 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2390 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2377 for (i = 0; i < ngroups; i++) { 2391 for (i = 0; i < ngroups; i++) {
2378 desc = ext4_get_group_desc(sb, i, NULL); 2392 desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
2388 return 0; 2402 return 0;
2389 2403
2390err_freebuddy: 2404err_freebuddy:
2405 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2391 while (i-- > 0) 2406 while (i-- > 0)
2392 kfree(ext4_get_group_info(sb, i)); 2407 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2393 i = num_meta_group_infos; 2408 i = num_meta_group_infos;
2394 while (i-- > 0) 2409 while (i-- > 0)
2395 kfree(sbi->s_group_info[i]); 2410 kfree(sbi->s_group_info[i]);
@@ -2406,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2406 unsigned offset; 2421 unsigned offset;
2407 unsigned max; 2422 unsigned max;
2408 int ret; 2423 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2409 2427
2410 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2411 2429
2412 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2430 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2413 if (sbi->s_mb_offsets == NULL) { 2431 if (sbi->s_mb_offsets == NULL) {
2414 return -ENOMEM; 2432 ret = -ENOMEM;
2433 goto out;
2415 } 2434 }
2416 2435
2417 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2436 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2418 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2437 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2419 if (sbi->s_mb_maxs == NULL) { 2438 if (sbi->s_mb_maxs == NULL) {
2420 kfree(sbi->s_mb_offsets); 2439 ret = -ENOMEM;
2421 return -ENOMEM; 2440 goto out;
2441 }
2442
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2444 cachep = ext4_groupinfo_caches[cache_index];
2445 if (!cachep) {
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
 2457 /* Need to free the name returned by kmem_cache_name()
 2458 * when we destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2422 } 2466 }
2423 2467
2424 /* order 0 is regular bitmap */ 2468 /* order 0 is regular bitmap */
@@ -2439,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2439 /* init file for buddy data */ 2483 /* init file for buddy data */
2440 ret = ext4_mb_init_backend(sb); 2484 ret = ext4_mb_init_backend(sb);
2441 if (ret != 0) { 2485 if (ret != 0) {
2442 kfree(sbi->s_mb_offsets); 2486 goto out;
2443 kfree(sbi->s_mb_maxs);
2444 return ret;
2445 } 2487 }
2446 2488
2447 spin_lock_init(&sbi->s_md_lock); 2489 spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2456 2498
2457 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2499 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2458 if (sbi->s_locality_groups == NULL) { 2500 if (sbi->s_locality_groups == NULL) {
2459 kfree(sbi->s_mb_offsets); 2501 ret = -ENOMEM;
2460 kfree(sbi->s_mb_maxs); 2502 goto out;
2461 return -ENOMEM;
2462 } 2503 }
2463 for_each_possible_cpu(i) { 2504 for_each_possible_cpu(i) {
2464 struct ext4_locality_group *lg; 2505 struct ext4_locality_group *lg;
@@ -2475,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2475 2516
2476 if (sbi->s_journal) 2517 if (sbi->s_journal)
2477 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2518 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2478 return 0; 2519out:
2520 if (ret) {
2521 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 }
2525 return ret;
2479} 2526}
2480 2527
 2481/* needs to be called with the ext4 group lock held */ 2528/* needs to be called with the ext4 group lock held */
@@ -2503,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
2503 int num_meta_group_infos; 2550 int num_meta_group_infos;
2504 struct ext4_group_info *grinfo; 2551 struct ext4_group_info *grinfo;
2505 struct ext4_sb_info *sbi = EXT4_SB(sb); 2552 struct ext4_sb_info *sbi = EXT4_SB(sb);
2553 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2506 2554
2507 if (sbi->s_group_info) { 2555 if (sbi->s_group_info) {
2508 for (i = 0; i < ngroups; i++) { 2556 for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
2513 ext4_lock_group(sb, i); 2561 ext4_lock_group(sb, i);
2514 ext4_mb_cleanup_pa(grinfo); 2562 ext4_mb_cleanup_pa(grinfo);
2515 ext4_unlock_group(sb, i); 2563 ext4_unlock_group(sb, i);
2516 kfree(grinfo); 2564 kmem_cache_free(cachep, grinfo);
2517 } 2565 }
2518 num_meta_group_infos = (ngroups + 2566 num_meta_group_infos = (ngroups +
2519 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2567 EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,20 +2605,15 @@ int ext4_mb_release(struct super_block *sb)
2557 return 0; 2605 return 0;
2558} 2606}
2559 2607
2560static inline void ext4_issue_discard(struct super_block *sb, 2608static inline int ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count) 2609 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{ 2610{
2563 int ret;
2564 ext4_fsblk_t discard_block; 2611 ext4_fsblk_t discard_block;
2565 2612
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2613 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2614 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2615 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2616 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574} 2617}
2575 2618
2576/* 2619/*
@@ -2582,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2582 struct super_block *sb = journal->j_private; 2625 struct super_block *sb = journal->j_private;
2583 struct ext4_buddy e4b; 2626 struct ext4_buddy e4b;
2584 struct ext4_group_info *db; 2627 struct ext4_group_info *db;
2585 int err, count = 0, count2 = 0; 2628 int err, ret, count = 0, count2 = 0;
2586 struct ext4_free_data *entry; 2629 struct ext4_free_data *entry;
2587 struct list_head *l, *ltmp; 2630 struct list_head *l, *ltmp;
2588 2631
@@ -2592,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2592 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2635 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2593 entry->count, entry->group, entry); 2636 entry->count, entry->group, entry);
2594 2637
2595 if (test_opt(sb, DISCARD)) 2638 if (test_opt(sb, DISCARD)) {
2596 ext4_issue_discard(sb, entry->group, 2639 ret = ext4_issue_discard(sb, entry->group,
2597 entry->start_blk, entry->count); 2640 entry->start_blk, entry->count);
2641 if (unlikely(ret == -EOPNOTSUPP)) {
2642 ext4_warning(sb, "discard not supported, "
2643 "disabling");
2644 clear_opt(sb, DISCARD);
2645 }
2646 }
2598 2647
2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2648 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2600 /* we expect to find existing buddy because it's pinned */ 2649 /* we expect to find existing buddy because it's pinned */
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
2658 2707
2659#endif 2708#endif
2660 2709
2661int __init init_ext4_mballoc(void) 2710int __init ext4_init_mballoc(void)
2662{ 2711{
2663 ext4_pspace_cachep = 2712 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2664 kmem_cache_create("ext4_prealloc_space", 2713 SLAB_RECLAIM_ACCOUNT);
2665 sizeof(struct ext4_prealloc_space),
2666 0, SLAB_RECLAIM_ACCOUNT, NULL);
2667 if (ext4_pspace_cachep == NULL) 2714 if (ext4_pspace_cachep == NULL)
2668 return -ENOMEM; 2715 return -ENOMEM;
2669 2716
2670 ext4_ac_cachep = 2717 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2671 kmem_cache_create("ext4_alloc_context", 2718 SLAB_RECLAIM_ACCOUNT);
2672 sizeof(struct ext4_allocation_context),
2673 0, SLAB_RECLAIM_ACCOUNT, NULL);
2674 if (ext4_ac_cachep == NULL) { 2719 if (ext4_ac_cachep == NULL) {
2675 kmem_cache_destroy(ext4_pspace_cachep); 2720 kmem_cache_destroy(ext4_pspace_cachep);
2676 return -ENOMEM; 2721 return -ENOMEM;
2677 } 2722 }
2678 2723
2679 ext4_free_ext_cachep = 2724 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2680 kmem_cache_create("ext4_free_block_extents", 2725 SLAB_RECLAIM_ACCOUNT);
2681 sizeof(struct ext4_free_data),
2682 0, SLAB_RECLAIM_ACCOUNT, NULL);
2683 if (ext4_free_ext_cachep == NULL) { 2726 if (ext4_free_ext_cachep == NULL) {
2684 kmem_cache_destroy(ext4_pspace_cachep); 2727 kmem_cache_destroy(ext4_pspace_cachep);
2685 kmem_cache_destroy(ext4_ac_cachep); 2728 kmem_cache_destroy(ext4_ac_cachep);
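
The conversion above leans on the generic KMEM_CACHE() helper, which derives the cache name, object size and alignment from the struct type itself. A userspace lookalike of that idea (a sketch with a stand-in struct, not the kernel macro):

#include <stdio.h>

/* Userspace lookalike of the KMEM_CACHE() idea: derive the cache name,
 * object size and alignment from the struct type. */
#define KMEM_CACHE_DEMO(type)						\
	printf("cache \"%s\": size %zu, align %zu\n", #type,		\
	       sizeof(struct type), (size_t) __alignof__(struct type))

/* Stand-in struct; the real ext4 structures live in mballoc.h. */
struct prealloc_space_demo {
	unsigned long pa_pstart;
	unsigned short pa_len;
	unsigned char pa_type;
};

int main(void)
{
	KMEM_CACHE_DEMO(prealloc_space_demo);
	return 0;
}
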
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
2689 return 0; 2732 return 0;
2690} 2733}
2691 2734
2692void exit_ext4_mballoc(void) 2735void ext4_exit_mballoc(void)
2693{ 2736{
2737 int i;
2694 /* 2738 /*
2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2696 * before destroying the slab cache. 2740 * before destroying the slab cache.
@@ -2699,6 +2743,15 @@ void exit_ext4_mballoc(void)
2699 kmem_cache_destroy(ext4_pspace_cachep); 2743 kmem_cache_destroy(ext4_pspace_cachep);
2700 kmem_cache_destroy(ext4_ac_cachep); 2744 kmem_cache_destroy(ext4_ac_cachep);
2701 kmem_cache_destroy(ext4_free_ext_cachep); 2745 kmem_cache_destroy(ext4_free_ext_cachep);
2746
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2702 ext4_remove_debugfs_entry(); 2755 ext4_remove_debugfs_entry();
2703} 2756}
2704 2757
@@ -3535,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3535 */ 3588 */
3536static noinline_for_stack int 3589static noinline_for_stack int
3537ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3590ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3538 struct ext4_prealloc_space *pa, 3591 struct ext4_prealloc_space *pa)
3539 struct ext4_allocation_context *ac)
3540{ 3592{
3541 struct super_block *sb = e4b->bd_sb; 3593 struct super_block *sb = e4b->bd_sb;
3542 struct ext4_sb_info *sbi = EXT4_SB(sb); 3594 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3554 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3606 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3555 end = bit + pa->pa_len; 3607 end = bit + pa->pa_len;
3556 3608
3557 if (ac) {
3558 ac->ac_sb = sb;
3559 ac->ac_inode = pa->pa_inode;
3560 }
3561
3562 while (bit < end) { 3609 while (bit < end) {
3563 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3610 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3564 if (bit >= end) 3611 if (bit >= end)
@@ -3569,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3569 (unsigned) next - bit, (unsigned) group); 3616 (unsigned) next - bit, (unsigned) group);
3570 free += next - bit; 3617 free += next - bit;
3571 3618
3572 if (ac) { 3619 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3573 ac->ac_b_ex.fe_group = group; 3620 trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
3574 ac->ac_b_ex.fe_start = bit; 3621 grp_blk_start + bit, next - bit);
3575 ac->ac_b_ex.fe_len = next - bit;
3576 ac->ac_b_ex.fe_logical = 0;
3577 trace_ext4_mballoc_discard(ac);
3578 }
3579
3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3581 next - bit);
3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3622 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3583 bit = next + 1; 3623 bit = next + 1;
3584 } 3624 }
@@ -3601,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3601 3641
3602static noinline_for_stack int 3642static noinline_for_stack int
3603ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3643ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3604 struct ext4_prealloc_space *pa, 3644 struct ext4_prealloc_space *pa)
3605 struct ext4_allocation_context *ac)
3606{ 3645{
3607 struct super_block *sb = e4b->bd_sb; 3646 struct super_block *sb = e4b->bd_sb;
3608 ext4_group_t group; 3647 ext4_group_t group;
3609 ext4_grpblk_t bit; 3648 ext4_grpblk_t bit;
3610 3649
3611 trace_ext4_mb_release_group_pa(sb, ac, pa); 3650 trace_ext4_mb_release_group_pa(sb, pa);
3612 BUG_ON(pa->pa_deleted == 0); 3651 BUG_ON(pa->pa_deleted == 0);
3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3652 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3653 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3654 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3655 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3617 3656 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3618 if (ac) {
3619 ac->ac_sb = sb;
3620 ac->ac_inode = NULL;
3621 ac->ac_b_ex.fe_group = group;
3622 ac->ac_b_ex.fe_start = bit;
3623 ac->ac_b_ex.fe_len = pa->pa_len;
3624 ac->ac_b_ex.fe_logical = 0;
3625 trace_ext4_mballoc_discard(ac);
3626 }
3627 3657
3628 return 0; 3658 return 0;
3629} 3659}
@@ -3644,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3644 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3674 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3645 struct buffer_head *bitmap_bh = NULL; 3675 struct buffer_head *bitmap_bh = NULL;
3646 struct ext4_prealloc_space *pa, *tmp; 3676 struct ext4_prealloc_space *pa, *tmp;
3647 struct ext4_allocation_context *ac;
3648 struct list_head list; 3677 struct list_head list;
3649 struct ext4_buddy e4b; 3678 struct ext4_buddy e4b;
3650 int err; 3679 int err;
@@ -3673,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3673 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3702 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3674 3703
3675 INIT_LIST_HEAD(&list); 3704 INIT_LIST_HEAD(&list);
3676 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3677 if (ac)
3678 ac->ac_sb = sb;
3679repeat: 3705repeat:
3680 ext4_lock_group(sb, group); 3706 ext4_lock_group(sb, group);
3681 list_for_each_entry_safe(pa, tmp, 3707 list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3756,9 @@ repeat:
3730 spin_unlock(pa->pa_obj_lock); 3756 spin_unlock(pa->pa_obj_lock);
3731 3757
3732 if (pa->pa_type == MB_GROUP_PA) 3758 if (pa->pa_type == MB_GROUP_PA)
3733 ext4_mb_release_group_pa(&e4b, pa, ac); 3759 ext4_mb_release_group_pa(&e4b, pa);
3734 else 3760 else
3735 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3761 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3736 3762
3737 list_del(&pa->u.pa_tmp_list); 3763 list_del(&pa->u.pa_tmp_list);
3738 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3764 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3766,6 @@ repeat:
3740 3766
3741out: 3767out:
3742 ext4_unlock_group(sb, group); 3768 ext4_unlock_group(sb, group);
3743 if (ac)
3744 kmem_cache_free(ext4_ac_cachep, ac);
3745 ext4_mb_unload_buddy(&e4b); 3769 ext4_mb_unload_buddy(&e4b);
3746 put_bh(bitmap_bh); 3770 put_bh(bitmap_bh);
3747 return free; 3771 return free;
@@ -3762,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
3762 struct super_block *sb = inode->i_sb; 3786 struct super_block *sb = inode->i_sb;
3763 struct buffer_head *bitmap_bh = NULL; 3787 struct buffer_head *bitmap_bh = NULL;
3764 struct ext4_prealloc_space *pa, *tmp; 3788 struct ext4_prealloc_space *pa, *tmp;
3765 struct ext4_allocation_context *ac;
3766 ext4_group_t group = 0; 3789 ext4_group_t group = 0;
3767 struct list_head list; 3790 struct list_head list;
3768 struct ext4_buddy e4b; 3791 struct ext4_buddy e4b;
@@ -3778,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
3778 3801
3779 INIT_LIST_HEAD(&list); 3802 INIT_LIST_HEAD(&list);
3780 3803
3781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3782 if (ac) {
3783 ac->ac_sb = sb;
3784 ac->ac_inode = inode;
3785 }
3786repeat: 3804repeat:
3787 /* first, collect all pa's in the inode */ 3805 /* first, collect all pa's in the inode */
3788 spin_lock(&ei->i_prealloc_lock); 3806 spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3870,7 @@ repeat:
3852 3870
3853 ext4_lock_group(sb, group); 3871 ext4_lock_group(sb, group);
3854 list_del(&pa->pa_group_list); 3872 list_del(&pa->pa_group_list);
3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3873 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3856 ext4_unlock_group(sb, group); 3874 ext4_unlock_group(sb, group);
3857 3875
3858 ext4_mb_unload_buddy(&e4b); 3876 ext4_mb_unload_buddy(&e4b);
@@ -3861,23 +3879,8 @@ repeat:
3861 list_del(&pa->u.pa_tmp_list); 3879 list_del(&pa->u.pa_tmp_list);
3862 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3880 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3863 } 3881 }
3864 if (ac)
3865 kmem_cache_free(ext4_ac_cachep, ac);
3866} 3882}
3867 3883
3868/*
 3869 * finds all preallocated spaces and returns blocks being freed to them;
 3870 * if a preallocated space becomes full (no block is used from the space)
 3871 * then the function frees the space in the buddy
3872 * XXX: at the moment, truncate (which is the only way to free blocks)
3873 * discards all preallocations
3874 */
3875static void ext4_mb_return_to_preallocation(struct inode *inode,
3876 struct ext4_buddy *e4b,
3877 sector_t block, int count)
3878{
3879 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
3880}
3881#ifdef CONFIG_EXT4_DEBUG 3884#ifdef CONFIG_EXT4_DEBUG
3882static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3885static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3883{ 3886{
@@ -4060,14 +4063,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 struct ext4_buddy e4b; 4063 struct ext4_buddy e4b;
4061 struct list_head discard_list; 4064 struct list_head discard_list;
4062 struct ext4_prealloc_space *pa, *tmp; 4065 struct ext4_prealloc_space *pa, *tmp;
4063 struct ext4_allocation_context *ac;
4064 4066
4065 mb_debug(1, "discard locality group preallocation\n"); 4067 mb_debug(1, "discard locality group preallocation\n");
4066 4068
4067 INIT_LIST_HEAD(&discard_list); 4069 INIT_LIST_HEAD(&discard_list);
4068 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4069 if (ac)
4070 ac->ac_sb = sb;
4071 4070
4072 spin_lock(&lg->lg_prealloc_lock); 4071 spin_lock(&lg->lg_prealloc_lock);
4073 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4072 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4118,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4119 } 4118 }
4120 ext4_lock_group(sb, group); 4119 ext4_lock_group(sb, group);
4121 list_del(&pa->pa_group_list); 4120 list_del(&pa->pa_group_list);
4122 ext4_mb_release_group_pa(&e4b, pa, ac); 4121 ext4_mb_release_group_pa(&e4b, pa);
4123 ext4_unlock_group(sb, group); 4122 ext4_unlock_group(sb, group);
4124 4123
4125 ext4_mb_unload_buddy(&e4b); 4124 ext4_mb_unload_buddy(&e4b);
4126 list_del(&pa->u.pa_tmp_list); 4125 list_del(&pa->u.pa_tmp_list);
4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4126 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4128 } 4127 }
4129 if (ac)
4130 kmem_cache_free(ext4_ac_cachep, ac);
4131} 4128}
4132 4129
4133/* 4130/*
@@ -4273,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4273 * EDQUOT check, as blocks and quotas have been already 4270 * EDQUOT check, as blocks and quotas have been already
4274 * reserved when data being copied into pagecache. 4271 * reserved when data being copied into pagecache.
4275 */ 4272 */
4276 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4273 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4277 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4274 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4278 else { 4275 else {
4279 /* Without delayed allocation we need to verify 4276 /* Without delayed allocation we need to verify
@@ -4370,7 +4367,8 @@ out:
4370 if (inquota && ar->len < inquota) 4367 if (inquota && ar->len < inquota)
4371 dquot_free_block(ar->inode, inquota - ar->len); 4368 dquot_free_block(ar->inode, inquota - ar->len);
4372 if (!ar->len) { 4369 if (!ar->len) {
4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4370 if (!ext4_test_inode_state(ar->inode,
4371 EXT4_STATE_DELALLOC_RESERVED))
4374 /* release all the reserved blocks if non delalloc */ 4372 /* release all the reserved blocks if non delalloc */
4375 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4373 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4376 reserv_blks); 4374 reserv_blks);
@@ -4491,7 +4489,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4491{ 4489{
4492 struct buffer_head *bitmap_bh = NULL; 4490 struct buffer_head *bitmap_bh = NULL;
4493 struct super_block *sb = inode->i_sb; 4491 struct super_block *sb = inode->i_sb;
4494 struct ext4_allocation_context *ac = NULL;
4495 struct ext4_group_desc *gdp; 4492 struct ext4_group_desc *gdp;
4496 unsigned long freed = 0; 4493 unsigned long freed = 0;
4497 unsigned int overflow; 4494 unsigned int overflow;
@@ -4531,6 +4528,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4531 if (!bh) 4528 if (!bh)
4532 tbh = sb_find_get_block(inode->i_sb, 4529 tbh = sb_find_get_block(inode->i_sb,
4533 block + i); 4530 block + i);
4531 if (unlikely(!tbh))
4532 continue;
4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4533 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4535 inode, tbh, block + i); 4534 inode, tbh, block + i);
4536 } 4535 }
@@ -4546,12 +4545,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4546 if (!ext4_should_writeback_data(inode)) 4545 if (!ext4_should_writeback_data(inode))
4547 flags |= EXT4_FREE_BLOCKS_METADATA; 4546 flags |= EXT4_FREE_BLOCKS_METADATA;
4548 4547
4549 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4550 if (ac) {
4551 ac->ac_inode = inode;
4552 ac->ac_sb = sb;
4553 }
4554
4555do_more: 4548do_more:
4556 overflow = 0; 4549 overflow = 0;
4557 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4550 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4602,7 @@ do_more:
4609 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4602 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4610 } 4603 }
4611#endif 4604#endif
4612 if (ac) { 4605 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4613 ac->ac_b_ex.fe_group = block_group;
4614 ac->ac_b_ex.fe_start = bit;
4615 ac->ac_b_ex.fe_len = count;
4616 trace_ext4_mballoc_free(ac);
4617 }
4618 4606
4619 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4607 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4620 if (err) 4608 if (err)
@@ -4626,7 +4614,11 @@ do_more:
4626 * blocks being freed are metadata. these blocks shouldn't 4614 * blocks being freed are metadata. these blocks shouldn't
4627 * be used until this transaction is committed 4615 * be used until this transaction is committed
4628 */ 4616 */
4629 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4617 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4618 if (!new_entry) {
4619 err = -ENOMEM;
4620 goto error_return;
4621 }
4630 new_entry->start_blk = bit; 4622 new_entry->start_blk = bit;
4631 new_entry->group = block_group; 4623 new_entry->group = block_group;
4632 new_entry->count = count; 4624 new_entry->count = count;
@@ -4643,9 +4635,6 @@ do_more:
4643 ext4_lock_group(sb, block_group); 4635 ext4_lock_group(sb, block_group);
4644 mb_clear_bits(bitmap_bh->b_data, bit, count); 4636 mb_clear_bits(bitmap_bh->b_data, bit, count);
4645 mb_free_blocks(inode, &e4b, bit, count); 4637 mb_free_blocks(inode, &e4b, bit, count);
4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4649 } 4638 }
4650 4639
4651 ret = ext4_free_blks_count(sb, gdp) + count; 4640 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4674,194 @@ error_return:
4685 dquot_free_block(inode, freed); 4674 dquot_free_block(inode, freed);
4686 brelse(bitmap_bh); 4675 brelse(bitmap_bh);
4687 ext4_std_error(sb, err); 4676 ext4_std_error(sb, err);
4688 if (ac)
4689 kmem_cache_free(ext4_ac_cachep, ac);
4690 return; 4677 return;
4691} 4678}
4679
4680/**
4681 * ext4_trim_extent -- function to TRIM one single free extent in the group
4682 * @sb: super block for the file system
4683 * @start: starting block of the free extent in the alloc. group
4684 * @count: number of blocks to TRIM
4685 * @group: alloc. group we are working with
4686 * @e4b: ext4 buddy for the group
4687 *
 4688 * Trim "count" blocks starting at "start" in the "group". To ensure that no
 4689 * one will allocate those blocks, mark them as used in the buddy bitmap. This
 4690 * must be called under the group lock.
4691 */
4692static int ext4_trim_extent(struct super_block *sb, int start, int count,
4693 ext4_group_t group, struct ext4_buddy *e4b)
4694{
4695 struct ext4_free_extent ex;
4696 int ret = 0;
4697
4698 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4699
4700 ex.fe_start = start;
4701 ex.fe_group = group;
4702 ex.fe_len = count;
4703
4704 /*
4705 * Mark blocks used, so no one can reuse them while
4706 * being trimmed.
4707 */
4708 mb_mark_used(e4b, &ex);
4709 ext4_unlock_group(sb, group);
4710
4711 ret = ext4_issue_discard(sb, group, start, count);
4712
4713 ext4_lock_group(sb, group);
4714 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4715 return ret;
4716}
4717
4718/**
4719 * ext4_trim_all_free -- function to trim all free space in alloc. group
4720 * @sb: super block for file system
4721 * @e4b: ext4 buddy
4722 * @start: first group block to examine
4723 * @max: last group block to examine
4724 * @minblocks: minimum extent block count
4725 *
 4726 * ext4_trim_all_free walks through the group's buddy bitmap searching
 4727 * for free extents of at least minblocks blocks.
 4728 *
 4729 * When such an extent is found, it is first marked as used in the group
 4730 * buddy bitmap so that it cannot be reallocated while being trimmed;
 4731 * ext4_trim_extent is then called to issue a TRIM command on the
 4732 * extent, after which the extent is freed again in the buddy bitmap.
 4733 * This is repeated until the whole group has been scanned.
 4734 *
4735 */
4736ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4737 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4738{
4739 void *bitmap;
4740 ext4_grpblk_t next, count = 0;
4741 ext4_group_t group;
4742 int ret = 0;
4743
4744 BUG_ON(e4b == NULL);
4745
4746 bitmap = e4b->bd_bitmap;
4747 group = e4b->bd_group;
4748 start = (e4b->bd_info->bb_first_free > start) ?
4749 e4b->bd_info->bb_first_free : start;
4750 ext4_lock_group(sb, group);
4751
4752 while (start < max) {
4753 start = mb_find_next_zero_bit(bitmap, max, start);
4754 if (start >= max)
4755 break;
4756 next = mb_find_next_bit(bitmap, max, start);
4757
4758 if ((next - start) >= minblocks) {
4759 ret = ext4_trim_extent(sb, start,
4760 next - start, group, e4b);
4761 if (ret < 0)
4762 break;
4763 count += next - start;
4764 }
4765 start = next + 1;
4766
4767 if (fatal_signal_pending(current)) {
4768 count = -ERESTARTSYS;
4769 break;
4770 }
4771
4772 if (need_resched()) {
4773 ext4_unlock_group(sb, group);
4774 cond_resched();
4775 ext4_lock_group(sb, group);
4776 }
4777
4778 if ((e4b->bd_info->bb_free - count) < minblocks)
4779 break;
4780 }
4781 ext4_unlock_group(sb, group);
4782
4783 ext4_debug("trimmed %d blocks in the group %d\n",
4784 count, group);
4785
4786 if (ret < 0)
4787 count = ret;
4788
4789 return count;
4790}
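
The scan loop above alternates between finding the next zero bit (start of a free run) and the next set bit (end of the run). A self-contained userspace sketch of the same pattern on a toy bitmap (the helpers below are simplified stand-ins for the mballoc bit-search primitives):

#include <stdio.h>

static int find_next_zero_bit(unsigned map, int max, int start)
{
	while (start < max && (map >> start & 1))
		start++;
	return start;
}

static int find_next_bit(unsigned map, int max, int start)
{
	while (start < max && !(map >> start & 1))
		start++;
	return start;
}

int main(void)
{
	unsigned map = 0x0f0f;	/* set bits = used blocks */
	int max = 16, minblocks = 3, start = 0;

	while (start < max) {
		start = find_next_zero_bit(map, max, start);
		if (start >= max)
			break;
		int next = find_next_bit(map, max, start);
		if (next - start >= minblocks)
			printf("trim extent [%d, %d)\n", start, next);
		start = next + 1;
	}
	return 0;
}
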
4791
4792/**
 4793 * ext4_trim_fs() -- trim ioctl handler function
4794 * @sb: superblock for filesystem
4795 * @range: fstrim_range structure
4796 *
 4797 * start: first byte to trim
 4798 * len: number of bytes to trim from start
 4799 * minlen: minimum extent length in bytes
 4800 * ext4_trim_fs goes through all allocation groups containing bytes from
 4801 * start to start+len. For each such group, ext4_trim_all_free is
 4802 * invoked to trim all free space.
4803 */
4804int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4805{
4806 struct ext4_buddy e4b;
4807 ext4_group_t first_group, last_group;
4808 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4809 ext4_grpblk_t cnt = 0, first_block, last_block;
4810 uint64_t start, len, minlen, trimmed;
4811 ext4_fsblk_t first_data_blk =
4812 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4813 int ret = 0;
4814
4815 start = range->start >> sb->s_blocksize_bits;
4816 len = range->len >> sb->s_blocksize_bits;
4817 minlen = range->minlen >> sb->s_blocksize_bits;
4818 trimmed = 0;
4819
4820 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4821 return -EINVAL;
4822 if (start < first_data_blk) {
4823 len -= first_data_blk - start;
4824 start = first_data_blk;
4825 }
4826
4827 /* Determine first and last group to examine based on start and len */
4828 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4829 &first_group, &first_block);
4830 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4831 &last_group, &last_block);
4832 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4833 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4834
4835 if (first_group > last_group)
4836 return -EINVAL;
4837
4838 for (group = first_group; group <= last_group; group++) {
4839 ret = ext4_mb_load_buddy(sb, group, &e4b);
4840 if (ret) {
4841 ext4_error(sb, "Error in loading buddy "
4842 "information for %u", group);
4843 break;
4844 }
4845
4846 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
4847 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
4848 else
4849 last_block = first_block + len;
4850
4851 if (e4b.bd_info->bb_free >= minlen) {
4852 cnt = ext4_trim_all_free(sb, &e4b, first_block,
4853 last_block, minlen);
4854 if (cnt < 0) {
4855 ret = cnt;
4856 ext4_mb_unload_buddy(&e4b);
4857 break;
4858 }
4859 }
4860 ext4_mb_unload_buddy(&e4b);
4861 trimmed += cnt;
4862 first_block = 0;
4863 }
4864 range->len = trimmed * sb->s_blocksize;
4865
4866 return ret;
4867}
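
From userspace this handler is reached through the FITRIM ioctl, which passes the same fstrim_range structure. A minimal caller, assuming a kernel and filesystem with FITRIM wired up (error handling trimmed for brevity):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = UINT64_MAX;		/* trim the whole filesystem */
	range.minlen = 4096;		/* skip extents shorter than 4 KiB */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* on return, range.len holds the number of bytes trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long) range.len);
	return 0;
}
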
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..b0a126f23c20 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
412 struct buffer_head *bh; 412 struct buffer_head *bh;
413 struct ext4_extent_header *eh; 413 struct ext4_extent_header *eh;
414 414
415 block = idx_pblock(ix); 415 block = ext4_idx_pblock(ix);
416 bh = sb_bread(inode->i_sb, block); 416 bh = sb_bread(inode->i_sb, block);
417 if (!bh) 417 if (!bh)
418 return -EIO; 418 return -EIO;
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, 0, goal); 499 S_IFREG, NULL, goal);
500 if (IS_ERR(tmp_inode)) { 500 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 501 retval = -ENOMEM;
502 ext4_journal_stop(handle); 502 ext4_journal_stop(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..b9f3e7862f13 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
86 /* leaf block */ 86 /* leaf block */
87 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
89 return 0; 89 return 0;
90 } 90 }
91 91
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
96 96
97 /* index block */ 97 /* index block */
98 path[ppos].p_idx++; 98 path[ppos].p_idx++;
99 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 99 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
100 if (path[ppos+1].p_bh) 100 if (path[ppos+1].p_bh)
101 brelse(path[ppos+1].p_bh); 101 brelse(path[ppos+1].p_bh);
102 path[ppos+1].p_bh = 102 path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
111 path[cur_ppos].p_idx = 111 path[cur_ppos].p_idx =
112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
113 path[cur_ppos].p_block = 113 path[cur_ppos].p_block =
114 idx_pblock(path[cur_ppos].p_idx); 114 ext4_idx_pblock(path[cur_ppos].p_idx);
115 if (path[cur_ppos+1].p_bh) 115 if (path[cur_ppos+1].p_bh)
116 brelse(path[cur_ppos+1].p_bh); 116 brelse(path[cur_ppos+1].p_bh);
117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
133 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block = 135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext); 136 ext4_ext_pblock(path[leaf_ppos].p_ext);
137 return 0; 137 return 0;
138 } 138 }
139 } 139 }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
249 */ 249 */
250 o_end->ee_block = end_ext->ee_block; 250 o_end->ee_block = end_ext->ee_block;
251 o_end->ee_len = end_ext->ee_len; 251 o_end->ee_len = end_ext->ee_len;
252 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 252 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
253 } 253 }
254 254
255 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
276 */ 276 */
277 o_end->ee_block = end_ext->ee_block; 277 o_end->ee_block = end_ext->ee_block;
278 o_end->ee_len = end_ext->ee_len; 278 o_end->ee_len = end_ext->ee_len;
279 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 279 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
280 280
281 /* 281 /*
282 * Set 0 to the extent block if new_ext was 282 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
361 /* Insert new entry */ 361 /* Insert new entry */
362 if (new_ext->ee_len) { 362 if (new_ext->ee_len) {
363 o_start[i] = *new_ext; 363 o_start[i] = *new_ext;
364 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); 364 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
365 } 365 }
366 366
367 /* Insert end entry */ 367 /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
488 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
489 489
490 new_ext.ee_block = cpu_to_le32(*from); 490 new_ext.ee_block = cpu_to_le32(*from);
491 ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); 491 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
492 new_ext.ee_len = dext->ee_len; 492 new_ext.ee_len = dext->ee_len;
493 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 493 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
553 copy_extent_status(oext, &end_ext); 553 copy_extent_status(oext, &end_ext);
554 end_ext_alen = ext4_ext_get_actual_len(&end_ext); 554 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
555 ext4_ext_store_pblock(&end_ext, 555 ext4_ext_store_pblock(&end_ext,
556 (ext_pblock(o_end) + oext_alen - end_ext_alen)); 556 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
557 end_ext.ee_block = 557 end_ext.ee_block =
558 cpu_to_le32(le32_to_cpu(o_end->ee_block) + 558 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
559 oext_alen - end_ext_alen); 559 oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 /* When tmp_dext is too large, pick up the target range. */ 604 /* When tmp_dext is too large, pick up the target range. */
605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
606 606
607 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); 607 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
608 tmp_dext->ee_block = 608 tmp_dext->ee_block =
609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
613 tmp_dext->ee_len = cpu_to_le16(max_count); 613 tmp_dext->ee_len = cpu_to_le16(max_count);
614 614
615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
616 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); 616 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
617 617
618 /* Adjust extent length if donor extent is larger than orig */ 618 /* Adjust extent length if donor extent is larger than orig */
619 if (ext4_ext_get_actual_len(tmp_dext) > 619 if (ext4_ext_get_actual_len(tmp_dext) >
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..5485390d32c5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
581 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
582 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
584 if (!ext4_check_dir_entry(dir, de, bh, 584 if (ext4_check_dir_entry(dir, NULL, de, bh,
585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
586 +((char *)de - bh->b_data))) { 586 + ((char *)de - bh->b_data))) {
587 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
588 dir_file->f_pos = (dir_file->f_pos | 588 dir_file->f_pos = (dir_file->f_pos |
589 (dir->i_sb->s_blocksize - 1)) + 1; 589 (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
820 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
821 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
822 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
823 if (!ext4_check_dir_entry(dir, de, bh, offset)) 823 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
824 return -1; 824 return -1;
825 *res_dir = de; 825 *res_dir = de;
826 return 1; 826 return 1;
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
856 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 856 struct buffer_head *bh_use[NAMEI_RA_SIZE];
857 struct buffer_head *bh, *ret = NULL; 857 struct buffer_head *bh, *ret = NULL;
858 ext4_lblk_t start, block, b; 858 ext4_lblk_t start, block, b;
859 const u8 *name = d_name->name;
859 int ra_max = 0; /* Number of bh's in the readahead 860 int ra_max = 0; /* Number of bh's in the readahead
860 buffer, bh_use[] */ 861 buffer, bh_use[] */
861 int ra_ptr = 0; /* Current index into readahead 862 int ra_ptr = 0; /* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
870 namelen = d_name->len; 871 namelen = d_name->len;
871 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
872 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '\0')) {
876 /*
877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS
879 */
880 block = start = 0;
881 nblocks = 1;
882 goto restart;
883 }
873 if (is_dx(dir)) { 884 if (is_dx(dir)) {
874 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 885 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
875 /* 886 /*
@@ -960,55 +971,35 @@ cleanup_and_exit:
960static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 971static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
961 struct ext4_dir_entry_2 **res_dir, int *err) 972 struct ext4_dir_entry_2 **res_dir, int *err)
962{ 973{
963 struct super_block * sb; 974 struct super_block * sb = dir->i_sb;
964 struct dx_hash_info hinfo; 975 struct dx_hash_info hinfo;
965 u32 hash;
966 struct dx_frame frames[2], *frame; 976 struct dx_frame frames[2], *frame;
967 struct ext4_dir_entry_2 *de, *top;
968 struct buffer_head *bh; 977 struct buffer_head *bh;
969 ext4_lblk_t block; 978 ext4_lblk_t block;
970 int retval; 979 int retval;
971 int namelen = d_name->len;
972 const u8 *name = d_name->name;
973 980
974 sb = dir->i_sb; 981 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
975 /* NFS may look up ".." - look at dx_root directory block */ 982 return NULL;
976 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
977 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
978 return NULL;
979 } else {
980 frame = frames;
981 frame->bh = NULL; /* for dx_release() */
982 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
983 dx_set_block(frame->at, 0); /* dx_root block is 0 */
984 }
985 hash = hinfo.hash;
986 do { 983 do {
987 block = dx_get_block(frame->at); 984 block = dx_get_block(frame->at);
988 if (!(bh = ext4_bread (NULL,dir, block, 0, err))) 985 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
989 goto errout; 986 goto errout;
990 de = (struct ext4_dir_entry_2 *) bh->b_data;
991 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
992 EXT4_DIR_REC_LEN(0));
993 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
995 + ((char *) de - bh->b_data);
996
997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
998 brelse(bh);
999 *err = ERR_BAD_DX_DIR;
1000 goto errout;
1001 }
1002 987
1003 if (ext4_match(namelen, name, de)) { 988 retval = search_dirblock(bh, dir, d_name,
1004 *res_dir = de; 989 block << EXT4_BLOCK_SIZE_BITS(sb),
1005 dx_release(frames); 990 res_dir);
1006 return bh; 991 if (retval == 1) { /* Success! */
1007 } 992 dx_release(frames);
993 return bh;
1008 } 994 }
1009 brelse(bh); 995 brelse(bh);
996 if (retval == -1) {
997 *err = ERR_BAD_DX_DIR;
998 goto errout;
999 }
1000
1010 /* Check to see if we should continue to search */ 1001 /* Check to see if we should continue to search */
1011 retval = ext4_htree_next_block(dir, hash, frame, 1002 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1012 frames, NULL); 1003 frames, NULL);
1013 if (retval < 0) { 1004 if (retval < 0) {
1014 ext4_warning(sb, 1005 ext4_warning(sb,
@@ -1045,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1045 return ERR_PTR(-EIO); 1036 return ERR_PTR(-EIO);
1046 } 1037 }
1047 inode = ext4_iget(dir->i_sb, ino); 1038 inode = ext4_iget(dir->i_sb, ino);
1048 if (unlikely(IS_ERR(inode))) { 1039 if (IS_ERR(inode)) {
1049 if (PTR_ERR(inode) == -ESTALE) { 1040 if (PTR_ERR(inode) == -ESTALE) {
1050 EXT4_ERROR_INODE(dir, 1041 EXT4_ERROR_INODE(dir,
1051 "deleted inode referenced: %u", 1042 "deleted inode referenced: %u",
@@ -1278,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1278 de = (struct ext4_dir_entry_2 *)bh->b_data; 1269 de = (struct ext4_dir_entry_2 *)bh->b_data;
1279 top = bh->b_data + blocksize - reclen; 1270 top = bh->b_data + blocksize - reclen;
1280 while ((char *) de <= top) { 1271 while ((char *) de <= top) {
1281 if (!ext4_check_dir_entry(dir, de, bh, offset)) 1272 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1282 return -EIO; 1273 return -EIO;
1283 if (ext4_match(namelen, name, de)) 1274 if (ext4_match(namelen, name, de))
1284 return -EEXIST; 1275 return -EEXIST;
@@ -1611,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1611 if (err) 1602 if (err)
1612 goto journal_error; 1603 goto journal_error;
1613 } 1604 }
1614 ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1605 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1606 if (err) {
1607 ext4_std_error(inode->i_sb, err);
1608 goto cleanup;
1609 }
1615 } 1610 }
1616 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1611 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1617 if (!de) 1612 if (!de)
@@ -1639,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
1639{ 1634{
1640 struct ext4_dir_entry_2 *de, *pde; 1635 struct ext4_dir_entry_2 *de, *pde;
1641 unsigned int blocksize = dir->i_sb->s_blocksize; 1636 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1637 int i, err;
1643 1638
1644 i = 0; 1639 i = 0;
1645 pde = NULL; 1640 pde = NULL;
1646 de = (struct ext4_dir_entry_2 *) bh->b_data; 1641 de = (struct ext4_dir_entry_2 *) bh->b_data;
1647 while (i < bh->b_size) { 1642 while (i < bh->b_size) {
1648 if (!ext4_check_dir_entry(dir, de, bh, i)) 1643 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
1649 return -EIO; 1644 return -EIO;
1650 if (de == de_del) { 1645 if (de == de_del) {
1651 BUFFER_TRACE(bh, "get_write_access"); 1646 BUFFER_TRACE(bh, "get_write_access");
1652 ext4_journal_get_write_access(handle, bh); 1647 err = ext4_journal_get_write_access(handle, bh);
1648 if (unlikely(err)) {
1649 ext4_std_error(dir->i_sb, err);
1650 return err;
1651 }
1653 if (pde) 1652 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1653 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len, 1654 ext4_rec_len_from_disk(pde->rec_len,
@@ -1661,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
1661 de->inode = 0; 1660 de->inode = 0;
1662 dir->i_version++; 1661 dir->i_version++;
1663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1662 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1664 ext4_handle_dirty_metadata(handle, dir, bh); 1663 err = ext4_handle_dirty_metadata(handle, dir, bh);
1664 if (unlikely(err)) {
1665 ext4_std_error(dir->i_sb, err);
1666 return err;
1667 }
1665 return 0; 1668 return 0;
1666 } 1669 }
1667 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 1670 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1798,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1798{ 1801{
1799 handle_t *handle; 1802 handle_t *handle;
1800 struct inode *inode; 1803 struct inode *inode;
1801 struct buffer_head *dir_block; 1804 struct buffer_head *dir_block = NULL;
1802 struct ext4_dir_entry_2 *de; 1805 struct ext4_dir_entry_2 *de;
1803 unsigned int blocksize = dir->i_sb->s_blocksize; 1806 unsigned int blocksize = dir->i_sb->s_blocksize;
1804 int err, retries = 0; 1807 int err, retries = 0;
@@ -1831,7 +1834,9 @@ retry:
1831 if (!dir_block) 1834 if (!dir_block)
1832 goto out_clear_inode; 1835 goto out_clear_inode;
1833 BUFFER_TRACE(dir_block, "get_write_access"); 1836 BUFFER_TRACE(dir_block, "get_write_access");
1834 ext4_journal_get_write_access(handle, dir_block); 1837 err = ext4_journal_get_write_access(handle, dir_block);
1838 if (err)
1839 goto out_clear_inode;
1835 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1840 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1836 de->inode = cpu_to_le32(inode->i_ino); 1841 de->inode = cpu_to_le32(inode->i_ino);
1837 de->name_len = 1; 1842 de->name_len = 1;
@@ -1848,10 +1853,12 @@ retry:
1848 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1853 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1849 inode->i_nlink = 2; 1854 inode->i_nlink = 2;
1850 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1855 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1851 ext4_handle_dirty_metadata(handle, dir, dir_block); 1856 err = ext4_handle_dirty_metadata(handle, dir, dir_block);
1852 brelse(dir_block); 1857 if (err)
1853 ext4_mark_inode_dirty(handle, inode); 1858 goto out_clear_inode;
1854 err = ext4_add_entry(handle, dentry, inode); 1859 err = ext4_mark_inode_dirty(handle, inode);
1860 if (!err)
1861 err = ext4_add_entry(handle, dentry, inode);
1855 if (err) { 1862 if (err) {
1856out_clear_inode: 1863out_clear_inode:
1857 clear_nlink(inode); 1864 clear_nlink(inode);
@@ -1862,10 +1869,13 @@ out_clear_inode:
1862 } 1869 }
1863 ext4_inc_count(handle, dir); 1870 ext4_inc_count(handle, dir);
1864 ext4_update_dx_flag(dir); 1871 ext4_update_dx_flag(dir);
1865 ext4_mark_inode_dirty(handle, dir); 1872 err = ext4_mark_inode_dirty(handle, dir);
1873 if (err)
1874 goto out_clear_inode;
1866 d_instantiate(dentry, inode); 1875 d_instantiate(dentry, inode);
1867 unlock_new_inode(inode); 1876 unlock_new_inode(inode);
1868out_stop: 1877out_stop:
1878 brelse(dir_block);
1869 ext4_journal_stop(handle); 1879 ext4_journal_stop(handle);
1870 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1880 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1871 goto retry; 1881 goto retry;
@@ -1928,7 +1938,7 @@ static int empty_dir(struct inode *inode)
1928 } 1938 }
1929 de = (struct ext4_dir_entry_2 *) bh->b_data; 1939 de = (struct ext4_dir_entry_2 *) bh->b_data;
1930 } 1940 }
1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1941 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1932 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1942 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1933 sb->s_blocksize); 1943 sb->s_blocksize);
1934 offset = (offset | (sb->s_blocksize - 1)) + 1; 1944 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2312,7 +2322,7 @@ retry:
2312 2322
2313 inode->i_ctime = ext4_current_time(inode); 2323 inode->i_ctime = ext4_current_time(inode);
2314 ext4_inc_count(handle, inode); 2324 ext4_inc_count(handle, inode);
2315 atomic_inc(&inode->i_count); 2325 ihold(inode);
2316 2326
2317 err = ext4_add_entry(handle, dentry, inode); 2327 err = ext4_add_entry(handle, dentry, inode);
2318 if (!err) { 2328 if (!err) {
@@ -2416,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2416 ext4_current_time(new_dir); 2426 ext4_current_time(new_dir);
2417 ext4_mark_inode_dirty(handle, new_dir); 2427 ext4_mark_inode_dirty(handle, new_dir);
2418 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2428 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2419 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2429 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2430 if (unlikely(retval)) {
2431 ext4_std_error(new_dir->i_sb, retval);
2432 goto end_rename;
2433 }
2420 brelse(new_bh); 2434 brelse(new_bh);
2421 new_bh = NULL; 2435 new_bh = NULL;
2422 } 2436 }
@@ -2468,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2468 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2469 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2470 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2471 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2485 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2486 if (retval) {
2487 ext4_std_error(old_dir->i_sb, retval);
2488 goto end_rename;
2489 }
2472 ext4_dec_count(handle, old_dir); 2490 ext4_dec_count(handle, old_dir);
2473 if (new_inode) { 2491 if (new_inode) {
2474 /* checked empty_dir above, can't have another parent, 2492 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..7270dcfca92a
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,428 @@
1/*
2 * linux/fs/ext4/page-io.c
3 *
4 * This contains the new page_io functions for ext4
5 *
6 * Written by Theodore Ts'o, 2010.
7 */
8
9#include <linux/module.h>
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/jbd2.h>
13#include <linux/highuid.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/string.h>
17#include <linux/buffer_head.h>
18#include <linux/writeback.h>
19#include <linux/pagevec.h>
20#include <linux/mpage.h>
21#include <linux/namei.h>
22#include <linux/uio.h>
23#include <linux/bio.h>
24#include <linux/workqueue.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27
28#include "ext4_jbd2.h"
29#include "xattr.h"
30#include "acl.h"
31#include "ext4_extents.h"
32
33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void)
40{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL)
45 return -ENOMEM;
46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
47 if (io_end_cachep == NULL) {
48 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM;
50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0;
55}
56
57void ext4_exit_pageio(void)
58{
59 kmem_cache_destroy(io_end_cachep);
60 kmem_cache_destroy(io_page_cachep);
61}
62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
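
The hashed wait queue above trades a per-inode wait queue for a fixed table of 37 slots: to_ioend_wq() maps the inode pointer to a slot, and a hash collision costs at most a spurious wakeup, which wait_event() absorbs by re-testing its condition. A runnable user-space sketch of the hashing idea (slot_of() is an illustrative name, not an ext4 symbol):

#include <stdio.h>

#define WQ_HASH_SZ 37	/* prime, as above, to spread pointer values */

/* Map an object's address onto one of WQ_HASH_SZ shared slots. */
static unsigned int slot_of(const void *obj)
{
	return (unsigned long)obj % WQ_HASH_SZ;
}

int main(void)
{
	int a, b;

	printf("object at %p -> wait-queue slot %u\n", (void *)&a, slot_of(&a));
	printf("object at %p -> wait-queue slot %u\n", (void *)&b, slot_of(&b));
	return 0;
}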
69
70static void put_io_page(struct ext4_io_page *io_page)
71{
72 if (atomic_dec_and_test(&io_page->p_count)) {
73 end_page_writeback(io_page->p_page);
74 put_page(io_page->p_page);
75 kmem_cache_free(io_page_cachep, io_page);
76 }
77}
78
79void ext4_free_io_end(ext4_io_end_t *io)
80{
81 int i;
82 wait_queue_head_t *wq;
83
84 BUG_ON(!io);
85 if (io->page)
86 put_page(io->page);
87 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]);
89 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq))
93 wake_up_all(wq);
94 kmem_cache_free(io_end_cachep, io);
95}
96
97/*
98 * check a range of space and convert unwritten extents to written.
99 */
100int ext4_end_io_nolock(ext4_io_end_t *io)
101{
102 struct inode *inode = io->inode;
103 loff_t offset = io->offset;
104 ssize_t size = io->size;
105 int ret = 0;
106
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu, list->next 0x%p,"
108 "list->prev 0x%p\n",
109 io, inode->i_ino, io->list.next, io->list.prev);
110
111 if (list_empty(&io->list))
112 return ret;
113
114 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
115 return ret;
116
117 ret = ext4_convert_unwritten_extents(inode, offset, size);
118 if (ret < 0) {
119 printk(KERN_EMERG "%s: failed to convert unwritten "
120 "extents to written extents, error is %d "
121 "io is still on inode %lu aio dio list\n",
122 __func__, ret, inode->i_ino);
123 return ret;
124 }
125
126 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN;
130 return ret;
131}
132
133/*
134 * work on completed aio dio IO, to convert unwritten extents to written extents
135 */
136static void ext4_end_io_work(struct work_struct *work)
137{
138 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
139 struct inode *inode = io->inode;
140 struct ext4_inode_info *ei = EXT4_I(inode);
141 unsigned long flags;
142 int ret;
143
144 mutex_lock(&inode->i_mutex);
145 ret = ext4_end_io_nolock(io);
146 if (ret < 0) {
147 mutex_unlock(&inode->i_mutex);
148 return;
149 }
150
151 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
152 if (!list_empty(&io->list))
153 list_del_init(&io->list);
154 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
155 mutex_unlock(&inode->i_mutex);
156 ext4_free_io_end(io);
157}
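
ext4_end_io_nolock() takes i_mutex and may start journal operations, neither of which is permissible from bio completion context, so ext4_end_bio() below only queues the io_end and this work function finishes the conversion in process context. A minimal sketch of that defer-to-workqueue pattern, assuming nothing beyond the core workqueue API (the demo_* names are illustrative):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

/* Runs later in process context, like ext4_end_io_work() above. */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("deferred completion work running\n");
}

static int __init demo_init(void)
{
	demo_wq = create_workqueue("demo-io-end");
	if (!demo_wq)
		return -ENOMEM;
	INIT_WORK(&demo_work, demo_work_fn);
	/* An interrupt-context completion handler would stop here ... */
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_workqueue(demo_wq);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");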
158
159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
160{
161 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
162 if (io) {
163 atomic_inc(&EXT4_I(inode)->i_ioend_count);
164 io->inode = inode;
165 INIT_WORK(&io->work, ext4_end_io_work);
166 INIT_LIST_HEAD(&io->list);
167 }
168 return io;
169}
170
171/*
172 * Print a buffer I/O error compatible with the one in fs/buffer.c. This
173 * provides compatibility with dmesg scrapers that look for a specific
174 * buffer I/O error message. We really need a unified error reporting
175 * structure to userspace a la Digital Unix's uerf system, but it's
176 * probably not going to happen in my lifetime, due to LKML politics...
177 */
178static void buffer_io_error(struct buffer_head *bh)
179{
180 char b[BDEVNAME_SIZE];
181 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
182 bdevname(bh->b_bdev, b),
183 (unsigned long long)bh->b_blocknr);
184}
185
186static void ext4_end_bio(struct bio *bio, int error)
187{
188 ext4_io_end_t *io_end = bio->bi_private;
189 struct workqueue_struct *wq;
190 struct inode *inode;
191 unsigned long flags;
192 int i; sector_t bi_sector = bio->bi_sector; /* sample before bio_put() */
193
194 BUG_ON(!io_end);
195 bio->bi_private = NULL;
196 bio->bi_end_io = NULL;
197 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
198 error = 0;
199 bio_put(bio);
200
201 for (i = 0; i < io_end->num_io_pages; i++) {
202 struct page *page = io_end->pages[i]->p_page;
203 struct buffer_head *bh, *head;
204 int partial_write = 0;
205
206 head = page_buffers(page);
207 if (error)
208 SetPageError(page);
209 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE)
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size;
215
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head;
218 do {
219 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) {
221 if (error)
222 buffer_io_error(bh);
223
224 clear_buffer_dirty(bh);
225 }
226 if (buffer_delay(bh))
227 partial_write = 1;
228 else if (!buffer_mapped(bh))
229 clear_buffer_dirty(bh);
230 else if (buffer_dirty(bh))
231 partial_write = 1;
232 offset += bh->b_size;
233 bh = bh->b_this_page;
234 } while (bh != head);
235 }
236
237 /*
238 * If this is a partial write which happened to make
239 * all buffers uptodate then we can optimize away a
240 * bogus readpage() for the next read(). Here we
241 * 'discover' whether the page went uptodate as a
242 * result of this (potentially partial) write.
243 */
244 if (!partial_write)
245 SetPageUptodate(page);
246
247 put_io_page(io_end->pages[i]);
248 }
249 io_end->num_io_pages = 0;
250 inode = io_end->inode;
251
252 if (error) {
253 io_end->flag |= EXT4_IO_END_ERROR;
254 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
255 "(offset %llu size %ld starting block %llu)",
256 inode->i_ino,
257 (unsigned long long) io_end->offset,
258 (long) io_end->size,
259 (unsigned long long)
260 bi_sector >> (inode->i_blkbits - 9));
261 }
262
263 /* Add the io_end to per-inode completed io list */
264 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
265 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
266 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
267
268 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
269 /* queue the work to convert unwritten extents to written */
270 queue_work(wq, &io_end->work);
271}
272
273void ext4_io_submit(struct ext4_io_submit *io)
274{
275 struct bio *bio = io->io_bio;
276
277 if (bio) {
278 bio_get(io->io_bio);
279 submit_bio(io->io_op, io->io_bio);
280 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
281 bio_put(io->io_bio);
282 }
283 io->io_bio = NULL;
284 io->io_op = 0;
285 io->io_end = NULL;
286}
287
288static int io_submit_init(struct ext4_io_submit *io,
289 struct inode *inode,
290 struct writeback_control *wbc,
291 struct buffer_head *bh)
292{
293 ext4_io_end_t *io_end;
294 struct page *page = bh->b_page;
295 int nvecs = bio_get_nr_vecs(bh->b_bdev);
296 struct bio *bio;
297
298 io_end = ext4_init_io_end(inode, GFP_NOFS);
299 if (!io_end)
300 return -ENOMEM;
301 do {
302 bio = bio_alloc(GFP_NOIO, nvecs);
303 nvecs >>= 1;
304 } while (bio == NULL);
305
306 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
307 bio->bi_bdev = bh->b_bdev;
308 bio->bi_private = io->io_end = io_end;
309 bio->bi_end_io = ext4_end_bio;
310
311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
312
313 io->io_bio = bio;
314 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
315 WRITE_SYNC_PLUG : WRITE);
316 io->io_next_block = bh->b_blocknr;
317 return 0;
318}
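
The allocation loop in io_submit_init() asks for the device's maximum vector count and halves the request after each failure; with a blocking GFP mask, small bio allocations come from a mempool and are expected to succeed, so the loop terminates. The idiom isolated as a helper (alloc_bio_shrinking() is an illustrative name):

#include <linux/bio.h>

static struct bio *alloc_bio_shrinking(struct block_device *bdev)
{
	int nvecs = bio_get_nr_vecs(bdev);
	struct bio *bio;

	do {
		/* Halve the vector count until the allocator cooperates. */
		bio = bio_alloc(GFP_NOIO, nvecs);
		nvecs >>= 1;
	} while (bio == NULL);
	return bio;
}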
319
320static int io_submit_add_bh(struct ext4_io_submit *io,
321 struct ext4_io_page *io_page,
322 struct inode *inode,
323 struct writeback_control *wbc,
324 struct buffer_head *bh)
325{
326 ext4_io_end_t *io_end;
327 int ret;
328
329 if (buffer_new(bh)) {
330 clear_buffer_new(bh);
331 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
332 }
333
334 if (!buffer_mapped(bh) || buffer_delay(bh)) {
335 if (!buffer_mapped(bh))
336 clear_buffer_dirty(bh);
337 if (io->io_bio)
338 ext4_io_submit(io);
339 return 0;
340 }
341
342 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
343submit_and_retry:
344 ext4_io_submit(io);
345 }
346 if (io->io_bio == NULL) {
347 ret = io_submit_init(io, inode, wbc, bh);
348 if (ret)
349 return ret;
350 }
351 io_end = io->io_end;
352 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
353 (io_end->pages[io_end->num_io_pages-1] != io_page))
354 goto submit_and_retry;
355 if (buffer_uninit(bh))
356 io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
357 io->io_end->size += bh->b_size;
358 io->io_next_block++;
359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
360 if (ret != bh->b_size)
361 goto submit_and_retry;
362 if ((io_end->num_io_pages == 0) ||
363 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
364 io_end->pages[io_end->num_io_pages++] = io_page;
365 atomic_inc(&io_page->p_count);
366 }
367 return 0;
368}
369
370int ext4_bio_write_page(struct ext4_io_submit *io,
371 struct page *page,
372 int len,
373 struct writeback_control *wbc)
374{
375 struct inode *inode = page->mapping->host;
376 unsigned block_start, block_end, blocksize;
377 struct ext4_io_page *io_page;
378 struct buffer_head *bh, *head;
379 int ret = 0;
380
381 blocksize = 1 << inode->i_blkbits;
382
383 BUG_ON(PageWriteback(page));
384 set_page_writeback(page);
385 ClearPageError(page);
386
387 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
388 if (!io_page) {
389 set_page_dirty(page);
390 unlock_page(page);
391 return -ENOMEM;
392 }
393 io_page->p_page = page;
394 atomic_set(&io_page->p_count, 1);
395 get_page(page);
396
397 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) {
400 block_end = block_start + blocksize;
401 if (block_start >= len) {
402 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh);
404 continue;
405 }
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) {
408 /*
409 * We only get here on ENOMEM. Not much else
410 * we can do but mark the page as dirty, and hope for
411 * better luck next time.
412 */
413 set_page_dirty(page);
414 break;
415 }
416 }
417 unlock_page(page);
418 /*
419 * If the page was truncated before we could do the writeback,
420 * or we had a memory allocation error while trying to write
421 * the first buffer head, we won't have submitted any pages for
422 * I/O. In that case we need to make sure we've cleared the
423 * PageWriteback bit from the page to prevent the system from
424 * wedging later on.
425 */
426 put_io_page(io_page);
427 return ret;
428}
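
Taken together, the file gives the writeback path a three-step calling convention: zero an ext4_io_submit on the stack, feed it locked dirty pages, then flush whatever bio is still under construction. A hedged caller-side sketch; for_each_locked_dirty_page() is a hypothetical stand-in for the real write_cache_pages()-style loop, and mapping/wbc are assumed context:

struct ext4_io_submit io;
struct page *page;
int err = 0;

memset(&io, 0, sizeof(io));
for_each_locked_dirty_page(mapping, page) {	/* hypothetical iterator */
	/* Queues the page's buffers; submits the bio itself when it
	 * fills up or goes discontiguous, and unlocks the page. */
	err = ext4_bio_write_page(&io, page, PAGE_CACHE_SIZE, wbc);
	if (err)
		break;
}
ext4_io_submit(&io);	/* flush the partially built bio, if any */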
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..3ecc6e45d2f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,29 +220,25 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
227 231
228 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
229 for (i = 0, bit = gdblocks + 1, block = start + bit; 233 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
230 i < reserved_gdb; i++, block++, bit++) { 234 block, sbi->s_itb_per_group);
231 struct buffer_head *gdb; 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
232 236 GFP_NOFS);
233 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); 237 if (err)
234 238 goto exit_bh;
235 if ((err = extend_or_restart_transaction(handle, 1, bh))) 239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
236 goto exit_bh;
237
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239 err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_handle_dirty_metadata(handle, NULL, gdb);
243 ext4_set_bit(bit, bh->b_data); 240 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb); 241
245 }
246 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
247 input->block_bitmap - start); 243 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data); 244 ext4_set_bit(input->block_bitmap - start, bh->b_data);
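
The per-block bclean() loop above (and the inode-table loop in the next hunk) collapses into a single sb_issue_zeroout() call, which hands one zeroing request to the block layer instead of journalling each buffer; the blocks belong to a group that is not yet live, so bypassing the journal is safe. The shape of the call, mirroring the hunks (start/count stand for gdblocks + start + 1/reserved_gdb here, and input->inode_table/sbi->s_itb_per_group below):

/* Zero `count' blocks starting at `start' in one request; no journal
 * credits are needed since nothing references these blocks yet. */
err = sb_issue_zeroout(sb, start, count, GFP_NOFS);
if (err)
	goto exit_bh;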
@@ -251,29 +247,26 @@ static int setup_new_group_blocks(struct super_block *sb,
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 247 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252 248
253 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start; 250 block = input->inode_table;
255 i < sbi->s_itb_per_group; i++, bit++, block++) { 251 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
256 struct buffer_head *it; 252 block, sbi->s_itb_per_group);
257 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
258 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); 254 if (err)
259 255 goto exit_bh;
260 if ((err = extend_or_restart_transaction(handle, 1, bh))) 256 for (i = 0, bit = input->inode_table - start;
261 goto exit_bh; 257 i < sbi->s_itb_per_group; i++, bit++)
262
263 if (IS_ERR(it = bclean(handle, sb, block))) {
264 err = PTR_ERR(it);
265 goto exit_bh;
266 }
267 ext4_handle_dirty_metadata(handle, NULL, it);
268 brelse(it);
269 ext4_set_bit(bit, bh->b_data); 258 ext4_set_bit(bit, bh->b_data);
270 }
271 259
272 if ((err = extend_or_restart_transaction(handle, 2, bh))) 260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
273 goto exit_bh; 261 goto exit_bh;
274 262
275 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
276 ext4_handle_dirty_metadata(handle, NULL, bh); 264 bh->b_data);
265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
277 brelse(bh); 270 brelse(bh);
278 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
279 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -283,9 +276,11 @@ static int setup_new_group_blocks(struct super_block *sb,
283 goto exit_journal; 276 goto exit_journal;
284 } 277 }
285 278
286 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
287 bh->b_data); 280 bh->b_data);
288 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
289exit_bh: 284exit_bh:
290 brelse(bh); 285 brelse(bh);
291 286
@@ -437,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
437 goto exit_dind; 432 goto exit_dind;
438 } 433 }
439 434
440 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
441 goto exit_dind; 437 goto exit_dind;
442 438
443 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
444 goto exit_sbh; 441 goto exit_sbh;
445 442
446 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
447 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
448 446
449 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
450 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
451 goto exit_dindj; 450 goto exit_dindj;
452 451
453 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -469,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
469 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
470 */ 469 */
471 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
472 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
473 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
474 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
475 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
476 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
477 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
478 485
479 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
480 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -485,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
485 kfree(o_group_desc); 492 kfree(o_group_desc);
486 493
487 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
488 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
489 498
490 return 0; 499 return err;
491 500
492exit_inode: 501exit_inode:
493 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_journal_release_buffer(handle, iloc.bh); */
494 brelse(iloc.bh); 503 brelse(iloc.bh);
495exit_dindj: 504exit_dindj:
496 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_journal_release_buffer(handle, dind); */
497exit_primary:
498 /* ext4_journal_release_buffer(handle, *primary); */
499exit_sbh: 506exit_sbh:
500 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
501exit_dind: 508exit_dind:
502 brelse(dind); 509 brelse(dind);
503exit_bh: 510exit_bh:
@@ -680,7 +687,9 @@ static void update_backups(struct super_block *sb,
680 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
681 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
682 unlock_buffer(bh); 689 unlock_buffer(bh);
683 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
684 brelse(bh); 693 brelse(bh);
685 } 694 }
686 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -898,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
898 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
899 sbi->s_groups_count++; 908 sbi->s_groups_count++;
900 909
901 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
902 915
903 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
904 * active. */ 917 * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..48ce561fafac 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -41,6 +40,9 @@
41#include <linux/crc16.h> 40#include <linux/crc16.h>
42#include <asm/uaccess.h> 41#include <asm/uaccess.h>
43 42
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45
44#include "ext4.h" 46#include "ext4.h"
45#include "ext4_jbd2.h" 47#include "ext4_jbd2.h"
46#include "xattr.h" 48#include "xattr.h"
@@ -50,8 +52,11 @@
50#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 53#include <trace/events/ext4.h>
52 54
53struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat;
55 60
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 73static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 74static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb);
73 80
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 82static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE, 83 .owner = THIS_MODULE,
77 .name = "ext3", 84 .name = "ext3",
78 .get_sb = ext4_get_sb, 85 .mount = ext4_mount,
79 .kill_sb = kill_block_super, 86 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV, 87 .fs_flags = FS_REQUIRES_DEV,
81}; 88};
@@ -381,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
381void __ext4_error(struct super_block *sb, const char *function, 388void __ext4_error(struct super_block *sb, const char *function,
382 unsigned int line, const char *fmt, ...) 389 unsigned int line, const char *fmt, ...)
383{ 390{
391 struct va_format vaf;
384 va_list args; 392 va_list args;
385 393
386 va_start(args, fmt); 394 va_start(args, fmt);
387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 395 vaf.fmt = fmt;
388 sb->s_id, function, line, current->comm); 396 vaf.va = &args;
389 vprintk(fmt, args); 397 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
390 printk("\n"); 398 sb->s_id, function, line, current->comm, &vaf);
391 va_end(args); 399 va_end(args);
392 400
393 ext4_handle_error(sb); 401 ext4_handle_error(sb);
@@ -398,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
398 const char *fmt, ...) 406 const char *fmt, ...)
399{ 407{
400 va_list args; 408 va_list args;
409 struct va_format vaf;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 410 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
402 411
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 412 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block); 413 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line); 414 save_error_info(inode->i_sb, function, line);
406 va_start(args, fmt); 415 va_start(args, fmt);
416 vaf.fmt = fmt;
417 vaf.va = &args;
407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 418 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
408 inode->i_sb->s_id, function, line, inode->i_ino); 419 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block) 420 if (block)
410 printk("block %llu: ", block); 421 printk(KERN_CONT "block %llu: ", block);
411 printk("comm %s: ", current->comm); 422 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
412 vprintk(fmt, args);
413 printk("\n");
414 va_end(args); 423 va_end(args);
415 424
416 ext4_handle_error(inode->i_sb); 425 ext4_handle_error(inode->i_sb);
417} 426}
418 427
419void ext4_error_file(struct file *file, const char *function, 428void ext4_error_file(struct file *file, const char *function,
420 unsigned int line, const char *fmt, ...) 429 unsigned int line, ext4_fsblk_t block,
430 const char *fmt, ...)
421{ 431{
422 va_list args; 432 va_list args;
433 struct va_format vaf;
423 struct ext4_super_block *es; 434 struct ext4_super_block *es;
424 struct inode *inode = file->f_dentry->d_inode; 435 struct inode *inode = file->f_dentry->d_inode;
425 char pathname[80], *path; 436 char pathname[80], *path;
@@ -427,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
427 es = EXT4_SB(inode->i_sb)->s_es; 438 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 439 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line); 440 save_error_info(inode->i_sb, function, line);
430 va_start(args, fmt);
431 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 441 path = d_path(&(file->f_path), pathname, sizeof(pathname));
432 if (!path) 442 if (IS_ERR(path))
433 path = "(unknown)"; 443 path = "(unknown)";
434 printk(KERN_CRIT 444 printk(KERN_CRIT
435 "EXT4-fs error (device %s): %s:%d: inode #%lu " 445 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
436 "(comm %s path %s): ", 446 inode->i_sb->s_id, function, line, inode->i_ino);
437 inode->i_sb->s_id, function, line, inode->i_ino, 447 if (block)
438 current->comm, path); 448 printk(KERN_CONT "block %llu: ", block);
439 vprintk(fmt, args); 449 va_start(args, fmt);
440 printk("\n"); 450 vaf.fmt = fmt;
451 vaf.va = &args;
452 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
441 va_end(args); 453 va_end(args);
442 454
443 ext4_handle_error(inode->i_sb); 455 ext4_handle_error(inode->i_sb);
@@ -536,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
536 panic("EXT4-fs panic from previous error\n"); 548 panic("EXT4-fs panic from previous error\n");
537} 549}
538 550
539void ext4_msg (struct super_block * sb, const char *prefix, 551void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
540 const char *fmt, ...)
541{ 552{
553 struct va_format vaf;
542 va_list args; 554 va_list args;
543 555
544 va_start(args, fmt); 556 va_start(args, fmt);
545 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 557 vaf.fmt = fmt;
546 vprintk(fmt, args); 558 vaf.va = &args;
547 printk("\n"); 559 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
548 va_end(args); 560 va_end(args);
549} 561}
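
ext4_msg() and the error/warning helpers around it now route their varargs through struct va_format and printk's %pV extension, so the device prefix and the caller's message reach the log in one atomic printk(); the old vprintk()-plus-printk("\n") sequence could interleave with output from other CPUs. The reusable shape of the pattern (demo_warn() is an illustrative name):

#include <linux/kernel.h>

static void demo_warn(const char *tag, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands vaf.fmt against vaf.va inside this single printk() */
	printk(KERN_WARNING "demo (%s): %pV\n", tag, &vaf);
	va_end(args);
}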
550 562
551void __ext4_warning(struct super_block *sb, const char *function, 563void __ext4_warning(struct super_block *sb, const char *function,
552 unsigned int line, const char *fmt, ...) 564 unsigned int line, const char *fmt, ...)
553{ 565{
566 struct va_format vaf;
554 va_list args; 567 va_list args;
555 568
556 va_start(args, fmt); 569 va_start(args, fmt);
557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 570 vaf.fmt = fmt;
558 sb->s_id, function, line); 571 vaf.va = &args;
559 vprintk(fmt, args); 572 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
560 printk("\n"); 573 sb->s_id, function, line, &vaf);
561 va_end(args); 574 va_end(args);
562} 575}
563 576
@@ -568,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
568__releases(bitlock) 581__releases(bitlock)
569__acquires(bitlock) 582__acquires(bitlock)
570{ 583{
584 struct va_format vaf;
571 va_list args; 585 va_list args;
572 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 586 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
573 587
574 es->s_last_error_ino = cpu_to_le32(ino); 588 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block); 589 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line); 590 __save_error_info(sb, function, line);
591
577 va_start(args, fmt); 592 va_start(args, fmt);
593
594 vaf.fmt = fmt;
595 vaf.va = &args;
578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
579 sb->s_id, function, line, grp); 597 sb->s_id, function, line, grp);
580 if (ino) 598 if (ino)
581 printk("inode %lu: ", ino); 599 printk(KERN_CONT "inode %lu: ", ino);
582 if (block) 600 if (block)
583 printk("block %llu:", (unsigned long long) block); 601 printk(KERN_CONT "block %llu:", (unsigned long long) block);
584 vprintk(fmt, args); 602 printk(KERN_CONT "%pV\n", &vaf);
585 printk("\n");
586 va_end(args); 603 va_end(args);
587 604
588 if (test_opt(sb, ERRORS_CONT)) { 605 if (test_opt(sb, ERRORS_CONT)) {
@@ -640,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
640 struct block_device *bdev; 657 struct block_device *bdev;
641 char b[BDEVNAME_SIZE]; 658 char b[BDEVNAME_SIZE];
642 659
643 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 660 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
644 if (IS_ERR(bdev)) 661 if (IS_ERR(bdev))
645 goto fail; 662 goto fail;
646 return bdev; 663 return bdev;
@@ -656,8 +673,7 @@ fail:
656 */ 673 */
657static int ext4_blkdev_put(struct block_device *bdev) 674static int ext4_blkdev_put(struct block_device *bdev)
658{ 675{
659 bd_release(bdev); 676 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
660 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
661} 677}
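
The journal-device open switches from open_by_devnum() with a separate bd_release() to blkdev_get_by_dev() with FMODE_EXCL, which folds the exclusive "holder" claim into the open itself; the same mode mask must be handed back to blkdev_put(). Usage sketch (the holder may be any stable pointer, and the superblock is the natural choice here):

struct block_device *bdev;

bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sb);
if (IS_ERR(bdev))
	return PTR_ERR(bdev);
/* ... use the device; `sb' holds the exclusive claim ... */
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);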
662 678
663static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 679static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -702,13 +718,13 @@ static void ext4_put_super(struct super_block *sb)
702 struct ext4_super_block *es = sbi->s_es; 718 struct ext4_super_block *es = sbi->s_es;
703 int i, err; 719 int i, err;
704 720
721 ext4_unregister_li_request(sb);
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 722 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706 723
707 flush_workqueue(sbi->dio_unwritten_wq); 724 flush_workqueue(sbi->dio_unwritten_wq);
708 destroy_workqueue(sbi->dio_unwritten_wq); 725 destroy_workqueue(sbi->dio_unwritten_wq);
709 726
710 lock_super(sb); 727 lock_super(sb);
711 lock_kernel();
712 if (sb->s_dirt) 728 if (sb->s_dirt)
713 ext4_commit_super(sb, 1); 729 ext4_commit_super(sb, 1);
714 730
@@ -719,6 +735,7 @@ static void ext4_put_super(struct super_block *sb)
719 ext4_abort(sb, "Couldn't clean up the journal"); 735 ext4_abort(sb, "Couldn't clean up the journal");
720 } 736 }
721 737
738 del_timer(&sbi->s_err_report);
722 ext4_release_system_zone(sb); 739 ext4_release_system_zone(sb);
723 ext4_mb_release(sb); 740 ext4_mb_release(sb);
724 ext4_ext_release(sb); 741 ext4_ext_release(sb);
@@ -775,7 +792,6 @@ static void ext4_put_super(struct super_block *sb)
775 * Now that we are completely done shutting down the 792 * Now that we are completely done shutting down the
776 * superblock, we need to actually destroy the kobject. 793 * superblock, we need to actually destroy the kobject.
777 */ 794 */
778 unlock_kernel();
779 unlock_super(sb); 795 unlock_super(sb);
780 kobject_put(&sbi->s_kobj); 796 kobject_put(&sbi->s_kobj);
781 wait_for_completion(&sbi->s_kobj_unregister); 797 wait_for_completion(&sbi->s_kobj_unregister);
@@ -801,32 +817,43 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
801 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 817 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
802 INIT_LIST_HEAD(&ei->i_prealloc_list); 818 INIT_LIST_HEAD(&ei->i_prealloc_list);
803 spin_lock_init(&ei->i_prealloc_lock); 819 spin_lock_init(&ei->i_prealloc_lock);
804 /*
805 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
806 * therefore it can be null here. Don't check it, just initialize
807 * jinode.
808 */
809 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
810 ei->i_reserved_data_blocks = 0; 820 ei->i_reserved_data_blocks = 0;
811 ei->i_reserved_meta_blocks = 0; 821 ei->i_reserved_meta_blocks = 0;
812 ei->i_allocated_meta_blocks = 0; 822 ei->i_allocated_meta_blocks = 0;
813 ei->i_da_metadata_calc_len = 0; 823 ei->i_da_metadata_calc_len = 0;
814 ei->i_delalloc_reserved_flag = 0;
815 spin_lock_init(&(ei->i_block_reservation_lock)); 824 spin_lock_init(&(ei->i_block_reservation_lock));
816#ifdef CONFIG_QUOTA 825#ifdef CONFIG_QUOTA
817 ei->i_reserved_quota = 0; 826 ei->i_reserved_quota = 0;
818#endif 827#endif
828 ei->jinode = NULL;
819 INIT_LIST_HEAD(&ei->i_completed_io_list); 829 INIT_LIST_HEAD(&ei->i_completed_io_list);
820 spin_lock_init(&ei->i_completed_io_lock); 830 spin_lock_init(&ei->i_completed_io_lock);
821 ei->cur_aio_dio = NULL; 831 ei->cur_aio_dio = NULL;
822 ei->i_sync_tid = 0; 832 ei->i_sync_tid = 0;
823 ei->i_datasync_tid = 0; 833 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0);
824 835
825 return &ei->vfs_inode; 836 return &ei->vfs_inode;
826} 837}
827 838
839static int ext4_drop_inode(struct inode *inode)
840{
841 int drop = generic_drop_inode(inode);
842
843 trace_ext4_drop_inode(inode, drop);
844 return drop;
845}
846
847static void ext4_i_callback(struct rcu_head *head)
848{
849 struct inode *inode = container_of(head, struct inode, i_rcu);
850 INIT_LIST_HEAD(&inode->i_dentry);
851 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
852}
853
828static void ext4_destroy_inode(struct inode *inode) 854static void ext4_destroy_inode(struct inode *inode)
829{ 855{
856 ext4_ioend_wait(inode);
830 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 857 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
831 ext4_msg(inode->i_sb, KERN_ERR, 858 ext4_msg(inode->i_sb, KERN_ERR,
832 "Inode %lu (%p): orphan list check failed!", 859 "Inode %lu (%p): orphan list check failed!",
@@ -836,7 +863,7 @@ static void ext4_destroy_inode(struct inode *inode)
836 true); 863 true);
837 dump_stack(); 864 dump_stack();
838 } 865 }
839 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 866 call_rcu(&inode->i_rcu, ext4_i_callback);
840} 867}
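
Freeing the in-memory inode via call_rcu() (through ext4_i_callback() above) pairs with the VFS's lockless path walking: a walker may still dereference the inode after the last reference is dropped, so the memory has to survive one RCU grace period. The generic shape of the pattern, with an illustrative struct:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_head rcu;
	int payload;
};

static void demo_obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_obj, rcu));
}

static void demo_obj_release(struct demo_obj *obj)
{
	/* Readers inside rcu_read_lock() keep seeing valid memory
	 * until a grace period has elapsed. */
	call_rcu(&obj->rcu, demo_obj_free_rcu);
}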
841 868
842static void init_once(void *foo) 869static void init_once(void *foo)
@@ -874,9 +901,12 @@ void ext4_clear_inode(struct inode *inode)
874 end_writeback(inode); 901 end_writeback(inode);
875 dquot_drop(inode); 902 dquot_drop(inode);
876 ext4_discard_preallocations(inode); 903 ext4_discard_preallocations(inode);
877 if (EXT4_JOURNAL(inode)) 904 if (EXT4_I(inode)->jinode) {
878 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 905 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
879 &EXT4_I(inode)->jinode); 906 EXT4_I(inode)->jinode);
907 jbd2_free_inode(EXT4_I(inode)->jinode);
908 EXT4_I(inode)->jinode = NULL;
909 }
880} 910}
881 911
882static inline void ext4_show_quota_options(struct seq_file *seq, 912static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -1009,6 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1009 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1039 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1010 seq_puts(seq, ",nodelalloc"); 1040 seq_puts(seq, ",nodelalloc");
1011 1041
1042 if (test_opt(sb, MBLK_IO_SUBMIT))
1043 seq_puts(seq, ",mblk_io_submit");
1012 if (sbi->s_stripe) 1044 if (sbi->s_stripe)
1013 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1045 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1014 /* 1046 /*
@@ -1045,6 +1077,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) 1077 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity"); 1078 seq_puts(seq, ",block_validity");
1047 1079
1080 if (!test_opt(sb, INIT_INODE_TABLE))
1081 seq_puts(seq, ",noinit_inode_table");
1082 else if (sbi->s_li_wait_mult)
1083 seq_printf(seq, ",init_inode_table=%u",
1084 (unsigned) sbi->s_li_wait_mult);
1085
1048 ext4_show_quota_options(seq, sb); 1086 ext4_show_quota_options(seq, sb);
1049 1087
1050 return 0; 1088 return 0;
@@ -1123,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1123static int ext4_mark_dquot_dirty(struct dquot *dquot); 1161static int ext4_mark_dquot_dirty(struct dquot *dquot);
1124static int ext4_write_info(struct super_block *sb, int type); 1162static int ext4_write_info(struct super_block *sb, int type);
1125static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1163static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1126 char *path); 1164 struct path *path);
1127static int ext4_quota_off(struct super_block *sb, int type); 1165static int ext4_quota_off(struct super_block *sb, int type);
1128static int ext4_quota_on_mount(struct super_block *sb, int type); 1166static int ext4_quota_on_mount(struct super_block *sb, int type);
1129static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1167static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1160,6 +1198,7 @@ static const struct super_operations ext4_sops = {
1160 .destroy_inode = ext4_destroy_inode, 1198 .destroy_inode = ext4_destroy_inode,
1161 .write_inode = ext4_write_inode, 1199 .write_inode = ext4_write_inode,
1162 .dirty_inode = ext4_dirty_inode, 1200 .dirty_inode = ext4_dirty_inode,
1201 .drop_inode = ext4_drop_inode,
1163 .evict_inode = ext4_evict_inode, 1202 .evict_inode = ext4_evict_inode,
1164 .put_super = ext4_put_super, 1203 .put_super = ext4_put_super,
1165 .sync_fs = ext4_sync_fs, 1204 .sync_fs = ext4_sync_fs,
@@ -1180,6 +1219,7 @@ static const struct super_operations ext4_nojournal_sops = {
1180 .destroy_inode = ext4_destroy_inode, 1219 .destroy_inode = ext4_destroy_inode,
1181 .write_inode = ext4_write_inode, 1220 .write_inode = ext4_write_inode,
1182 .dirty_inode = ext4_dirty_inode, 1221 .dirty_inode = ext4_dirty_inode,
1222 .drop_inode = ext4_drop_inode,
1183 .evict_inode = ext4_evict_inode, 1223 .evict_inode = ext4_evict_inode,
1184 .write_super = ext4_write_super, 1224 .write_super = ext4_write_super,
1185 .put_super = ext4_put_super, 1225 .put_super = ext4_put_super,
@@ -1214,11 +1254,12 @@ enum {
1214 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1254 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1215 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1255 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1216 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1256 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1217 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1257 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1218 Opt_block_validity, Opt_noblock_validity, 1258 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1219 Opt_inode_readahead_blks, Opt_journal_ioprio, 1259 Opt_inode_readahead_blks, Opt_journal_ioprio,
1220 Opt_dioread_nolock, Opt_dioread_lock, 1260 Opt_dioread_nolock, Opt_dioread_lock,
1221 Opt_discard, Opt_nodiscard, 1261 Opt_discard, Opt_nodiscard,
1262 Opt_init_inode_table, Opt_noinit_inode_table,
1222}; 1263};
1223 1264
1224static const match_table_t tokens = { 1265static const match_table_t tokens = {
@@ -1278,6 +1319,8 @@ static const match_table_t tokens = {
1278 {Opt_resize, "resize"}, 1319 {Opt_resize, "resize"},
1279 {Opt_delalloc, "delalloc"}, 1320 {Opt_delalloc, "delalloc"},
1280 {Opt_nodelalloc, "nodelalloc"}, 1321 {Opt_nodelalloc, "nodelalloc"},
1322 {Opt_mblk_io_submit, "mblk_io_submit"},
1323 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1281 {Opt_block_validity, "block_validity"}, 1324 {Opt_block_validity, "block_validity"},
1282 {Opt_noblock_validity, "noblock_validity"}, 1325 {Opt_noblock_validity, "noblock_validity"},
1283 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1326 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1289,6 +1332,9 @@ static const match_table_t tokens = {
1289 {Opt_dioread_lock, "dioread_lock"}, 1332 {Opt_dioread_lock, "dioread_lock"},
1290 {Opt_discard, "discard"}, 1333 {Opt_discard, "discard"},
1291 {Opt_nodiscard, "nodiscard"}, 1334 {Opt_nodiscard, "nodiscard"},
1335 {Opt_init_inode_table, "init_itable=%u"},
1336 {Opt_init_inode_table, "init_itable"},
1337 {Opt_noinit_inode_table, "noinit_itable"},
1292 {Opt_err, NULL}, 1338 {Opt_err, NULL},
1293}; 1339};
1294 1340
@@ -1353,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1353 sbi->s_qf_names[qtype] = NULL; 1399 sbi->s_qf_names[qtype] = NULL;
1354 return 0; 1400 return 0;
1355 } 1401 }
1356 set_opt(sbi->s_mount_opt, QUOTA); 1402 set_opt(sb, QUOTA);
1357 return 1; 1403 return 1;
1358} 1404}
1359 1405
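
The wholesale set_opt()/clear_opt() rewrites in the hunks below stem from a signature change in ext4.h that is not part of this file's diff: the macros now take the superblock itself rather than the s_mount_opt word, clearing the way for the second options word (s_mount_opt2) that shows up in the mount-time debug printk later in this patch. Approximately, as an assumption about the header:

/* Assumed new ext4.h definitions -- the header is outside this diff. */
#define set_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
#define clear_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)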
@@ -1408,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb,
1408 switch (token) { 1454 switch (token) {
1409 case Opt_bsd_df: 1455 case Opt_bsd_df:
1410 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1456 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1411 clear_opt(sbi->s_mount_opt, MINIX_DF); 1457 clear_opt(sb, MINIX_DF);
1412 break; 1458 break;
1413 case Opt_minix_df: 1459 case Opt_minix_df:
1414 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1460 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1415 set_opt(sbi->s_mount_opt, MINIX_DF); 1461 set_opt(sb, MINIX_DF);
1416 1462
1417 break; 1463 break;
1418 case Opt_grpid: 1464 case Opt_grpid:
1419 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1465 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1420 set_opt(sbi->s_mount_opt, GRPID); 1466 set_opt(sb, GRPID);
1421 1467
1422 break; 1468 break;
1423 case Opt_nogrpid: 1469 case Opt_nogrpid:
1424 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1470 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1425 clear_opt(sbi->s_mount_opt, GRPID); 1471 clear_opt(sb, GRPID);
1426 1472
1427 break; 1473 break;
1428 case Opt_resuid: 1474 case Opt_resuid:
@@ -1440,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb,
1440 /* *sb_block = match_int(&args[0]); */ 1486 /* *sb_block = match_int(&args[0]); */
1441 break; 1487 break;
1442 case Opt_err_panic: 1488 case Opt_err_panic:
1443 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1489 clear_opt(sb, ERRORS_CONT);
1444 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1490 clear_opt(sb, ERRORS_RO);
1445 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1491 set_opt(sb, ERRORS_PANIC);
1446 break; 1492 break;
1447 case Opt_err_ro: 1493 case Opt_err_ro:
1448 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1494 clear_opt(sb, ERRORS_CONT);
1449 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1495 clear_opt(sb, ERRORS_PANIC);
1450 set_opt(sbi->s_mount_opt, ERRORS_RO); 1496 set_opt(sb, ERRORS_RO);
1451 break; 1497 break;
1452 case Opt_err_cont: 1498 case Opt_err_cont:
1453 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1499 clear_opt(sb, ERRORS_RO);
1454 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1500 clear_opt(sb, ERRORS_PANIC);
1455 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1501 set_opt(sb, ERRORS_CONT);
1456 break; 1502 break;
1457 case Opt_nouid32: 1503 case Opt_nouid32:
1458 set_opt(sbi->s_mount_opt, NO_UID32); 1504 set_opt(sb, NO_UID32);
1459 break; 1505 break;
1460 case Opt_debug: 1506 case Opt_debug:
1461 set_opt(sbi->s_mount_opt, DEBUG); 1507 set_opt(sb, DEBUG);
1462 break; 1508 break;
1463 case Opt_oldalloc: 1509 case Opt_oldalloc:
1464 set_opt(sbi->s_mount_opt, OLDALLOC); 1510 set_opt(sb, OLDALLOC);
1465 break; 1511 break;
1466 case Opt_orlov: 1512 case Opt_orlov:
1467 clear_opt(sbi->s_mount_opt, OLDALLOC); 1513 clear_opt(sb, OLDALLOC);
1468 break; 1514 break;
1469#ifdef CONFIG_EXT4_FS_XATTR 1515#ifdef CONFIG_EXT4_FS_XATTR
1470 case Opt_user_xattr: 1516 case Opt_user_xattr:
1471 set_opt(sbi->s_mount_opt, XATTR_USER); 1517 set_opt(sb, XATTR_USER);
1472 break; 1518 break;
1473 case Opt_nouser_xattr: 1519 case Opt_nouser_xattr:
1474 clear_opt(sbi->s_mount_opt, XATTR_USER); 1520 clear_opt(sb, XATTR_USER);
1475 break; 1521 break;
1476#else 1522#else
1477 case Opt_user_xattr: 1523 case Opt_user_xattr:
@@ -1481,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb,
1481#endif 1527#endif
1482#ifdef CONFIG_EXT4_FS_POSIX_ACL 1528#ifdef CONFIG_EXT4_FS_POSIX_ACL
1483 case Opt_acl: 1529 case Opt_acl:
1484 set_opt(sbi->s_mount_opt, POSIX_ACL); 1530 set_opt(sb, POSIX_ACL);
1485 break; 1531 break;
1486 case Opt_noacl: 1532 case Opt_noacl:
1487 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1533 clear_opt(sb, POSIX_ACL);
1488 break; 1534 break;
1489#else 1535#else
1490 case Opt_acl: 1536 case Opt_acl:
@@ -1503,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb,
1503 "Cannot specify journal on remount"); 1549 "Cannot specify journal on remount");
1504 return 0; 1550 return 0;
1505 } 1551 }
1506 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1552 set_opt(sb, UPDATE_JOURNAL);
1507 break; 1553 break;
1508 case Opt_journal_dev: 1554 case Opt_journal_dev:
1509 if (is_remount) { 1555 if (is_remount) {
@@ -1516,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb,
1516 *journal_devnum = option; 1562 *journal_devnum = option;
1517 break; 1563 break;
1518 case Opt_journal_checksum: 1564 case Opt_journal_checksum:
1519 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1565 set_opt(sb, JOURNAL_CHECKSUM);
1520 break; 1566 break;
1521 case Opt_journal_async_commit: 1567 case Opt_journal_async_commit:
1522 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1568 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1523 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1569 set_opt(sb, JOURNAL_CHECKSUM);
1524 break; 1570 break;
1525 case Opt_noload: 1571 case Opt_noload:
1526 set_opt(sbi->s_mount_opt, NOLOAD); 1572 set_opt(sb, NOLOAD);
1527 break; 1573 break;
1528 case Opt_commit: 1574 case Opt_commit:
1529 if (match_int(&args[0], &option)) 1575 if (match_int(&args[0], &option))
@@ -1566,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb,
1566 return 0; 1612 return 0;
1567 } 1613 }
1568 } else { 1614 } else {
1569 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1615 clear_opt(sb, DATA_FLAGS);
1570 sbi->s_mount_opt |= data_opt; 1616 sbi->s_mount_opt |= data_opt;
1571 } 1617 }
1572 break; 1618 break;
1573 case Opt_data_err_abort: 1619 case Opt_data_err_abort:
1574 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1620 set_opt(sb, DATA_ERR_ABORT);
1575 break; 1621 break;
1576 case Opt_data_err_ignore: 1622 case Opt_data_err_ignore:
1577 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1623 clear_opt(sb, DATA_ERR_ABORT);
1578 break; 1624 break;
1579#ifdef CONFIG_QUOTA 1625#ifdef CONFIG_QUOTA
1580 case Opt_usrjquota: 1626 case Opt_usrjquota:
@@ -1614,12 +1660,12 @@ set_qf_format:
1614 break; 1660 break;
1615 case Opt_quota: 1661 case Opt_quota:
1616 case Opt_usrquota: 1662 case Opt_usrquota:
1617 set_opt(sbi->s_mount_opt, QUOTA); 1663 set_opt(sb, QUOTA);
1618 set_opt(sbi->s_mount_opt, USRQUOTA); 1664 set_opt(sb, USRQUOTA);
1619 break; 1665 break;
1620 case Opt_grpquota: 1666 case Opt_grpquota:
1621 set_opt(sbi->s_mount_opt, QUOTA); 1667 set_opt(sb, QUOTA);
1622 set_opt(sbi->s_mount_opt, GRPQUOTA); 1668 set_opt(sb, GRPQUOTA);
1623 break; 1669 break;
1624 case Opt_noquota: 1670 case Opt_noquota:
1625 if (sb_any_quota_loaded(sb)) { 1671 if (sb_any_quota_loaded(sb)) {
@@ -1627,9 +1673,9 @@ set_qf_format:
1627 "options when quota turned on"); 1673 "options when quota turned on");
1628 return 0; 1674 return 0;
1629 } 1675 }
1630 clear_opt(sbi->s_mount_opt, QUOTA); 1676 clear_opt(sb, QUOTA);
1631 clear_opt(sbi->s_mount_opt, USRQUOTA); 1677 clear_opt(sb, USRQUOTA);
1632 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1678 clear_opt(sb, GRPQUOTA);
1633 break; 1679 break;
1634#else 1680#else
1635 case Opt_quota: 1681 case Opt_quota:
@@ -1655,7 +1701,7 @@ set_qf_format:
1655 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1701 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1656 break; 1702 break;
1657 case Opt_nobarrier: 1703 case Opt_nobarrier:
1658 clear_opt(sbi->s_mount_opt, BARRIER); 1704 clear_opt(sb, BARRIER);
1659 break; 1705 break;
1660 case Opt_barrier: 1706 case Opt_barrier:
1661 if (args[0].from) { 1707 if (args[0].from) {
@@ -1664,9 +1710,9 @@ set_qf_format:
1664 } else 1710 } else
1665 option = 1; /* No argument, default to 1 */ 1711 option = 1; /* No argument, default to 1 */
1666 if (option) 1712 if (option)
1667 set_opt(sbi->s_mount_opt, BARRIER); 1713 set_opt(sb, BARRIER);
1668 else 1714 else
1669 clear_opt(sbi->s_mount_opt, BARRIER); 1715 clear_opt(sb, BARRIER);
1670 break; 1716 break;
1671 case Opt_ignore: 1717 case Opt_ignore:
1672 break; 1718 break;
@@ -1690,11 +1736,17 @@ set_qf_format:
1690 "Ignoring deprecated bh option"); 1736 "Ignoring deprecated bh option");
1691 break; 1737 break;
1692 case Opt_i_version: 1738 case Opt_i_version:
1693 set_opt(sbi->s_mount_opt, I_VERSION); 1739 set_opt(sb, I_VERSION);
1694 sb->s_flags |= MS_I_VERSION; 1740 sb->s_flags |= MS_I_VERSION;
1695 break; 1741 break;
1696 case Opt_nodelalloc: 1742 case Opt_nodelalloc:
1697 clear_opt(sbi->s_mount_opt, DELALLOC); 1743 clear_opt(sb, DELALLOC);
1744 break;
1745 case Opt_mblk_io_submit:
1746 set_opt(sb, MBLK_IO_SUBMIT);
1747 break;
1748 case Opt_nomblk_io_submit:
1749 clear_opt(sb, MBLK_IO_SUBMIT);
1698 break; 1750 break;
1699 case Opt_stripe: 1751 case Opt_stripe:
1700 if (match_int(&args[0], &option)) 1752 if (match_int(&args[0], &option))
@@ -1704,13 +1756,13 @@ set_qf_format:
1704 sbi->s_stripe = option; 1756 sbi->s_stripe = option;
1705 break; 1757 break;
1706 case Opt_delalloc: 1758 case Opt_delalloc:
1707 set_opt(sbi->s_mount_opt, DELALLOC); 1759 set_opt(sb, DELALLOC);
1708 break; 1760 break;
1709 case Opt_block_validity: 1761 case Opt_block_validity:
1710 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1762 set_opt(sb, BLOCK_VALIDITY);
1711 break; 1763 break;
1712 case Opt_noblock_validity: 1764 case Opt_noblock_validity:
1713 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1765 clear_opt(sb, BLOCK_VALIDITY);
1714 break; 1766 break;
1715 case Opt_inode_readahead_blks: 1767 case Opt_inode_readahead_blks:
1716 if (match_int(&args[0], &option)) 1768 if (match_int(&args[0], &option))
@@ -1734,7 +1786,7 @@ set_qf_format:
1734 option); 1786 option);
1735 break; 1787 break;
1736 case Opt_noauto_da_alloc: 1788 case Opt_noauto_da_alloc:
1737 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1789 set_opt(sb, NO_AUTO_DA_ALLOC);
1738 break; 1790 break;
1739 case Opt_auto_da_alloc: 1791 case Opt_auto_da_alloc:
1740 if (args[0].from) { 1792 if (args[0].from) {
@@ -1743,21 +1795,35 @@ set_qf_format:
1743 } else 1795 } else
1744 option = 1; /* No argument, default to 1 */ 1796 option = 1; /* No argument, default to 1 */
1745 if (option) 1797 if (option)
1746 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1798 clear_opt(sb, NO_AUTO_DA_ALLOC);
1747 else 1799 else
1748 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1800 set_opt(sb, NO_AUTO_DA_ALLOC);
1749 break; 1801 break;
1750 case Opt_discard: 1802 case Opt_discard:
1751 set_opt(sbi->s_mount_opt, DISCARD); 1803 set_opt(sb, DISCARD);
1752 break; 1804 break;
1753 case Opt_nodiscard: 1805 case Opt_nodiscard:
1754 clear_opt(sbi->s_mount_opt, DISCARD); 1806 clear_opt(sb, DISCARD);
1755 break; 1807 break;
1756 case Opt_dioread_nolock: 1808 case Opt_dioread_nolock:
1757 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1809 set_opt(sb, DIOREAD_NOLOCK);
1758 break; 1810 break;
1759 case Opt_dioread_lock: 1811 case Opt_dioread_lock:
1760 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1812 clear_opt(sb, DIOREAD_NOLOCK);
1813 break;
1814 case Opt_init_inode_table:
1815 set_opt(sb, INIT_INODE_TABLE);
1816 if (args[0].from) {
1817 if (match_int(&args[0], &option))
1818 return 0;
1819 } else
1820 option = EXT4_DEF_LI_WAIT_MULT;
1821 if (option < 0)
1822 return 0;
1823 sbi->s_li_wait_mult = option;
1824 break;
1825 case Opt_noinit_inode_table:
1826 clear_opt(sb, INIT_INODE_TABLE);
1761 break; 1827 break;
1762 default: 1828 default:
1763 ext4_msg(sb, KERN_ERR, 1829 ext4_msg(sb, KERN_ERR,
@@ -1769,10 +1835,10 @@ set_qf_format:
1769#ifdef CONFIG_QUOTA 1835#ifdef CONFIG_QUOTA
1770 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1836 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1771 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1837 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1772 clear_opt(sbi->s_mount_opt, USRQUOTA); 1838 clear_opt(sb, USRQUOTA);
1773 1839
1774 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1840 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1775 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1841 clear_opt(sb, GRPQUOTA);
1776 1842
1777 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1843 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1778 ext4_msg(sb, KERN_ERR, "old and new quota " 1844 ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1842,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1842 ext4_commit_super(sb, 1); 1908 ext4_commit_super(sb, 1);
1843 if (test_opt(sb, DEBUG)) 1909 if (test_opt(sb, DEBUG))
1844 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1910 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1845 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1911 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1846 sb->s_blocksize, 1912 sb->s_blocksize,
1847 sbi->s_groups_count, 1913 sbi->s_groups_count,
1848 EXT4_BLOCKS_PER_GROUP(sb), 1914 EXT4_BLOCKS_PER_GROUP(sb),
1849 EXT4_INODES_PER_GROUP(sb), 1915 EXT4_INODES_PER_GROUP(sb),
1850 sbi->s_mount_opt); 1916 sbi->s_mount_opt, sbi->s_mount_opt2);
1851 1917
1852 return res; 1918 return res;
1853} 1919}
@@ -1877,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1877 size = flex_group_count * sizeof(struct flex_groups); 1943 size = flex_group_count * sizeof(struct flex_groups);
1878 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1944 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1879 if (sbi->s_flex_groups == NULL) { 1945 if (sbi->s_flex_groups == NULL) {
1880 sbi->s_flex_groups = vmalloc(size); 1946 sbi->s_flex_groups = vzalloc(size);
1881 if (sbi->s_flex_groups) 1947 if (sbi->s_flex_groups == NULL) {
1882 memset(sbi->s_flex_groups, 0, size); 1948 ext4_msg(sb, KERN_ERR,
1883 } 1949 "not enough memory for %u flex groups",
1884 if (sbi->s_flex_groups == NULL) { 1950 flex_group_count);
1885 ext4_msg(sb, KERN_ERR, "not enough memory for " 1951 goto failed;
1886 "%u flex groups", flex_group_count); 1952 }
1887 goto failed;
1888 } 1953 }
1889 1954
1890 for (i = 0; i < sbi->s_groups_count; i++) { 1955 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -1942,7 +2007,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1942} 2007}
1943 2008
1944/* Called at mount-time, super-block is locked */ 2009/* Called at mount-time, super-block is locked */
1945static int ext4_check_descriptors(struct super_block *sb) 2010static int ext4_check_descriptors(struct super_block *sb,
2011 ext4_group_t *first_not_zeroed)
1946{ 2012{
1947 struct ext4_sb_info *sbi = EXT4_SB(sb); 2013 struct ext4_sb_info *sbi = EXT4_SB(sb);
1948 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); 2014 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1951,7 +2017,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1951 ext4_fsblk_t inode_bitmap; 2017 ext4_fsblk_t inode_bitmap;
1952 ext4_fsblk_t inode_table; 2018 ext4_fsblk_t inode_table;
1953 int flexbg_flag = 0; 2019 int flexbg_flag = 0;
1954 ext4_group_t i; 2020 ext4_group_t i, grp = sbi->s_groups_count;
1955 2021
1956 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2022 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1957 flexbg_flag = 1; 2023 flexbg_flag = 1;
@@ -1967,6 +2033,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1967 last_block = first_block + 2033 last_block = first_block +
1968 (EXT4_BLOCKS_PER_GROUP(sb) - 1); 2034 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1969 2035
2036 if ((grp == sbi->s_groups_count) &&
2037 !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2038 grp = i;
2039
1970 block_bitmap = ext4_block_bitmap(sb, gdp); 2040 block_bitmap = ext4_block_bitmap(sb, gdp);
1971 if (block_bitmap < first_block || block_bitmap > last_block) { 2041 if (block_bitmap < first_block || block_bitmap > last_block) {
1972 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2042 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2004,6 +2074,8 @@ static int ext4_check_descriptors(struct super_block *sb)
2004 if (!flexbg_flag) 2074 if (!flexbg_flag)
2005 first_block += EXT4_BLOCKS_PER_GROUP(sb); 2075 first_block += EXT4_BLOCKS_PER_GROUP(sb);
2006 } 2076 }
2077 if (NULL != first_not_zeroed)
2078 *first_not_zeroed = grp;
2007 2079
2008 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2080 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2009 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2081 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2376,6 +2448,7 @@ static struct ext4_attr ext4_attr_##_name = { \
2376#define EXT4_ATTR(name, mode, show, store) \ 2448#define EXT4_ATTR(name, mode, show, store) \
2377static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2449static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2378 2450
2451#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2379#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2452#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2380#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2453#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2381#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2454#define EXT4_RW_ATTR_SBI_UI(name, elname) \
@@ -2412,6 +2485,16 @@ static struct attribute *ext4_attrs[] = {
2412 NULL, 2485 NULL,
2413}; 2486};
2414 2487
2488/* Features this copy of ext4 supports */
2489EXT4_INFO_ATTR(lazy_itable_init);
2490EXT4_INFO_ATTR(batched_discard);
2491
2492static struct attribute *ext4_feat_attrs[] = {
2493 ATTR_LIST(lazy_itable_init),
2494 ATTR_LIST(batched_discard),
2495 NULL,
2496};
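
(With the "ext4" kset under fs_kobj and the "features" kobject registered later in this
patch, these adverts appear as empty files under /sys/fs/ext4/features. A minimal
userspace probe, as a sketch only; the helper below is illustrative and not part of
this patch:)

#include <stdio.h>
#include <unistd.h>

/* Return 1 if the running kernel's ext4 advertises the named feature as
 * a file under /sys/fs/ext4/features (layout assumed from the kset and
 * kobject registration in this patch). */
static int ext4_has_feature(const char *name)
{
	char path[128];

	snprintf(path, sizeof(path), "/sys/fs/ext4/features/%s", name);
	return access(path, F_OK) == 0;
}

int main(void)
{
	printf("lazy_itable_init: %s\n",
	       ext4_has_feature("lazy_itable_init") ? "yes" : "no");
	printf("batched_discard:  %s\n",
	       ext4_has_feature("batched_discard") ? "yes" : "no");
	return 0;
}
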
2497
2415static ssize_t ext4_attr_show(struct kobject *kobj, 2498static ssize_t ext4_attr_show(struct kobject *kobj,
2416 struct attribute *attr, char *buf) 2499 struct attribute *attr, char *buf)
2417{ 2500{
@@ -2440,7 +2523,6 @@ static void ext4_sb_release(struct kobject *kobj)
2440 complete(&sbi->s_kobj_unregister); 2523 complete(&sbi->s_kobj_unregister);
2441} 2524}
2442 2525
2443
2444static const struct sysfs_ops ext4_attr_ops = { 2526static const struct sysfs_ops ext4_attr_ops = {
2445 .show = ext4_attr_show, 2527 .show = ext4_attr_show,
2446 .store = ext4_attr_store, 2528 .store = ext4_attr_store,
@@ -2452,6 +2534,17 @@ static struct kobj_type ext4_ktype = {
2452 .release = ext4_sb_release, 2534 .release = ext4_sb_release,
2453}; 2535};
2454 2536
2537static void ext4_feat_release(struct kobject *kobj)
2538{
2539 complete(&ext4_feat->f_kobj_unregister);
2540}
2541
2542static struct kobj_type ext4_feat_ktype = {
2543 .default_attrs = ext4_feat_attrs,
2544 .sysfs_ops = &ext4_attr_ops,
2545 .release = ext4_feat_release,
2546};
2547
2455/* 2548/*
2456 * Check whether this filesystem can be mounted based on 2549 * Check whether this filesystem can be mounted based on
2457 * the features present and the RDONLY/RDWR mount requested. 2550 * the features present and the RDONLY/RDWR mount requested.
@@ -2542,6 +2635,368 @@ static void print_daily_error_info(unsigned long arg)
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2635 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543} 2636}
2544 2637
2638static void ext4_lazyinode_timeout(unsigned long data)
2639{
2640 struct task_struct *p = (struct task_struct *)data;
2641 wake_up_process(p);
2642}
2643
2644/* Find next suitable group and run ext4_init_inode_table */
2645static int ext4_run_li_request(struct ext4_li_request *elr)
2646{
2647 struct ext4_group_desc *gdp = NULL;
2648 ext4_group_t group, ngroups;
2649 struct super_block *sb;
2650 unsigned long timeout = 0;
2651 int ret = 0;
2652
2653 sb = elr->lr_super;
2654 ngroups = EXT4_SB(sb)->s_groups_count;
2655
2656 for (group = elr->lr_next_group; group < ngroups; group++) {
2657 gdp = ext4_get_group_desc(sb, group, NULL);
2658 if (!gdp) {
2659 ret = 1;
2660 break;
2661 }
2662
2663 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2664 break;
2665 }
2666
2667 if (group == ngroups)
2668 ret = 1;
2669
2670 if (!ret) {
2671 timeout = jiffies;
2672 ret = ext4_init_inode_table(sb, group,
2673 elr->lr_timeout ? 0 : 1);
2674 if (elr->lr_timeout == 0) {
2675 timeout = jiffies - timeout;
2676 if (elr->lr_sbi->s_li_wait_mult)
2677 timeout *= elr->lr_sbi->s_li_wait_mult;
2678 else
2679 timeout *= 20;
2680 elr->lr_timeout = timeout;
2681 }
2682 elr->lr_next_sched = jiffies + elr->lr_timeout;
2683 elr->lr_next_group = group + 1;
2684 }
2685
2686 return ret;
2687}
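
(The pacing here is adaptive: the first ext4_init_inode_table() pass for a request is
timed, and that elapsed cost, multiplied by s_li_wait_mult or by 20 when the multiplier
is unset, becomes the fixed interval for all later passes of the same request. A
standalone sketch of the same policy, with illustrative names that are not part of
this patch:)

/* Illustrative pacing helper (not kernel code): given how long one
 * zeroing pass took, return how long to wait before the next one, so
 * lazy init consumes roughly 1/mult of the device's time. */
static unsigned long li_next_timeout(unsigned long elapsed,
				     unsigned int wait_mult)
{
	return elapsed * (wait_mult ? wait_mult : 20);
}
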
2688
2689/*
2690 * Remove lr_request from the request list and free the
2691 * request structure. Should be called with li_list_mtx held
2692 */
2693static void ext4_remove_li_request(struct ext4_li_request *elr)
2694{
2695 struct ext4_sb_info *sbi;
2696
2697 if (!elr)
2698 return;
2699
2700 sbi = elr->lr_sbi;
2701
2702 list_del(&elr->lr_request);
2703 sbi->s_li_request = NULL;
2704 kfree(elr);
2705}
2706
2707static void ext4_unregister_li_request(struct super_block *sb)
2708{
2709 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
2710
2711 if (!ext4_li_info)
2712 return;
2713
2714 mutex_lock(&ext4_li_info->li_list_mtx);
2715 ext4_remove_li_request(elr);
2716 mutex_unlock(&ext4_li_info->li_list_mtx);
2717}
2718
2719/*
2720 * This is the function where the ext4lazyinit thread lives. It walks
2721 * through the request list searching for the next scheduled filesystem.
2722 * When such a fs is found, run the lazy initialization request
2723 * (ext4_run_li_request) and keep track of the time spent in this
2724 * function. Based on that time we compute the next schedule time of
2725 * the request. When the walk through the list is complete, compute the
2726 * next wakeup time and put the thread to sleep.
2727 */
2728static int ext4_lazyinit_thread(void *arg)
2729{
2730 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2731 struct list_head *pos, *n;
2732 struct ext4_li_request *elr;
2733 unsigned long next_wakeup;
2734 DEFINE_WAIT(wait);
2735
2736 BUG_ON(NULL == eli);
2737
2738 eli->li_timer.data = (unsigned long)current;
2739 eli->li_timer.function = ext4_lazyinode_timeout;
2740
2741 eli->li_task = current;
2742 wake_up(&eli->li_wait_task);
2743
2744cont_thread:
2745 while (true) {
2746 next_wakeup = MAX_JIFFY_OFFSET;
2747
2748 mutex_lock(&eli->li_list_mtx);
2749 if (list_empty(&eli->li_request_list)) {
2750 mutex_unlock(&eli->li_list_mtx);
2751 goto exit_thread;
2752 }
2753
2754 list_for_each_safe(pos, n, &eli->li_request_list) {
2755 elr = list_entry(pos, struct ext4_li_request,
2756 lr_request);
2757
2758 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2759 if (ext4_run_li_request(elr) != 0) {
2760 /* error, remove the lazy_init job */
2761 ext4_remove_li_request(elr);
2762 continue;
2763 }
2764 }
2765
2766 if (time_before(elr->lr_next_sched, next_wakeup))
2767 next_wakeup = elr->lr_next_sched;
2768 }
2769 mutex_unlock(&eli->li_list_mtx);
2770
2771 if (freezing(current))
2772 refrigerator();
2773
2774 if ((time_after_eq(jiffies, next_wakeup)) ||
2775 (MAX_JIFFY_OFFSET == next_wakeup)) {
2776 cond_resched();
2777 continue;
2778 }
2779
2780 eli->li_timer.expires = next_wakeup;
2781 add_timer(&eli->li_timer);
2782 prepare_to_wait(&eli->li_wait_daemon, &wait,
2783 TASK_INTERRUPTIBLE);
2784 if (time_before(jiffies, next_wakeup))
2785 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait);
2787 }
2788
2789exit_thread:
2790 /*
2791 * It looks like the request list is empty, but we need
2792 * to check it under the li_list_mtx lock, to prevent any
2793 * additions into it, and of course we should lock ext4_li_mtx
2794 * to atomically free the list and ext4_li_info, because at
2795 * this point another ext4 filesystem could be registering a
2796 * new one.
2797 */
2798 mutex_lock(&ext4_li_mtx);
2799 mutex_lock(&eli->li_list_mtx);
2800 if (!list_empty(&eli->li_request_list)) {
2801 mutex_unlock(&eli->li_list_mtx);
2802 mutex_unlock(&ext4_li_mtx);
2803 goto cont_thread;
2804 }
2805 mutex_unlock(&eli->li_list_mtx);
2806 del_timer_sync(&ext4_li_info->li_timer);
2807 eli->li_task = NULL;
2808 wake_up(&eli->li_wait_task);
2809
2810 kfree(ext4_li_info);
2811 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx);
2813
2814 return 0;
2815}
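
(Stripped of the timer, freezer, and wait-queue plumbing, the scheduling core of the
loop above reduces to running whatever is due, dropping requests that fail or finish,
and sleeping until the earliest lr_next_sched left. A simplified sketch of the wakeup
computation, with illustrative names that are not part of this patch:)

/* Illustrative reduction of the wakeup computation: report the earliest
 * next_sched among live requests; (unsigned long)-1 (the stand-in for
 * MAX_JIFFY_OFFSET here) means the list is effectively empty and the
 * caller can exit instead of sleeping. */
static unsigned long pick_next_wakeup(const unsigned long *next_sched,
				      const int *alive, int n)
{
	unsigned long next = (unsigned long)-1;
	int i;

	for (i = 0; i < n; i++) {
		if (!alive[i])
			continue;
		if (next_sched[i] < next)
			next = next_sched[i];
	}
	return next;
}
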
2816
2817static void ext4_clear_request_list(void)
2818{
2819 struct list_head *pos, *n;
2820 struct ext4_li_request *elr;
2821
2822 mutex_lock(&ext4_li_info->li_list_mtx);
2823 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2824 elr = list_entry(pos, struct ext4_li_request,
2825 lr_request);
2826 ext4_remove_li_request(elr);
2827 }
2828 mutex_unlock(&ext4_li_info->li_list_mtx);
2829}
2830
2831static int ext4_run_lazyinit_thread(void)
2832{
2833 struct task_struct *t;
2834
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
2836 if (IS_ERR(t)) {
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info);
2841 ext4_li_info = NULL;
2842 printk(KERN_CRIT "EXT4: error %d creating inode table "
2843 "initialization thread\n",
2844 err);
2845 return err;
2846 }
2847 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2848
2849 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2850 return 0;
2851}
2852
2853/*
2854 * Check whether it makes sense to run the itable init thread or not.
2855 * If there is at least one uninitialized inode table, return the
2856 * corresponding group number; otherwise the loop runs through all
2857 * groups and returns the total number of groups.
2858 */
2859static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2860{
2861 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2862 struct ext4_group_desc *gdp = NULL;
2863
2864 for (group = 0; group < ngroups; group++) {
2865 gdp = ext4_get_group_desc(sb, group, NULL);
2866 if (!gdp)
2867 continue;
2868
2869 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2870 break;
2871 }
2872
2873 return group;
2874}
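
(The return convention matters to callers: a result equal to s_groups_count means every
inode table is already zeroed. The remount path later in this patch feeds the result
straight into ext4_register_li_request(), which bails out in that case; the condensed
sketch below, assuming kernel context, just makes that implicit bail-out visible:)

/* Condensed caller pattern (mirrors the remount hunk later in this
 * patch): only a group index below s_groups_count indicates work left
 * for the lazy init thread. */
static void maybe_register_li(struct super_block *sb)
{
	ext4_group_t first = ext4_has_uninit_itable(sb);

	if (first != EXT4_SB(sb)->s_groups_count)
		ext4_register_li_request(sb, first);
}
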
2875
2876static int ext4_li_info_new(void)
2877{
2878 struct ext4_lazy_init *eli = NULL;
2879
2880 eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2881 if (!eli)
2882 return -ENOMEM;
2883
2884 eli->li_task = NULL;
2885 INIT_LIST_HEAD(&eli->li_request_list);
2886 mutex_init(&eli->li_list_mtx);
2887
2888 init_waitqueue_head(&eli->li_wait_daemon);
2889 init_waitqueue_head(&eli->li_wait_task);
2890 init_timer(&eli->li_timer);
2891 eli->li_state |= EXT4_LAZYINIT_QUIT;
2892
2893 ext4_li_info = eli;
2894
2895 return 0;
2896}
2897
2898static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2899 ext4_group_t start)
2900{
2901 struct ext4_sb_info *sbi = EXT4_SB(sb);
2902 struct ext4_li_request *elr;
2903 unsigned long rnd;
2904
2905 elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2906 if (!elr)
2907 return NULL;
2908
2909 elr->lr_super = sb;
2910 elr->lr_sbi = sbi;
2911 elr->lr_next_group = start;
2912
2913 /*
2914 * Randomize first schedule time of the request to
2915 * spread the inode table initialization requests
2916 * better.
2917 */
2918 get_random_bytes(&rnd, sizeof(rnd));
2919 elr->lr_next_sched = jiffies + (unsigned long)rnd %
2920 (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2921
2922 return elr;
2923}
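
(Since many filesystems are typically mounted back to back at boot, each request gets a
pseudo-uniform start delay in [0, EXT4_DEF_LI_MAX_START_DELAY) seconds so their zeroing
passes do not begin in lockstep. The computation in isolation, with illustrative names:)

/* Illustrative form of the jitter above: rnd is raw entropy from
 * get_random_bytes(), hz converts seconds to jiffies. */
static unsigned long li_start_jitter(unsigned long rnd,
				     unsigned int max_delay_s,
				     unsigned int hz)
{
	return rnd % (max_delay_s * hz);
}
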
2924
2925static int ext4_register_li_request(struct super_block *sb,
2926 ext4_group_t first_not_zeroed)
2927{
2928 struct ext4_sb_info *sbi = EXT4_SB(sb);
2929 struct ext4_li_request *elr;
2930 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2931 int ret = 0;
2932
2933 if (sbi->s_li_request != NULL)
2934 return 0;
2935
2936 if (first_not_zeroed == ngroups ||
2937 (sb->s_flags & MS_RDONLY) ||
2938 !test_opt(sb, INIT_INODE_TABLE)) {
2939 sbi->s_li_request = NULL;
2940 return 0;
2941 }
2947
2948 elr = ext4_li_request_new(sb, first_not_zeroed);
2949 if (!elr)
2950 return -ENOMEM;
2951
2952 mutex_lock(&ext4_li_mtx);
2953
2954 if (NULL == ext4_li_info) {
2955 ret = ext4_li_info_new();
2956 if (ret)
2957 goto out;
2958 }
2959
2960 mutex_lock(&ext4_li_info->li_list_mtx);
2961 list_add(&elr->lr_request, &ext4_li_info->li_request_list);
2962 mutex_unlock(&ext4_li_info->li_list_mtx);
2963
2964 sbi->s_li_request = elr;
2965
2966 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2967 ret = ext4_run_lazyinit_thread();
2968 if (ret)
2969 goto out;
2970 }
2971out:
2972 mutex_unlock(&ext4_li_mtx);
2973 if (ret)
2974 kfree(elr);
2975 return ret;
2976}
2977
2978/*
2979 * We do not need to lock anything since this is called on
2980 * module unload.
2981 */
2982static void ext4_destroy_lazyinit_thread(void)
2983{
2984 /*
2985 * If the thread exited earlier,
2986 * there's nothing to be done.
2987 */
2988 if (!ext4_li_info)
2989 return;
2990
2991 ext4_clear_request_list();
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998}
2999
2545static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3000static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2546 __releases(kernel_lock) 3001 __releases(kernel_lock)
2547 __acquires(kernel_lock) 3002 __acquires(kernel_lock)
@@ -2567,6 +3022,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2567 __u64 blocks_count; 3022 __u64 blocks_count;
2568 int err; 3023 int err;
2569 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3024 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3025 ext4_group_t first_not_zeroed;
2570 3026
2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 3027 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2572 if (!sbi) 3028 if (!sbi)
@@ -2588,8 +3044,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2588 sbi->s_sectors_written_start = 3044 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3045 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2590 3046
2591 unlock_kernel();
2592
2593 /* Cleanup superblock name */ 3047 /* Cleanup superblock name */
2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3048 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2595 *cp = '!'; 3049 *cp = '!';
@@ -2629,40 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2629 3083
2630 /* Set defaults before we parse the mount options */ 3084 /* Set defaults before we parse the mount options */
2631 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3085 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3086 set_opt(sb, INIT_INODE_TABLE);
2632 if (def_mount_opts & EXT4_DEFM_DEBUG) 3087 if (def_mount_opts & EXT4_DEFM_DEBUG)
2633 set_opt(sbi->s_mount_opt, DEBUG); 3088 set_opt(sb, DEBUG);
2634 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3089 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2635 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3090 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2636 "2.6.38"); 3091 "2.6.38");
2637 set_opt(sbi->s_mount_opt, GRPID); 3092 set_opt(sb, GRPID);
2638 } 3093 }
2639 if (def_mount_opts & EXT4_DEFM_UID16) 3094 if (def_mount_opts & EXT4_DEFM_UID16)
2640 set_opt(sbi->s_mount_opt, NO_UID32); 3095 set_opt(sb, NO_UID32);
2641#ifdef CONFIG_EXT4_FS_XATTR 3096#ifdef CONFIG_EXT4_FS_XATTR
2642 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
2643 set_opt(sbi->s_mount_opt, XATTR_USER); 3098 set_opt(sb, XATTR_USER);
2644#endif 3099#endif
2645#ifdef CONFIG_EXT4_FS_POSIX_ACL 3100#ifdef CONFIG_EXT4_FS_POSIX_ACL
2646 if (def_mount_opts & EXT4_DEFM_ACL) 3101 if (def_mount_opts & EXT4_DEFM_ACL)
2647 set_opt(sbi->s_mount_opt, POSIX_ACL); 3102 set_opt(sb, POSIX_ACL);
2648#endif 3103#endif
2649 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2650 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3105 set_opt(sb, JOURNAL_DATA);
2651 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2652 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3107 set_opt(sb, ORDERED_DATA);
2653 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2654 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3109 set_opt(sb, WRITEBACK_DATA);
2655 3110
2656 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3111 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2657 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3112 set_opt(sb, ERRORS_PANIC);
2658 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3113 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
2659 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3114 set_opt(sb, ERRORS_CONT);
2660 else 3115 else
2661 set_opt(sbi->s_mount_opt, ERRORS_RO); 3116 set_opt(sb, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3117 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3118 set_opt(sb, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD) 3119 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD); 3120 set_opt(sb, DISCARD);
2666 3121
2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3122 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3123 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2671,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3126 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2672 3127
2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3128 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER); 3129 set_opt(sb, BARRIER);
2675 3130
2676 /* 3131 /*
2677 * enable delayed allocation by default 3132 * enable delayed allocation by default
@@ -2679,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 */ 3134 */
2680 if (!IS_EXT3_SB(sb) && 3135 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3136 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2682 set_opt(sbi->s_mount_opt, DELALLOC); 3137 set_opt(sb, DELALLOC);
2683 3138
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3139 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) { 3140 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -2831,15 +3286,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2831 * Test whether we have more sectors than will fit in sector_t, 3286 * Test whether we have more sectors than will fit in sector_t,
2832 * and whether the max offset is addressable by the page cache. 3287 * and whether the max offset is addressable by the page cache.
2833 */ 3288 */
2834 if ((ext4_blocks_count(es) > 3289 err = generic_check_addressable(sb->s_blocksize_bits,
2835 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || 3290 ext4_blocks_count(es));
2836 (ext4_blocks_count(es) > 3291 if (err) {
2837 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2838 ext4_msg(sb, KERN_ERR, "filesystem" 3292 ext4_msg(sb, KERN_ERR, "filesystem"
2839 " too large to mount safely on this system"); 3293 " too large to mount safely on this system");
2840 if (sizeof(sector_t) < 8) 3294 if (sizeof(sector_t) < 8)
2841 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3295 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2842 ret = -EFBIG; 3296 ret = err;
2843 goto failed_mount; 3297 goto failed_mount;
2844 } 3298 }
2845 3299
@@ -2908,7 +3362,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2908 goto failed_mount2; 3362 goto failed_mount2;
2909 } 3363 }
2910 } 3364 }
2911 if (!ext4_check_descriptors(sb)) { 3365 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
2912 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3366 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2913 goto failed_mount2; 3367 goto failed_mount2;
2914 } 3368 }
@@ -2924,6 +3378,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2924 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3378 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2925 spin_lock_init(&sbi->s_next_gen_lock); 3379 spin_lock_init(&sbi->s_next_gen_lock);
2926 3380
3381 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3382 ext4_count_free_blocks(sb));
3383 if (!err) {
3384 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3385 ext4_count_free_inodes(sb));
3386 }
3387 if (!err) {
3388 err = percpu_counter_init(&sbi->s_dirs_counter,
3389 ext4_count_dirs(sb));
3390 }
3391 if (!err) {
3392 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3393 }
3394 if (err) {
3395 ext4_msg(sb, KERN_ERR, "insufficient memory");
3396 goto failed_mount3;
3397 }
3398
2927 sbi->s_stripe = ext4_get_stripe_size(sbi); 3399 sbi->s_stripe = ext4_get_stripe_size(sbi);
2928 sbi->s_max_writeback_mb_bump = 128; 3400 sbi->s_max_writeback_mb_bump = 128;
2929 3401
@@ -2965,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2965 "suppressed and not mounted read-only"); 3437 "suppressed and not mounted read-only");
2966 goto failed_mount_wq; 3438 goto failed_mount_wq;
2967 } else { 3439 } else {
2968 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3440 clear_opt(sb, DATA_FLAGS);
2969 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3441 set_opt(sb, WRITEBACK_DATA);
2970 sbi->s_journal = NULL; 3442 sbi->s_journal = NULL;
2971 needs_recovery = 0; 3443 needs_recovery = 0;
2972 goto no_journal; 3444 goto no_journal;
@@ -3004,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3004 */ 3476 */
3005 if (jbd2_journal_check_available_features 3477 if (jbd2_journal_check_available_features
3006 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3478 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3007 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3479 set_opt(sb, ORDERED_DATA);
3008 else 3480 else
3009 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3481 set_opt(sb, JOURNAL_DATA);
3010 break; 3482 break;
3011 3483
3012 case EXT4_MOUNT_ORDERED_DATA: 3484 case EXT4_MOUNT_ORDERED_DATA:
@@ -3022,22 +3494,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3022 } 3494 }
3023 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3495 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3024 3496
3025no_journal: 3497 /*
3026 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3498 * The journal may have updated the bg summary counts, so we
3027 ext4_count_free_blocks(sb)); 3499 * need to update the global counters.
3028 if (!err) 3500 */
3029 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3501 percpu_counter_set(&sbi->s_freeblocks_counter,
3030 ext4_count_free_inodes(sb)); 3502 ext4_count_free_blocks(sb));
3031 if (!err) 3503 percpu_counter_set(&sbi->s_freeinodes_counter,
3032 err = percpu_counter_init(&sbi->s_dirs_counter, 3504 ext4_count_free_inodes(sb));
3033 ext4_count_dirs(sb)); 3505 percpu_counter_set(&sbi->s_dirs_counter,
3034 if (!err) 3506 ext4_count_dirs(sb));
3035 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3036 if (err) {
3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
3038 goto failed_mount_wq;
3039 }
3040 3508
3509no_journal:
3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
3042 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3511 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3099,18 +3568,18 @@ no_journal:
3099 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3568 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3100 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3569 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3101 "requested data journaling mode"); 3570 "requested data journaling mode");
3102 clear_opt(sbi->s_mount_opt, DELALLOC); 3571 clear_opt(sb, DELALLOC);
3103 } 3572 }
3104 if (test_opt(sb, DIOREAD_NOLOCK)) { 3573 if (test_opt(sb, DIOREAD_NOLOCK)) {
3105 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3574 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3106 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3107 "option - requested data journaling mode"); 3576 "option - requested data journaling mode");
3108 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3577 clear_opt(sb, DIOREAD_NOLOCK);
3109 } 3578 }
3110 if (sb->s_blocksize < PAGE_SIZE) { 3579 if (sb->s_blocksize < PAGE_SIZE) {
3111 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3580 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3112 "option - block size is too small"); 3581 "option - block size is too small");
3113 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3582 clear_opt(sb, DIOREAD_NOLOCK);
3114 } 3583 }
3115 } 3584 }
3116 3585
@@ -3129,6 +3598,10 @@ no_journal:
3129 goto failed_mount4; 3598 goto failed_mount4;
3130 } 3599 }
3131 3600
3601 err = ext4_register_li_request(sb, first_not_zeroed);
3602 if (err)
3603 goto failed_mount4;
3604
3132 sbi->s_kobj.kset = ext4_kset; 3605 sbi->s_kobj.kset = ext4_kset;
3133 init_completion(&sbi->s_kobj_unregister); 3606 init_completion(&sbi->s_kobj_unregister);
3134 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3607 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3166,7 +3639,6 @@ no_journal:
3166 if (es->s_error_count) 3639 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3640 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3168 3641
3169 lock_kernel();
3170 kfree(orig_data); 3642 kfree(orig_data);
3171 return 0; 3643 return 0;
3172 3644
@@ -3184,10 +3656,6 @@ failed_mount_wq:
3184 jbd2_journal_destroy(sbi->s_journal); 3656 jbd2_journal_destroy(sbi->s_journal);
3185 sbi->s_journal = NULL; 3657 sbi->s_journal = NULL;
3186 } 3658 }
3187 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3188 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3189 percpu_counter_destroy(&sbi->s_dirs_counter);
3190 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3191failed_mount3: 3659failed_mount3:
3192 if (sbi->s_flex_groups) { 3660 if (sbi->s_flex_groups) {
3193 if (is_vmalloc_addr(sbi->s_flex_groups)) 3661 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3195,6 +3663,10 @@ failed_mount3:
3195 else 3663 else
3196 kfree(sbi->s_flex_groups); 3664 kfree(sbi->s_flex_groups);
3197 } 3665 }
3666 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3667 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3668 percpu_counter_destroy(&sbi->s_dirs_counter);
3669 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3198failed_mount2: 3670failed_mount2:
3199 for (i = 0; i < db_count; i++) 3671 for (i = 0; i < db_count; i++)
3200 brelse(sbi->s_group_desc[i]); 3672 brelse(sbi->s_group_desc[i]);
@@ -3213,7 +3685,6 @@ out_fail:
3213 sb->s_fs_info = NULL; 3685 sb->s_fs_info = NULL;
3214 kfree(sbi->s_blockgroup_lock); 3686 kfree(sbi->s_blockgroup_lock);
3215 kfree(sbi); 3687 kfree(sbi);
3216 lock_kernel();
3217out_free_orig: 3688out_free_orig:
3218 kfree(orig_data); 3689 kfree(orig_data);
3219 return ret; 3690 return ret;
@@ -3306,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3306 if (bdev == NULL) 3777 if (bdev == NULL)
3307 return NULL; 3778 return NULL;
3308 3779
3309 if (bd_claim(bdev, sb)) {
3310 ext4_msg(sb, KERN_ERR,
3311 "failed to claim external journal device");
3312 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3313 return NULL;
3314 }
3315
3316 blocksize = sb->s_blocksize; 3780 blocksize = sb->s_blocksize;
3317 hblock = bdev_logical_block_size(bdev); 3781 hblock = bdev_logical_block_size(bdev);
3318 if (blocksize < hblock) { 3782 if (blocksize < hblock) {
@@ -3470,7 +3934,7 @@ static int ext4_load_journal(struct super_block *sb,
3470 EXT4_SB(sb)->s_journal = journal; 3934 EXT4_SB(sb)->s_journal = journal;
3471 ext4_clear_journal_err(sb, es); 3935 ext4_clear_journal_err(sb, es);
3472 3936
3473 if (journal_devnum && 3937 if (!really_read_only && journal_devnum &&
3474 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3938 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3475 es->s_journal_dev = cpu_to_le32(journal_devnum); 3939 es->s_journal_dev = cpu_to_le32(journal_devnum);
3476 3940
@@ -3524,9 +3988,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3524 es->s_kbytes_written = 3988 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 3989 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3990 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3527 &EXT4_SB(sb)->s_freeblocks_counter)); 3991 &EXT4_SB(sb)->s_freeblocks_counter));
3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3992 es->s_free_inodes_count =
3529 &EXT4_SB(sb)->s_freeinodes_counter)); 3993 cpu_to_le32(percpu_counter_sum_positive(
3994 &EXT4_SB(sb)->s_freeinodes_counter));
3530 sb->s_dirt = 0; 3995 sb->s_dirt = 0;
3531 BUFFER_TRACE(sbh, "marking dirty"); 3996 BUFFER_TRACE(sbh, "marking dirty");
3532 mark_buffer_dirty(sbh); 3997 mark_buffer_dirty(sbh);
@@ -3706,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb)
3706 return 0; 4171 return 0;
3707} 4172}
3708 4173
4174/*
4175 * Structure to save mount options for ext4_remount's benefit
4176 */
4177struct ext4_mount_options {
4178 unsigned long s_mount_opt;
4179 unsigned long s_mount_opt2;
4180 uid_t s_resuid;
4181 gid_t s_resgid;
4182 unsigned long s_commit_interval;
4183 u32 s_min_batch_time, s_max_batch_time;
4184#ifdef CONFIG_QUOTA
4185 int s_jquota_fmt;
4186 char *s_qf_names[MAXQUOTAS];
4187#endif
4188};
4189
3709static int ext4_remount(struct super_block *sb, int *flags, char *data) 4190static int ext4_remount(struct super_block *sb, int *flags, char *data)
3710{ 4191{
3711 struct ext4_super_block *es; 4192 struct ext4_super_block *es;
@@ -3722,12 +4203,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3722#endif 4203#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL); 4204 char *orig_data = kstrdup(data, GFP_KERNEL);
3724 4205
3725 lock_kernel();
3726
3727 /* Store the original options */ 4206 /* Store the original options */
3728 lock_super(sb); 4207 lock_super(sb);
3729 old_sb_flags = sb->s_flags; 4208 old_sb_flags = sb->s_flags;
3730 old_opts.s_mount_opt = sbi->s_mount_opt; 4209 old_opts.s_mount_opt = sbi->s_mount_opt;
4210 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
3731 old_opts.s_resuid = sbi->s_resuid; 4211 old_opts.s_resuid = sbi->s_resuid;
3732 old_opts.s_resgid = sbi->s_resgid; 4212 old_opts.s_resgid = sbi->s_resgid;
3733 old_opts.s_commit_interval = sbi->s_commit_interval; 4213 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -3846,6 +4326,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3846 enable_quota = 1; 4326 enable_quota = 1;
3847 } 4327 }
3848 } 4328 }
4329
4330 /*
4331 * Reinitialize the lazy itable initialization thread based on
4332 * current settings
4333 */
4334 if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4335 ext4_unregister_li_request(sb);
4336 else {
4337 ext4_group_t first_not_zeroed;
4338 first_not_zeroed = ext4_has_uninit_itable(sb);
4339 ext4_register_li_request(sb, first_not_zeroed);
4340 }
4341
3849 ext4_setup_system_zone(sb); 4342 ext4_setup_system_zone(sb);
3850 if (sbi->s_journal == NULL) 4343 if (sbi->s_journal == NULL)
3851 ext4_commit_super(sb, 1); 4344 ext4_commit_super(sb, 1);
@@ -3858,7 +4351,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3858 kfree(old_opts.s_qf_names[i]); 4351 kfree(old_opts.s_qf_names[i]);
3859#endif 4352#endif
3860 unlock_super(sb); 4353 unlock_super(sb);
3861 unlock_kernel();
3862 if (enable_quota) 4354 if (enable_quota)
3863 dquot_resume(sb, -1); 4355 dquot_resume(sb, -1);
3864 4356
@@ -3869,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3869restore_opts: 4361restore_opts:
3870 sb->s_flags = old_sb_flags; 4362 sb->s_flags = old_sb_flags;
3871 sbi->s_mount_opt = old_opts.s_mount_opt; 4363 sbi->s_mount_opt = old_opts.s_mount_opt;
4364 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
3872 sbi->s_resuid = old_opts.s_resuid; 4365 sbi->s_resuid = old_opts.s_resuid;
3873 sbi->s_resgid = old_opts.s_resgid; 4366 sbi->s_resgid = old_opts.s_resgid;
3874 sbi->s_commit_interval = old_opts.s_commit_interval; 4367 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -3884,7 +4377,6 @@ restore_opts:
3884 } 4377 }
3885#endif 4378#endif
3886 unlock_super(sb); 4379 unlock_super(sb);
3887 unlock_kernel();
3888 kfree(orig_data); 4380 kfree(orig_data);
3889 return err; 4381 return err;
3890} 4382}
@@ -4066,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4066 * Standard function to be called on quota_on 4558 * Standard function to be called on quota_on
4067 */ 4559 */
4068static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4560static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4069 char *name) 4561 struct path *path)
4070{ 4562{
4071 int err; 4563 int err;
4072 struct path path;
4073 4564
4074 if (!test_opt(sb, QUOTA)) 4565 if (!test_opt(sb, QUOTA))
4075 return -EINVAL; 4566 return -EINVAL;
4076 4567
4077 err = kern_path(name, LOOKUP_FOLLOW, &path);
4078 if (err)
4079 return err;
4080
4081 /* Quotafile not on the same filesystem? */ 4568 /* Quotafile not on the same filesystem? */
4082 if (path.mnt->mnt_sb != sb) { 4569 if (path->mnt->mnt_sb != sb)
4083 path_put(&path);
4084 return -EXDEV; 4570 return -EXDEV;
4085 }
4086 /* Journaling quota? */ 4571 /* Journaling quota? */
4087 if (EXT4_SB(sb)->s_qf_names[type]) { 4572 if (EXT4_SB(sb)->s_qf_names[type]) {
4088 /* Quotafile not in fs root? */ 4573 /* Quotafile not in fs root? */
4089 if (path.dentry->d_parent != sb->s_root) 4574 if (path->dentry->d_parent != sb->s_root)
4090 ext4_msg(sb, KERN_WARNING, 4575 ext4_msg(sb, KERN_WARNING,
4091 "Quota file not on filesystem root. " 4576 "Quota file not on filesystem root. "
4092 "Journaled quota will not work"); 4577 "Journaled quota will not work");
@@ -4097,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4097 * all updates to the file when we bypass pagecache... 4582 * all updates to the file when we bypass pagecache...
4098 */ 4583 */
4099 if (EXT4_SB(sb)->s_journal && 4584 if (EXT4_SB(sb)->s_journal &&
4100 ext4_should_journal_data(path.dentry->d_inode)) { 4585 ext4_should_journal_data(path->dentry->d_inode)) {
4101 /* 4586 /*
4102 * We don't need to lock updates but journal_flush() could 4587 * We don't need to lock updates but journal_flush() could
4103 * otherwise be livelocked... 4588 * otherwise be livelocked...
@@ -4105,25 +4590,19 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4105 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4590 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4106 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4591 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4107 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4592 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4108 if (err) { 4593 if (err)
4109 path_put(&path);
4110 return err; 4594 return err;
4111 }
4112 } 4595 }
4113 4596
4114 err = dquot_quota_on_path(sb, type, format_id, &path); 4597 return dquot_quota_on(sb, type, format_id, path);
4115 path_put(&path);
4116 return err;
4117} 4598}
4118 4599
4119static int ext4_quota_off(struct super_block *sb, int type) 4600static int ext4_quota_off(struct super_block *sb, int type)
4120{ 4601{
4121 /* Force all delayed allocation blocks to be allocated */ 4602 /* Force all delayed allocation blocks to be allocated.
4122 if (test_opt(sb, DELALLOC)) { 4603 * Caller already holds s_umount sem */
4123 down_read(&sb->s_umount); 4604 if (test_opt(sb, DELALLOC))
4124 sync_filesystem(sb); 4605 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127 4606
4128 return dquot_quota_off(sb, type); 4607 return dquot_quota_off(sb, type);
4129} 4608}
@@ -4229,17 +4708,17 @@ out:
4229 4708
4230#endif 4709#endif
4231 4710
4232static int ext4_get_sb(struct file_system_type *fs_type, int flags, 4711static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4233 const char *dev_name, void *data, struct vfsmount *mnt) 4712 const char *dev_name, void *data)
4234{ 4713{
4235 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4714 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4236} 4715}
4237 4716
4238#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4717#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4239static struct file_system_type ext2_fs_type = { 4718static struct file_system_type ext2_fs_type = {
4240 .owner = THIS_MODULE, 4719 .owner = THIS_MODULE,
4241 .name = "ext2", 4720 .name = "ext2",
4242 .get_sb = ext4_get_sb, 4721 .mount = ext4_mount,
4243 .kill_sb = kill_block_super, 4722 .kill_sb = kill_block_super,
4244 .fs_flags = FS_REQUIRES_DEV, 4723 .fs_flags = FS_REQUIRES_DEV,
4245}; 4724};
@@ -4284,28 +4763,58 @@ static inline void unregister_as_ext3(void) { }
4284static struct file_system_type ext4_fs_type = { 4763static struct file_system_type ext4_fs_type = {
4285 .owner = THIS_MODULE, 4764 .owner = THIS_MODULE,
4286 .name = "ext4", 4765 .name = "ext4",
4287 .get_sb = ext4_get_sb, 4766 .mount = ext4_mount,
4288 .kill_sb = kill_block_super, 4767 .kill_sb = kill_block_super,
4289 .fs_flags = FS_REQUIRES_DEV, 4768 .fs_flags = FS_REQUIRES_DEV,
4290}; 4769};
4291 4770
4292static int __init init_ext4_fs(void) 4771int __init ext4_init_feat_adverts(void)
4772{
4773 struct ext4_features *ef;
4774 int ret = -ENOMEM;
4775
4776 ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4777 if (!ef)
4778 goto out;
4779
4780 ef->f_kobj.kset = ext4_kset;
4781 init_completion(&ef->f_kobj_unregister);
4782 ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4783 "features");
4784 if (ret) {
4785 kfree(ef);
4786 goto out;
4787 }
4788
4789 ext4_feat = ef;
4790 ret = 0;
4791out:
4792 return ret;
4793}
4794
4795static int __init ext4_init_fs(void)
4293{ 4796{
4294 int err; 4797 int err;
4295 4798
4296 ext4_check_flag_values(); 4799 ext4_check_flag_values();
4297 err = init_ext4_system_zone(); 4800 err = ext4_init_pageio();
4298 if (err) 4801 if (err)
4299 return err; 4802 return err;
4803 err = ext4_init_system_zone();
4804 if (err)
4805 goto out5;
4300 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4806 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4301 if (!ext4_kset) 4807 if (!ext4_kset)
4302 goto out4; 4808 goto out4;
4303 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4809 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4304 err = init_ext4_mballoc(); 4810
4811 err = ext4_init_feat_adverts();
4812
4813 err = ext4_init_mballoc();
4305 if (err) 4814 if (err)
4306 goto out3; 4815 goto out3;
4307 4816
4308 err = init_ext4_xattr(); 4817 err = ext4_init_xattr();
4309 if (err) 4818 if (err)
4310 goto out2; 4819 goto out2;
4311 err = init_inodecache(); 4820 err = init_inodecache();
@@ -4316,38 +4825,46 @@ static int __init init_ext4_fs(void)
4316 err = register_filesystem(&ext4_fs_type); 4825 err = register_filesystem(&ext4_fs_type);
4317 if (err) 4826 if (err)
4318 goto out; 4827 goto out;
4828
4829 ext4_li_info = NULL;
4830 mutex_init(&ext4_li_mtx);
4319 return 0; 4831 return 0;
4320out: 4832out:
4321 unregister_as_ext2(); 4833 unregister_as_ext2();
4322 unregister_as_ext3(); 4834 unregister_as_ext3();
4323 destroy_inodecache(); 4835 destroy_inodecache();
4324out1: 4836out1:
4325 exit_ext4_xattr(); 4837 ext4_exit_xattr();
4326out2: 4838out2:
4327 exit_ext4_mballoc(); 4839 ext4_exit_mballoc();
4328out3: 4840out3:
4841 kfree(ext4_feat);
4329 remove_proc_entry("fs/ext4", NULL); 4842 remove_proc_entry("fs/ext4", NULL);
4330 kset_unregister(ext4_kset); 4843 kset_unregister(ext4_kset);
4331out4: 4844out4:
4332 exit_ext4_system_zone(); 4845 ext4_exit_system_zone();
4846out5:
4847 ext4_exit_pageio();
4333 return err; 4848 return err;
4334} 4849}
4335 4850
4336static void __exit exit_ext4_fs(void) 4851static void __exit ext4_exit_fs(void)
4337{ 4852{
4853 ext4_destroy_lazyinit_thread();
4338 unregister_as_ext2(); 4854 unregister_as_ext2();
4339 unregister_as_ext3(); 4855 unregister_as_ext3();
4340 unregister_filesystem(&ext4_fs_type); 4856 unregister_filesystem(&ext4_fs_type);
4341 destroy_inodecache(); 4857 destroy_inodecache();
4342 exit_ext4_xattr(); 4858 ext4_exit_xattr();
4343 exit_ext4_mballoc(); 4859 ext4_exit_mballoc();
4344 remove_proc_entry("fs/ext4", NULL); 4860 remove_proc_entry("fs/ext4", NULL);
4345 kset_unregister(ext4_kset); 4861 kset_unregister(ext4_kset);
4346 exit_ext4_system_zone(); 4862 ext4_exit_system_zone();
4863 ext4_exit_pageio();
4347} 4864}
4348 4865
4349MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4866MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
4350MODULE_DESCRIPTION("Fourth Extended Filesystem"); 4867MODULE_DESCRIPTION("Fourth Extended Filesystem");
4351MODULE_LICENSE("GPL"); 4868MODULE_LICENSE("GPL");
4352module_init(init_ext4_fs) 4869module_init(ext4_init_fs)
4353module_exit(exit_ext4_fs) 4870module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..fc32176eee39 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
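
(The rewritten ext4_xattr_list() above composes the two listings: in-inode attribute
names first, block-resident names appended after them, with the combined length or the
first error returned. The same flow in isolation, as a sketch with hypothetical callback
types that are not part of this patch:)

/* Illustrative composition of two listing callbacks (not kernel code):
 * a NULL buf means "just report the total length needed". */
static int list_two(int (*a)(char *, size_t), int (*b)(char *, size_t),
		    char *buf, size_t size)
{
	int ra, rb;

	ra = a(buf, size);
	if (ra < 0)
		return ra;
	if (buf) {
		buf += ra;
		size -= ra;
	}
	rb = b(buf, size);
	if (rb < 0)
		return rb;
	return ra + rb;
}
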
448 448
449/* 449/*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1588#undef BLOCK_HASH_SHIFT 1588#undef BLOCK_HASH_SHIFT
1589 1589
1590int __init 1590int __init
1591init_ext4_xattr(void) 1591ext4_init_xattr(void)
1592{ 1592{
1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1594 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
1597} 1597}
1598 1598
1599void 1599void
1600exit_ext4_xattr(void) 1600ext4_exit_xattr(void)
1601{ 1601{
1602 if (ext4_xattr_cache) 1602 if (ext4_xattr_cache)
1603 mb_cache_destroy(ext4_xattr_cache); 1603 mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..1ef16520b950 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
84 struct ext4_inode *raw_inode, handle_t *handle); 84 struct ext4_inode *raw_inode, handle_t *handle);
85 85
86extern int init_ext4_xattr(void); 86extern int __init ext4_init_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void ext4_exit_xattr(void);
88 88
89extern const struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
121{ 121{
122} 122}
123 123
124static inline int 124static __init inline int
125init_ext4_xattr(void) 125ext4_init_xattr(void)
126{ 126{
127 return 0; 127 return 0;
128} 128}
129 129
130static inline void 130static inline void
131exit_ext4_xattr(void) 131ext4_exit_xattr(void)
132{ 132{
133} 133}
134 134
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c28..f50408901f7e 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, int isvfat); 322 const struct inode_operations *fs_dir_inode_ops,
323 int isvfat, void (*setup)(struct super_block *));
323 324
324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
325 struct inode *i2); 326 struct inode *i2);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
577 577
578 sb_issue_discard(sb, 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl), 579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus); 580 nr_clus * sbi->sec_per_clus,
581 GFP_NOFS, 0);
581 582
582 first_cl = cluster; 583 first_cl = cluster;
583 } 584 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 830058057d33..86753fe10bd1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/seq_file.h> 17#include <linux/seq_file.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include <linux/mpage.h> 19#include <linux/mpage.h>
@@ -489,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
489{ 488{
490 struct msdos_sb_info *sbi = MSDOS_SB(sb); 489 struct msdos_sb_info *sbi = MSDOS_SB(sb);
491 490
492 lock_kernel();
493
494 if (sb->s_dirt) 491 if (sb->s_dirt)
495 fat_write_super(sb); 492 fat_write_super(sb);
496 493
@@ -504,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
504 501
505 sb->s_fs_info = NULL; 502 sb->s_fs_info = NULL;
506 kfree(sbi); 503 kfree(sbi);
507
508 unlock_kernel();
509} 504}
510 505
511static struct kmem_cache *fat_inode_cachep; 506static struct kmem_cache *fat_inode_cachep;
@@ -519,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
519 return &ei->vfs_inode; 514 return &ei->vfs_inode;
520} 515}
521 516
522static void fat_destroy_inode(struct inode *inode) 517static void fat_i_callback(struct rcu_head *head)
523{ 518{
519 struct inode *inode = container_of(head, struct inode, i_rcu);
520 INIT_LIST_HEAD(&inode->i_dentry);
524 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); 521 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
525} 522}
526 523
524static void fat_destroy_inode(struct inode *inode)
525{
526 call_rcu(&inode->i_rcu, fat_i_callback);
527}
528
527static void init_once(void *foo) 529static void init_once(void *foo)
528{ 530{
529 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; 531 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -701,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
701 struct fid *fid, int fh_len, int fh_type) 703 struct fid *fid, int fh_len, int fh_type)
702{ 704{
703 struct inode *inode = NULL; 705 struct inode *inode = NULL;
704 struct dentry *result;
705 u32 *fh = fid->raw; 706 u32 *fh = fid->raw;
706 707
707 if (fh_len < 5 || fh_type != 3) 708 if (fh_len < 5 || fh_type != 3)
@@ -746,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
746 * the fat_iget lookup again. If that fails, then we are totally out 747 * the fat_iget lookup again. If that fails, then we are totally out
747 * of luck. But all that is for another day 748 * of luck. But all that is for another day
748 */ 749 */
749 result = d_obtain_alias(inode); 750 return d_obtain_alias(inode);
750 if (!IS_ERR(result))
751 result->d_op = sb->s_root->d_op;
752 return result;
753} 751}
754 752
755static int 753static int
@@ -797,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
797 brelse(bh); 795 brelse(bh);
798 796
799 parent = d_obtain_alias(inode); 797 parent = d_obtain_alias(inode);
800 if (!IS_ERR(parent))
801 parent->d_op = sb->s_root->d_op;
802out: 798out:
803 unlock_super(sb); 799 unlock_super(sb);
804 800
@@ -1242,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
1242 * Read the super block of an MS-DOS FS. 1238 * Read the super block of an MS-DOS FS.
1243 */ 1239 */
1244int fat_fill_super(struct super_block *sb, void *data, int silent, 1240int fat_fill_super(struct super_block *sb, void *data, int silent,
1245 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1241 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1242 void (*setup)(struct super_block *))
1246{ 1243{
1247 struct inode *root_inode = NULL, *fat_inode = NULL; 1244 struct inode *root_inode = NULL, *fat_inode = NULL;
1248 struct buffer_head *bh; 1245 struct buffer_head *bh;
@@ -1278,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1278 if (error) 1275 if (error)
1279 goto out_fail; 1276 goto out_fail;
1280 1277
1278 setup(sb); /* flavour-specific stuff that needs options */
1279
1281 error = -EIO; 1280 error = -EIO;
1282 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1283 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
255 255
256 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
257 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
258 if (buffer_eopnotsupp(bhs[i])) { 258 if (!err && !buffer_uptodate(bhs[i]))
259 clear_buffer_eopnotsupp(bhs[i]);
260 err = -EOPNOTSUPP;
261 } else if (!err && !buffer_uptodate(bhs[i]))
262 err = -EIO; 259 err = -EIO;
263 } 260 }
264 return err; 261 return err;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd77..711499040eb6 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(struct dentry *dentry, struct qstr *qstr) 151static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
152 struct qstr *qstr)
152{ 153{
153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
154 unsigned char msdos_name[MSDOS_NAME]; 155 unsigned char msdos_name[MSDOS_NAME];
@@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
164 * Compare two msdos names. If either of the names are invalid, 165 * Compare two msdos names. If either of the names are invalid,
165 * we fall back to doing the standard name comparison. 166 * we fall back to doing the standard name comparison.
166 */ 167 */
167static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name)
168{ 171{
169 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
170 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; 173 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
171 int error; 174 int error;
172 175
173 error = msdos_format_name(a->name, a->len, a_msdos_name, options); 176 error = msdos_format_name(name->name, name->len, a_msdos_name, options);
174 if (error) 177 if (error)
175 goto old_compare; 178 goto old_compare;
176 error = msdos_format_name(b->name, b->len, b_msdos_name, options); 179 error = msdos_format_name(str, len, b_msdos_name, options);
177 if (error) 180 if (error)
178 goto old_compare; 181 goto old_compare;
179 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); 182 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +185,8 @@ out:
182 185
183old_compare: 186old_compare:
184 error = 1; 187 error = 1;
185 if (a->len == b->len) 188 if (name->len == len)
186 error = memcmp(a->name, b->name, a->len); 189 error = memcmp(name->name, str, len);
187 goto out; 190 goto out;
188} 191}
189 192
@@ -224,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
224 } 227 }
225out: 228out:
226 unlock_super(sb); 229 unlock_super(sb);
227 dentry->d_op = &msdos_dentry_operations; 230 return d_splice_alias(inode, dentry);
228 dentry = d_splice_alias(inode, dentry);
229 if (dentry)
230 dentry->d_op = &msdos_dentry_operations;
231 return dentry;
232 231
233error: 232error:
234 unlock_super(sb); 233 unlock_super(sb);
@@ -658,31 +657,29 @@ static const struct inode_operations msdos_dir_inode_operations = {
658 .getattr = fat_getattr, 657 .getattr = fat_getattr,
659}; 658};
660 659
661static int msdos_fill_super(struct super_block *sb, void *data, int silent) 660static void setup(struct super_block *sb)
662{ 661{
663 int res; 662 sb->s_d_op = &msdos_dentry_operations;
664
665 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
666 if (res)
667 return res;
668
669 sb->s_flags |= MS_NOATIME; 663 sb->s_flags |= MS_NOATIME;
670 sb->s_root->d_op = &msdos_dentry_operations;
671 return 0;
672} 664}
673 665
674static int msdos_get_sb(struct file_system_type *fs_type, 666static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
669 0, setup);
670}
671
672static struct dentry *msdos_mount(struct file_system_type *fs_type,
675 int flags, const char *dev_name, 673 int flags, const char *dev_name,
676 void *data, struct vfsmount *mnt) 674 void *data)
677{ 675{
678 return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, 676 return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
679 mnt);
680} 677}
681 678
682static struct file_system_type msdos_fs_type = { 679static struct file_system_type msdos_fs_type = {
683 .owner = THIS_MODULE, 680 .owner = THIS_MODULE,
684 .name = "msdos", 681 .name = "msdos",
685 .get_sb = msdos_get_sb, 682 .mount = msdos_mount,
686 .kill_sb = kill_block_super, 683 .kill_sb = kill_block_super,
687 .fs_flags = FS_REQUIRES_DEV, 684 .fs_flags = FS_REQUIRES_DEV,
688}; 685};
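
Both FAT flavours now publish their dentry operations through the superblock instead of tagging each dentry by hand in ->lookup(): fat_fill_super() gains a setup() callback that assigns sb->s_d_op, the VFS attaches those ops to every dentry allocated under that superblock, and the open-coded d_op assignments around d_splice_alias() above disappear. A minimal userspace model of the s_d_op idea; all of the types and the d_alloc() helper here are stand-ins, not the kernel's:

    #include <stdio.h>
    #include <stdlib.h>

    /* Model: the superblock carries a default ops pointer, and every
     * dentry allocated against it inherits those ops, so lookup paths
     * no longer assign d_op by hand. */

    struct dentry_operations { const char *name; };

    struct super_block {
        const struct dentry_operations *s_d_op;
    };

    struct dentry {
        const struct dentry_operations *d_op;
        struct super_block *d_sb;
    };

    static struct dentry *d_alloc(struct super_block *sb)
    {
        struct dentry *d = calloc(1, sizeof(*d));

        if (!d)
            return NULL;
        d->d_sb = sb;
        d->d_op = sb->s_d_op;   /* inherited at allocation time */
        return d;
    }

    int main(void)
    {
        static const struct dentry_operations msdos_ops = { "msdos" };
        struct super_block sb = { .s_d_op = &msdos_ops };
        struct dentry *d = d_alloc(&sb);

        if (!d)
            return 1;
        printf("dentry ops: %s\n", d->d_op->name);
        free(d);
        return 0;
    }

Because the ops are inherited at allocation time, there is never a window in which a dentry exists without its operations, which matters once lookups can run under RCU.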
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fbaa..f88f752babd9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU)
47 return -ECHILD;
48
46 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
47 if (dentry->d_inode) 50 if (dentry->d_inode)
48 return 1; 51 return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
51 54
52static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
53{ 56{
57 if (nd->flags & LOOKUP_RCU)
58 return -ECHILD;
59
54 /* 60 /*
55 * This is not negative dentry. Always valid. 61 * This is not negative dentry. Always valid.
56 * 62 *
@@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
85} 91}
86 92
87/* returns the length of a struct qstr, ignoring trailing dots */ 93/* returns the length of a struct qstr, ignoring trailing dots */
88static unsigned int vfat_striptail_len(struct qstr *qstr) 94static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
89{ 95{
90 unsigned int len = qstr->len; 96 while (len && name[len - 1] == '.')
91
92 while (len && qstr->name[len - 1] == '.')
93 len--; 97 len--;
94 return len; 98 return len;
95} 99}
96 100
101static unsigned int vfat_striptail_len(const struct qstr *qstr)
102{
103 return __vfat_striptail_len(qstr->len, qstr->name);
104}
105
97/* 106/*
98 * Compute the hash for the vfat name corresponding to the dentry. 107 * Compute the hash for the vfat name corresponding to the dentry.
99 * Note: if the name is invalid, we leave the hash code unchanged so 108 * Note: if the name is invalid, we leave the hash code unchanged so
100 * that the existing dentry can be used. The vfat fs routines will 109 * that the existing dentry can be used. The vfat fs routines will
101 * return ENOENT or EINVAL as appropriate. 110 * return ENOENT or EINVAL as appropriate.
102 */ 111 */
103static int vfat_hash(struct dentry *dentry, struct qstr *qstr) 112static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *qstr)
104{ 114{
105 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 115 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
106 return 0; 116 return 0;
@@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
112 * that the existing dentry can be used. The vfat fs routines will 122 * that the existing dentry can be used. The vfat fs routines will
113 * return ENOENT or EINVAL as appropriate. 123 * return ENOENT or EINVAL as appropriate.
114 */ 124 */
115static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) 125static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
126 struct qstr *qstr)
116{ 127{
117 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 128 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
118 const unsigned char *name; 129 const unsigned char *name;
119 unsigned int len; 130 unsigned int len;
120 unsigned long hash; 131 unsigned long hash;
@@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
133/* 144/*
134 * Case insensitive compare of two vfat names. 145 * Case insensitive compare of two vfat names.
135 */ 146 */
136static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) 147static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
148 const struct dentry *dentry, const struct inode *inode,
149 unsigned int len, const char *str, const struct qstr *name)
137{ 150{
138 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 151 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
139 unsigned int alen, blen; 152 unsigned int alen, blen;
140 153
141 /* A filename cannot end in '.' or we treat it like it has none */ 154 /* A filename cannot end in '.' or we treat it like it has none */
142 alen = vfat_striptail_len(a); 155 alen = vfat_striptail_len(name);
143 blen = vfat_striptail_len(b); 156 blen = __vfat_striptail_len(len, str);
144 if (alen == blen) { 157 if (alen == blen) {
145 if (nls_strnicmp(t, a->name, b->name, alen) == 0) 158 if (nls_strnicmp(t, name->name, str, alen) == 0)
146 return 0; 159 return 0;
147 } 160 }
148 return 1; 161 return 1;
@@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
151/* 164/*
152 * Case sensitive compare of two vfat names. 165 * Case sensitive compare of two vfat names.
153 */ 166 */
154static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 167static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
168 const struct dentry *dentry, const struct inode *inode,
169 unsigned int len, const char *str, const struct qstr *name)
155{ 170{
156 unsigned int alen, blen; 171 unsigned int alen, blen;
157 172
158 /* A filename cannot end in '.' or we treat it like it has none */ 173 /* A filename cannot end in '.' or we treat it like it has none */
159 alen = vfat_striptail_len(a); 174 alen = vfat_striptail_len(name);
160 blen = vfat_striptail_len(b); 175 blen = __vfat_striptail_len(len, str);
161 if (alen == blen) { 176 if (alen == blen) {
162 if (strncmp(a->name, b->name, alen) == 0) 177 if (strncmp(name->name, str, alen) == 0)
163 return 0; 178 return 0;
164 } 179 }
165 return 1; 180 return 1;
@@ -757,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
757 772
758out: 773out:
759 unlock_super(sb); 774 unlock_super(sb);
760 dentry->d_op = sb->s_root->d_op;
761 dentry->d_time = dentry->d_parent->d_inode->i_version; 775 dentry->d_time = dentry->d_parent->d_inode->i_version;
762 dentry = d_splice_alias(inode, dentry); 776 dentry = d_splice_alias(inode, dentry);
763 if (dentry) { 777 if (dentry)
764 dentry->d_op = sb->s_root->d_op;
765 dentry->d_time = dentry->d_parent->d_inode->i_version; 778 dentry->d_time = dentry->d_parent->d_inode->i_version;
766 }
767 return dentry; 779 return dentry;
768 780
769error: 781error:
@@ -1051,34 +1063,31 @@ static const struct inode_operations vfat_dir_inode_operations = {
1051 .getattr = fat_getattr, 1063 .getattr = fat_getattr,
1052}; 1064};
1053 1065
1054static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1066static void setup(struct super_block *sb)
1055{ 1067{
1056 int res;
1057
1058 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1059 if (res)
1060 return res;
1061
1062 if (MSDOS_SB(sb)->options.name_check != 's') 1068 if (MSDOS_SB(sb)->options.name_check != 's')
1063 sb->s_root->d_op = &vfat_ci_dentry_ops; 1069 sb->s_d_op = &vfat_ci_dentry_ops;
1064 else 1070 else
1065 sb->s_root->d_op = &vfat_dentry_ops; 1071 sb->s_d_op = &vfat_dentry_ops;
1072}
1066 1073
1067 return 0; 1074static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1075{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
1077 1, setup);
1068} 1078}
1069 1079
1070static int vfat_get_sb(struct file_system_type *fs_type, 1080static struct dentry *vfat_mount(struct file_system_type *fs_type,
1071 int flags, const char *dev_name, 1081 int flags, const char *dev_name,
1072 void *data, struct vfsmount *mnt) 1082 void *data)
1073{ 1083{
1074 return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, 1084 return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
1075 mnt);
1076} 1085}
1077 1086
1078static struct file_system_type vfat_fs_type = { 1087static struct file_system_type vfat_fs_type = {
1079 .owner = THIS_MODULE, 1088 .owner = THIS_MODULE,
1080 .name = "vfat", 1089 .name = "vfat",
1081 .get_sb = vfat_get_sb, 1090 .mount = vfat_mount,
1082 .kill_sb = kill_block_super, 1091 .kill_sb = kill_block_super,
1083 .fs_flags = FS_REQUIRES_DEV, 1092 .fs_flags = FS_REQUIRES_DEV,
1084}; 1093};
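
Two things happen in the vfat conversion above: the d_revalidate hooks now return -ECHILD when invoked in RCU-walk mode (nd->flags & LOOKUP_RCU), telling the VFS to retry the lookup in ref-walk where sleeping is allowed, and d_compare receives the candidate name as a raw (len, str) pair alongside the qstr, which is why __vfat_striptail_len() is split out. The comparison rule itself, trailing dots ignored, as a standalone sketch (plain C, not the kernel code):

    #include <stdio.h>
    #include <string.h>

    /* Mirror of __vfat_striptail_len(): a name's trailing dots do not
     * count toward its length. */
    static unsigned int striptail_len(unsigned int len, const char *name)
    {
        while (len && name[len - 1] == '.')
            len--;
        return len;
    }

    /* Case-sensitive variant, shaped like vfat_cmp(): 0 on match. */
    static int vfat_style_cmp(const char *a, const char *b)
    {
        unsigned int alen = striptail_len(strlen(a), a);
        unsigned int blen = striptail_len(strlen(b), b);

        if (alen == blen && strncmp(a, b, alen) == 0)
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", vfat_style_cmp("FOO", "FOO..."));  /* 0: equal */
        printf("%d\n", vfat_style_cmp("FOO", "FOOD"));    /* 1: differ */
        return 0;
    }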
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..cb1026181bdc 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
640 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
641 * 641 *
642 */ 642 */
643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
644{ 644{
645 struct fasync_struct *fa, **fp; 645 struct fasync_struct *fa, **fp;
646 int result = 0; 646 int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
666 return result; 666 return result;
667} 667}
668 668
669struct fasync_struct *fasync_alloc(void)
670{
671 return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
672}
673
669/* 674/*
670 * Add a fasync entry. Return negative on error, positive if 675 * NOTE! This can be used only for unused fasync entries:
671 * added, and zero if did nothing but change an existing one. 676 * entries that actually got inserted on the fasync list
677 * need to be released by rcu - see fasync_remove_entry.
678 */
679void fasync_free(struct fasync_struct *new)
680{
681 kmem_cache_free(fasync_cache, new);
682}
683
684/*
685 * Insert a new entry into the fasync list. Return the pointer to the
686 * old one if we didn't use the new one.
672 * 687 *
673 * NOTE! It is very important that the FASYNC flag always 688 * NOTE! It is very important that the FASYNC flag always
674 * match the state "is the filp on a fasync list". 689 * match the state "is the filp on a fasync list".
675 */ 690 */
676static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) 691struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
677{ 692{
678 struct fasync_struct *new, *fa, **fp; 693 struct fasync_struct *fa, **fp;
679 int result = 0;
680
681 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
682 if (!new)
683 return -ENOMEM;
684 694
685 spin_lock(&filp->f_lock); 695 spin_lock(&filp->f_lock);
686 spin_lock(&fasync_lock); 696 spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
691 spin_lock_irq(&fa->fa_lock); 701 spin_lock_irq(&fa->fa_lock);
692 fa->fa_fd = fd; 702 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock); 703 spin_unlock_irq(&fa->fa_lock);
694
695 kmem_cache_free(fasync_cache, new);
696 goto out; 704 goto out;
697 } 705 }
698 706
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
702 new->fa_fd = fd; 710 new->fa_fd = fd;
703 new->fa_next = *fapp; 711 new->fa_next = *fapp;
704 rcu_assign_pointer(*fapp, new); 712 rcu_assign_pointer(*fapp, new);
705 result = 1;
706 filp->f_flags |= FASYNC; 713 filp->f_flags |= FASYNC;
707 714
708out: 715out:
709 spin_unlock(&fasync_lock); 716 spin_unlock(&fasync_lock);
710 spin_unlock(&filp->f_lock); 717 spin_unlock(&filp->f_lock);
711 return result; 718 return fa;
719}
720
721/*
722 * Add a fasync entry. Return negative on error, positive if
723 * added, and zero if did nothing but change an existing one.
724 */
725static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
726{
727 struct fasync_struct *new;
728
729 new = fasync_alloc();
730 if (!new)
731 return -ENOMEM;
732
733 /*
 734 * fasync_insert_entry() returns the old (updated) entry if
735 * it existed.
736 *
737 * So free the (unused) new entry and return 0 to let the
738 * caller know that we didn't add any new fasync entries.
739 */
740 if (fasync_insert_entry(fd, filp, fapp, new)) {
741 fasync_free(new);
742 return 0;
743 }
744
745 return 1;
712} 746}
713 747
714/* 748/*
@@ -781,7 +815,7 @@ static int __init fcntl_init(void)
781 __O_SYNC | O_DSYNC | FASYNC | 815 __O_SYNC | O_DSYNC | FASYNC |
782 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 816 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
783 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
784 FMODE_EXEC 818 __FMODE_EXEC
785 )); 819 ));
786 820
787 fasync_cache = kmem_cache_create("fasync_cache", 821 fasync_cache = kmem_cache_create("fasync_cache",
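
The fcntl.c change splits the old fasync_add_entry() into three building blocks, fasync_alloc(), fasync_insert_entry() and fasync_free(), so that callers can preallocate outside the locks and throw the allocation away when an existing entry was merely updated. The preallocate-then-maybe-free pattern in miniature; the list, the locking and the names here are illustrative stand-ins:

    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        int fd;
        struct entry *next;
    };

    static struct entry *head;  /* protected by a lock in real code */

    /* Shaped like fasync_insert_entry(): returns the existing entry if
     * fd was already present (leaving new unused), NULL if new was
     * linked in. */
    static struct entry *insert_entry(int fd, struct entry *new)
    {
        struct entry *e;

        for (e = head; e; e = e->next) {
            if (e->fd == fd)
                return e;   /* updated in place, new unused */
        }
        new->fd = fd;
        new->next = head;
        head = new;
        return NULL;
    }

    /* Shaped like fasync_add_entry(): negative on error, 1 if added,
     * 0 if an existing entry was updated. */
    static int add_entry(int fd)
    {
        struct entry *new = malloc(sizeof(*new)); /* outside the lock */

        if (!new)
            return -1;
        if (insert_entry(fd, new)) {
            free(new);  /* safe: never visible to other threads */
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        printf("%d\n", add_entry(3));   /* 1: added */
        printf("%d\n", add_entry(3));   /* 0: updated existing */
        return 0;
    }

The comment above fasync_free() is the key constraint: only entries that were never published on the list may be freed directly; anything that was visible to readers has to go through the RCU path in fasync_remove_entry().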
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d2..4e303c22d5ee 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
151 */ 151 */
152const struct file_operations def_fifo_fops = { 152const struct file_operations def_fifo_fops = {
153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */ 153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */
154 .llseek = noop_llseek,
154}; 155};
diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..eb36b6b17e26 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
60/* 60/*
61 * Return the total number of open files in the system 61 * Return the total number of open files in the system
62 */ 62 */
63static int get_nr_files(void) 63static long get_nr_files(void)
64{ 64{
65 return percpu_counter_read_positive(&nr_files); 65 return percpu_counter_read_positive(&nr_files);
66} 66}
@@ -68,7 +68,7 @@ static int get_nr_files(void)
68/* 68/*
69 * Return the maximum number of open files in the system 69 * Return the maximum number of open files in the system
70 */ 70 */
71int get_max_files(void) 71unsigned long get_max_files(void)
72{ 72{
73 return files_stat.max_files; 73 return files_stat.max_files;
74} 74}
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
82 void __user *buffer, size_t *lenp, loff_t *ppos) 82 void __user *buffer, size_t *lenp, loff_t *ppos)
83{ 83{
84 files_stat.nr_files = get_nr_files(); 84 files_stat.nr_files = get_nr_files();
85 return proc_dointvec(table, write, buffer, lenp, ppos); 85 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
86} 86}
87#else 87#else
88int proc_nr_files(ctl_table *table, int write, 88int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
105struct file *get_empty_filp(void) 105struct file *get_empty_filp(void)
106{ 106{
107 const struct cred *cred = current_cred(); 107 const struct cred *cred = current_cred();
108 static int old_max; 108 static long old_max;
109 struct file * f; 109 struct file * f;
110 110
111 /* 111 /*
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
140over: 140over:
141 /* Ran out of filps - report that */ 141 /* Ran out of filps - report that */
142 if (get_nr_files() > old_max) { 142 if (get_nr_files() > old_max) {
143 printk(KERN_INFO "VFS: file-max limit %d reached\n", 143 pr_info("VFS: file-max limit %lu reached\n", get_max_files());
144 get_max_files());
145 old_max = get_nr_files(); 144 old_max = get_nr_files();
146 } 145 }
147 goto fail; 146 goto fail;
@@ -312,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
312 struct files_struct *files = current->files; 311 struct files_struct *files = current->files;
313 312
314 *fput_needed = 0; 313 *fput_needed = 0;
315 if (likely((atomic_read(&files->count) == 1))) { 314 if (atomic_read(&files->count) == 1) {
316 file = fcheck_files(files, fd); 315 file = fcheck_files(files, fd);
317 } else { 316 } else {
318 rcu_read_lock(); 317 rcu_read_lock();
@@ -487,7 +486,7 @@ retry:
487 486
488void __init files_init(unsigned long mempages) 487void __init files_init(unsigned long mempages)
489{ 488{
490 int n; 489 unsigned long n;
491 490
492 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 491 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
493 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 492 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
498 */ 497 */
499 498
500 n = (mempages * (PAGE_SIZE / 1024)) / 10; 499 n = (mempages * (PAGE_SIZE / 1024)) / 10;
501 files_stat.max_files = n; 500 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
502 if (files_stat.max_files < NR_FILE)
503 files_stat.max_files = NR_FILE;
504 files_defer_init(); 501 files_defer_init();
505 lg_lock_init(files_lglock); 502 lg_lock_init(files_lglock);
506 percpu_counter_init(&nr_files, 0); 503 percpu_counter_init(&nr_files, 0);
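
The type changes in file_table.c guard against overflow rather than style: files_init() sizes the default file-max at roughly one file per 10 KiB of RAM, which exceeds INT_MAX once a machine has enough memory, so the counters become long and the sysctl handler switches to proc_doulongvec_minmax() to match. A quick check of the arithmetic (page size and memory size are illustrative, and a 64-bit unsigned long is assumed):

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long tib = 32;                     /* RAM in TiB */
        unsigned long mempages = tib << (40 - 12);  /* 4 KiB pages */

        /* Mirrors files_init(): n = mempages * (PAGE_SIZE/1024) / 10 */
        unsigned long n = (mempages * (page_size / 1024)) / 10;

        printf("default file-max: %lu\n", n);
        printf("fits in a 32-bit int: %s\n", n <= INT_MAX ? "yes" : "no");
        return 0;
    }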
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8eef..751d6b255a12 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
115 tmp = &(*tmp)->next; 115 tmp = &(*tmp)->next;
116 } 116 }
117 write_unlock(&file_systems_lock); 117 write_unlock(&file_systems_lock);
118
119 synchronize_rcu();
120
118 return -EINVAL; 121 return -EINVAL;
119} 122}
120 123
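
The synchronize_rcu() added to unregister_filesystem() makes the call wait until every lockless reader that might still be walking the file_systems list has finished, so the caller can safely free the file_system_type afterwards (typically by unloading its module). A toy model of that wait-for-preexisting-readers ordering, with a rwlock standing in for RCU; kernel RCU readers take no lock at all, this only mimics the ordering (build with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t gp = PTHREAD_RWLOCK_INITIALIZER;

    static void *reader(void *arg)
    {
        (void)arg;
        pthread_rwlock_rdlock(&gp);   /* rcu_read_lock() */
        /* ... walk the file_systems list without the write lock ... */
        pthread_rwlock_unlock(&gp);   /* rcu_read_unlock() */
        return NULL;
    }

    /* Stand-in for synchronize_rcu(): returns only once all readers
     * that were already inside the critical section have left. */
    static void synchronize(void)
    {
        pthread_rwlock_wrlock(&gp);
        pthread_rwlock_unlock(&gp);
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, reader, NULL);
        /* 1. unlink the entry from the list */
        synchronize();  /* 2. grace period */
        /* 3. no reader can still hold a pointer to the entry */
        pthread_join(t, NULL);
        puts("safe to free after the grace period");
        return 0;
    }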
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..2ba6719ac612 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
260 struct inode *ip = NULL; 260 struct inode *ip = NULL;
261 261
262 if ((ip = new_inode(sbp))) { 262 if ((ip = new_inode(sbp))) {
263 ip->i_ino = get_next_ino();
263 vxfs_iinit(ip, vip); 264 vxfs_iinit(ip, vip);
264 ip->i_mapping->a_ops = &vxfs_aops; 265 ip->i_mapping->a_ops = &vxfs_aops;
265 } 266 }
@@ -336,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
336 return ip; 337 return ip;
337} 338}
338 339
340static void vxfs_i_callback(struct rcu_head *head)
341{
342 struct inode *inode = container_of(head, struct inode, i_rcu);
343 INIT_LIST_HEAD(&inode->i_dentry);
344 kmem_cache_free(vxfs_inode_cachep, inode->i_private);
345}
346
339/** 347/**
340 * vxfs_evict_inode - remove inode from main memory 348 * vxfs_evict_inode - remove inode from main memory
341 * @ip: inode to discard. 349 * @ip: inode to discard.
@@ -349,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
349{ 357{
350 truncate_inode_pages(&ip->i_data, 0); 358 truncate_inode_pages(&ip->i_data, 0);
351 end_writeback(ip); 359 end_writeback(ip);
352 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 360 call_rcu(&ip->i_rcu, vxfs_i_callback);
353} 361}
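
vxfs_evict_inode() now defers the kmem_cache_free() through call_rcu(), so a lockless path walker that still holds a pointer to the inode cannot step on freed memory; the callback gets the inode back from the embedded rcu_head with container_of(). That recovery step in isolation, with the struct layouts and the macro spelled out for the sketch:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct rcu_head { void *next; };

    struct inode {
        unsigned long i_ino;
        struct rcu_head i_rcu;
    };

    /* Shaped like vxfs_i_callback(): from the rcu_head address, walk
     * back to the enclosing inode. */
    static void i_callback(struct rcu_head *head)
    {
        struct inode *inode = container_of(head, struct inode, i_rcu);

        printf("freeing inode %lu\n", inode->i_ino);
    }

    int main(void)
    {
        struct inode ino = { .i_ino = 42 };

        /* In the kernel this runs only after an RCU grace period. */
        i_callback(&ino.i_rcu);
        return 0;
    }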
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c6..6c5131d592f0 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/kernel.h> 37#include <linux/kernel.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/smp_lock.h>
40 39
41#include "vxfs.h" 40#include "vxfs.h"
42#include "vxfs_dir.h" 41#include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
212 if (dp->d_name.len > VXFS_NAMELEN) 211 if (dp->d_name.len > VXFS_NAMELEN)
213 return ERR_PTR(-ENAMETOOLONG); 212 return ERR_PTR(-ENAMETOOLONG);
214 213
215 lock_kernel();
216 ino = vxfs_inode_by_name(dip, dp); 214 ino = vxfs_inode_by_name(dip, dp);
217 if (ino) { 215 if (ino) {
218 ip = vxfs_iget(dip->i_sb, ino); 216 ip = vxfs_iget(dip->i_sb, ino);
219 if (IS_ERR(ip)) { 217 if (IS_ERR(ip))
220 unlock_kernel();
221 return ERR_CAST(ip); 218 return ERR_CAST(ip);
222 }
223 } 219 }
224 unlock_kernel();
225 d_add(dp, ip); 220 d_add(dp, ip);
226 return NULL; 221 return NULL;
227} 222}
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
248 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
249 loff_t pos; 244 loff_t pos;
250 245
251 lock_kernel();
252
253 switch ((long)fp->f_pos) { 246 switch ((long)fp->f_pos) {
254 case 0: 247 case 0:
255 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
265 258
266 pos = fp->f_pos - 2; 259 pos = fp->f_pos - 2;
267 260
268 if (pos > VXFS_DIRROUND(ip->i_size)) { 261 if (pos > VXFS_DIRROUND(ip->i_size))
269 unlock_kernel();
270 return 0; 262 return 0;
271 }
272 263
273 npages = dir_pages(ip); 264 npages = dir_pages(ip);
274 nblocks = dir_blocks(ip); 265 nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
327done: 318done:
328 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; 319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
329out: 320out:
330 unlock_kernel();
331 return 0; 321 return 0;
332} 322}
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index dc0c041e85cb..9d1c99558389 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
42#include <linux/stat.h> 41#include <linux/stat.h>
43#include <linux/vfs.h> 42#include <linux/vfs.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
81{ 80{
82 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 81 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
83 82
84 lock_kernel();
85
86 vxfs_put_fake_inode(infp->vsi_fship); 83 vxfs_put_fake_inode(infp->vsi_fship);
87 vxfs_put_fake_inode(infp->vsi_ilist); 84 vxfs_put_fake_inode(infp->vsi_ilist);
88 vxfs_put_fake_inode(infp->vsi_stilist); 85 vxfs_put_fake_inode(infp->vsi_stilist);
89 86
90 brelse(infp->vsi_bp); 87 brelse(infp->vsi_bp);
91 kfree(infp); 88 kfree(infp);
92
93 unlock_kernel();
94} 89}
95 90
96/** 91/**
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
148 * The superblock on success, else %NULL. 143 * The superblock on success, else %NULL.
149 * 144 *
150 * Locking: 145 * Locking:
151 * We are under the bkl and @sbp->s_lock. 146 * We are under @sbp->s_lock.
152 */ 147 */
153static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) 148static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
154{ 149{
@@ -251,17 +246,16 @@ out:
251/* 246/*
252 * The usual module blurb. 247 * The usual module blurb.
253 */ 248 */
254static int vxfs_get_sb(struct file_system_type *fs_type, 249static struct dentry *vxfs_mount(struct file_system_type *fs_type,
255 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 250 int flags, const char *dev_name, void *data)
256{ 251{
257 return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, 252 return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
258 mnt);
259} 253}
260 254
261static struct file_system_type vxfs_fs_type = { 255static struct file_system_type vxfs_fs_type = {
262 .owner = THIS_MODULE, 256 .owner = THIS_MODULE,
263 .name = "vxfs", 257 .name = "vxfs",
264 .get_sb = vxfs_get_sb, 258 .mount = vxfs_mount,
265 .kill_sb = kill_block_super, 259 .kill_sb = kill_block_super,
266 .fs_flags = FS_REQUIRES_DEV, 260 .fs_flags = FS_REQUIRES_DEV,
267}; 261};
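
vxfs gets the same superblock API conversion as msdos and vfat above: ->get_sb, which filled in a caller-supplied vfsmount and returned an error code, becomes ->mount, which hands back the root dentry (or an ERR_PTR) directly, and get_sb_bdev() becomes mount_bdev() with one argument fewer. The two shapes side by side, using stub types rather than the kernel headers:

    #include <stdio.h>

    struct file_system_type;
    struct vfsmount;
    struct dentry;

    /* old: int (*get_sb)(...) filling in a vfsmount */
    typedef int (*get_sb_t)(struct file_system_type *, int, const char *,
                            void *, struct vfsmount *);

    /* new: struct dentry *(*mount)(...) returning the root directly */
    typedef struct dentry *(*mount_t)(struct file_system_type *, int,
                                      const char *, void *);

    int main(void)
    {
        get_sb_t old_style = 0;
        mount_t new_style = 0;

        (void)old_style;
        (void)new_style;
        printf("->get_sb returned int, ->mount returns the root dentry\n");
        return 0;
    }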
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1e23c33ea5cf..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,13 +79,14 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
79 return sb->s_bdi; 79 return sb->s_bdi;
80} 80}
81 81
82static void bdi_queue_work(struct backing_dev_info *bdi, 82static inline struct inode *wb_inode(struct list_head *head)
83 struct wb_writeback_work *work)
84{ 83{
85 trace_writeback_queue(bdi, work); 84 return list_entry(head, struct inode, i_wb_list);
85}
86 86
87 spin_lock_bh(&bdi->wb_lock); 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 list_add_tail(&work->list, &bdi->work_list); 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{
89 if (bdi->wb.task) { 90 if (bdi->wb.task) {
90 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
91 } else { 92 } else {
@@ -93,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
93 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
94 * will create and run it. 95 * will create and run it.
95 */ 96 */
96 trace_writeback_nothread(bdi, work);
97 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
98 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
99 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
100} 112}
101 113
102static void 114static void
103__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
104 bool range_cyclic, bool for_background) 116 bool range_cyclic)
105{ 117{
106 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
107 119
@@ -121,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
121 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
122 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
123 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
124 work->for_background = for_background;
125 136
126 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
127} 138}
@@ -139,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
139 */ 150 */
140void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
141{ 152{
142 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
143} 154}
144 155
145/** 156/**
@@ -147,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
147 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
148 * 159 *
149 * Description: 160 * Description:
150 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
 151 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that, for a given BDI,
 152 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over the background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
153 */ 165 */
154void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
155{ 167{
156 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
157} 176}
158 177
159/* 178/*
@@ -172,11 +191,11 @@ static void redirty_tail(struct inode *inode)
172 if (!list_empty(&wb->b_dirty)) { 191 if (!list_empty(&wb->b_dirty)) {
173 struct inode *tail; 192 struct inode *tail;
174 193
175 tail = list_entry(wb->b_dirty.next, struct inode, i_list); 194 tail = wb_inode(wb->b_dirty.next);
176 if (time_before(inode->dirtied_when, tail->dirtied_when)) 195 if (time_before(inode->dirtied_when, tail->dirtied_when))
177 inode->dirtied_when = jiffies; 196 inode->dirtied_when = jiffies;
178 } 197 }
179 list_move(&inode->i_list, &wb->b_dirty); 198 list_move(&inode->i_wb_list, &wb->b_dirty);
180} 199}
181 200
182/* 201/*
@@ -186,7 +205,7 @@ static void requeue_io(struct inode *inode)
186{ 205{
187 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
188 207
189 list_move(&inode->i_list, &wb->b_more_io); 208 list_move(&inode->i_wb_list, &wb->b_more_io);
190} 209}
191 210
192static void inode_sync_complete(struct inode *inode) 211static void inode_sync_complete(struct inode *inode)
@@ -227,14 +246,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
227 int do_sb_sort = 0; 246 int do_sb_sort = 0;
228 247
229 while (!list_empty(delaying_queue)) { 248 while (!list_empty(delaying_queue)) {
230 inode = list_entry(delaying_queue->prev, struct inode, i_list); 249 inode = wb_inode(delaying_queue->prev);
231 if (older_than_this && 250 if (older_than_this &&
232 inode_dirtied_after(inode, *older_than_this)) 251 inode_dirtied_after(inode, *older_than_this))
233 break; 252 break;
234 if (sb && sb != inode->i_sb) 253 if (sb && sb != inode->i_sb)
235 do_sb_sort = 1; 254 do_sb_sort = 1;
236 sb = inode->i_sb; 255 sb = inode->i_sb;
237 list_move(&inode->i_list, &tmp); 256 list_move(&inode->i_wb_list, &tmp);
238 } 257 }
239 258
240 /* just one sb in list, splice to dispatch_queue and we're done */ 259 /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +264,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
245 264
246 /* Move inodes from one superblock together */ 265 /* Move inodes from one superblock together */
247 while (!list_empty(&tmp)) { 266 while (!list_empty(&tmp)) {
248 inode = list_entry(tmp.prev, struct inode, i_list); 267 sb = wb_inode(tmp.prev)->i_sb;
249 sb = inode->i_sb;
250 list_for_each_prev_safe(pos, node, &tmp) { 268 list_for_each_prev_safe(pos, node, &tmp) {
251 inode = list_entry(pos, struct inode, i_list); 269 inode = wb_inode(pos);
252 if (inode->i_sb == sb) 270 if (inode->i_sb == sb)
253 list_move(&inode->i_list, dispatch_queue); 271 list_move(&inode->i_wb_list, dispatch_queue);
254 } 272 }
255 } 273 }
256} 274}
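
move_expired_inodes() above queues in two passes: expired inodes are first moved onto a temporary list, and only then spliced to the dispatch queue one superblock at a time, so inodes of the same sb stay adjacent and writeback does not ping-pong between filesystems. The grouping effect, sketched with arrays in place of the kernel's list_heads and with the ordering details simplified:

    #include <stdio.h>

    int main(void)
    {
        int sb_of[] = { 1, 2, 1, 3, 2, 1 }; /* sb of each expired inode */
        int n = sizeof(sb_of) / sizeof(sb_of[0]);
        int done[sizeof(sb_of) / sizeof(sb_of[0])] = { 0 };
        int i, j;

        printf("dispatch order:");
        for (i = 0; i < n; i++) {
            if (done[i])
                continue;
            for (j = i; j < n; j++) {   /* pull all same-sb entries */
                if (!done[j] && sb_of[j] == sb_of[i]) {
                    done[j] = 1;
                    printf(" sb%d", sb_of[j]);
                }
            }
        }
        printf("\n");   /* sb1 sb1 sb1 sb2 sb2 sb3 */
        return 0;
    }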
@@ -408,16 +426,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
408 * completion. 426 * completion.
409 */ 427 */
410 redirty_tail(inode); 428 redirty_tail(inode);
411 } else if (atomic_read(&inode->i_count)) {
412 /*
413 * The inode is clean, inuse
414 */
415 list_move(&inode->i_list, &inode_in_use);
416 } else { 429 } else {
417 /* 430 /*
418 * The inode is clean, unused 431 * The inode is clean. At this point we either have
 432 * a reference to the inode or it's on its way out.
433 * No need to add it back to the LRU.
419 */ 434 */
420 list_move(&inode->i_list, &inode_unused); 435 list_del_init(&inode->i_wb_list);
421 } 436 }
422 } 437 }
423 inode_sync_complete(inode); 438 inode_sync_complete(inode);
@@ -465,8 +480,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
465{ 480{
466 while (!list_empty(&wb->b_io)) { 481 while (!list_empty(&wb->b_io)) {
467 long pages_skipped; 482 long pages_skipped;
468 struct inode *inode = list_entry(wb->b_io.prev, 483 struct inode *inode = wb_inode(wb->b_io.prev);
469 struct inode, i_list);
470 484
471 if (inode->i_sb != sb) { 485 if (inode->i_sb != sb) {
472 if (only_this_sb) { 486 if (only_this_sb) {
@@ -487,10 +501,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
487 return 0; 501 return 0;
488 } 502 }
489 503
490 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 504 /*
 505 * Don't bother with new inodes or inodes being freed; the first
 506 * kind does not need periodic writeout yet, and for the latter
507 * kind writeout is handled by the freer.
508 */
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
491 requeue_io(inode); 510 requeue_io(inode);
492 continue; 511 continue;
493 } 512 }
513
494 /* 514 /*
495 * Was this inode dirtied after sync_sb_inodes was called? 515 * Was this inode dirtied after sync_sb_inodes was called?
496 * This keeps sync from extra jobs and livelock. 516 * This keeps sync from extra jobs and livelock.
@@ -498,7 +518,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
498 if (inode_dirtied_after(inode, wbc->wb_start)) 518 if (inode_dirtied_after(inode, wbc->wb_start))
499 return 1; 519 return 1;
500 520
501 BUG_ON(inode->i_state & I_FREEING);
502 __iget(inode); 521 __iget(inode);
503 pages_skipped = wbc->pages_skipped; 522 pages_skipped = wbc->pages_skipped;
504 writeback_single_inode(inode, wbc); 523 writeback_single_inode(inode, wbc);
@@ -536,8 +555,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
536 queue_io(wb, wbc->older_than_this); 555 queue_io(wb, wbc->older_than_this);
537 556
538 while (!list_empty(&wb->b_io)) { 557 while (!list_empty(&wb->b_io)) {
539 struct inode *inode = list_entry(wb->b_io.prev, 558 struct inode *inode = wb_inode(wb->b_io.prev);
540 struct inode, i_list);
541 struct super_block *sb = inode->i_sb; 559 struct super_block *sb = inode->i_sb;
542 560
543 if (!pin_sb_for_writeback(sb)) { 561 if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +600,7 @@ static inline bool over_bground_thresh(void)
582 global_dirty_limits(&background_thresh, &dirty_thresh); 600 global_dirty_limits(&background_thresh, &dirty_thresh);
583 601
584 return (global_page_state(NR_FILE_DIRTY) + 602 return (global_page_state(NR_FILE_DIRTY) +
585 global_page_state(NR_UNSTABLE_NFS) >= background_thresh); 603 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
586} 604}
587 605
588/* 606/*
@@ -612,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
612 }; 630 };
613 unsigned long oldest_jif; 631 unsigned long oldest_jif;
614 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
615 struct inode *inode; 634 struct inode *inode;
616 635
617 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -624,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
624 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
625 } 644 }
626 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
627 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
628 for (;;) { 665 for (;;) {
629 /* 666 /*
@@ -633,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
633 break; 670 break;
634 671
635 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
 676 * after the other work is done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
636 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
637 * background dirty threshold 684 * background dirty threshold
638 */ 685 */
@@ -640,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
640 break; 687 break;
641 688
642 wbc.more_io = 0; 689 wbc.more_io = 0;
643 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
644 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
645 692
646 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -650,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
650 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
651 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
652 699
653 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
654 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
655 702
656 /* 703 /*
657 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -666,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
666 /* 713 /*
667 * Did we write something? Try for more 714 * Did we write something? Try for more
668 */ 715 */
669 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
670 continue; 717 continue;
671 /* 718 /*
672 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -675,8 +722,7 @@ static long wb_writeback(struct bdi_writeback *wb,
675 */ 722 */
676 spin_lock(&inode_lock); 723 spin_lock(&inode_lock);
677 if (!list_empty(&wb->b_more_io)) { 724 if (!list_empty(&wb->b_more_io)) {
678 inode = list_entry(wb->b_more_io.prev, 725 inode = wb_inode(wb->b_more_io.prev);
679 struct inode, i_list);
680 trace_wbc_writeback_wait(&wbc, wb->bdi); 726 trace_wbc_writeback_wait(&wbc, wb->bdi);
681 inode_wait_for_writeback(inode); 727 inode_wait_for_writeback(inode);
682 } 728 }
@@ -704,6 +750,34 @@ get_next_work_item(struct backing_dev_info *bdi)
704 return work; 750 return work;
705} 751}
706 752
753/*
754 * Add in the number of potentially dirty inodes, because each inode
755 * write can dirty pagecache in the underlying blockdev.
756 */
757static unsigned long get_nr_dirty_pages(void)
758{
759 return global_page_state(NR_FILE_DIRTY) +
760 global_page_state(NR_UNSTABLE_NFS) +
761 get_nr_dirty_inodes();
762}
763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
707static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
708{ 782{
709 unsigned long expired; 783 unsigned long expired;
@@ -721,9 +795,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
721 return 0; 795 return 0;
722 796
723 wb->last_old_flush = jiffies; 797 wb->last_old_flush = jiffies;
724 nr_pages = global_page_state(NR_FILE_DIRTY) + 798 nr_pages = get_nr_dirty_pages();
725 global_page_state(NR_UNSTABLE_NFS) +
726 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
727 799
728 if (nr_pages) { 800 if (nr_pages) {
729 struct wb_writeback_work work = { 801 struct wb_writeback_work work = {
@@ -775,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
775 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
776 */ 848 */
777 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
778 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
779 852
780 return wrote; 853 return wrote;
@@ -790,7 +863,7 @@ int bdi_writeback_thread(void *data)
790 struct backing_dev_info *bdi = wb->bdi; 863 struct backing_dev_info *bdi = wb->bdi;
791 long pages_written; 864 long pages_written;
792 865
793 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 866 current->flags |= PF_SWAPWRITE;
794 set_freezable(); 867 set_freezable();
795 wb->last_active = jiffies; 868 wb->last_active = jiffies;
796 869
@@ -861,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
861 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
862 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
863 continue; 936 continue;
864 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
865 } 938 }
866 rcu_read_unlock(); 939 rcu_read_unlock();
867} 940}
@@ -962,7 +1035,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
962 * dirty list. Add blockdev inodes as well. 1035 * dirty list. Add blockdev inodes as well.
963 */ 1036 */
964 if (!S_ISBLK(inode->i_mode)) { 1037 if (!S_ISBLK(inode->i_mode)) {
965 if (hlist_unhashed(&inode->i_hash)) 1038 if (inode_unhashed(inode))
966 goto out; 1039 goto out;
967 } 1040 }
968 if (inode->i_state & I_FREEING) 1041 if (inode->i_state & I_FREEING)
@@ -990,7 +1063,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
990 } 1063 }
991 1064
992 inode->dirtied_when = jiffies; 1065 inode->dirtied_when = jiffies;
993 list_move(&inode->i_list, &bdi->wb.b_dirty); 1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
994 } 1067 }
995 } 1068 }
996out: 1069out:
@@ -1103,9 +1176,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1103 */ 1176 */
1104void writeback_inodes_sb(struct super_block *sb) 1177void writeback_inodes_sb(struct super_block *sb)
1105{ 1178{
1106 return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) + 1179 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
1107 global_page_state(NR_UNSTABLE_NFS) +
1108 (inodes_stat.nr_inodes - inodes_stat.nr_unused));
1109} 1180}
1110EXPORT_SYMBOL(writeback_inodes_sb); 1181EXPORT_SYMBOL(writeback_inodes_sb);
1111 1182
@@ -1154,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1154 * @sb: the superblock 1225 * @sb: the superblock
1155 * 1226 *
1156 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1157 * super_block. The number of pages synced is returned. 1228 * super_block.
1158 */ 1229 */
1159void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1160{ 1231{
@@ -1230,3 +1301,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1230 return ret; 1301 return ret;
1231} 1302}
1232EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1304
1305/**
1306 * sync_inode_metadata - write an inode to disk
1307 * @inode: the inode to sync
1308 * @wait: wait for I/O to complete.
1309 *
1310 * Write an inode to disk and adjust its dirty state after completion.
1311 *
1312 * Note: only writes the actual inode, no associated data or other metadata.
1313 */
1314int sync_inode_metadata(struct inode *inode, int wait)
1315{
1316 struct writeback_control wbc = {
1317 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1318 .nr_to_write = 0, /* metadata-only */
1319 };
1320
1321 return sync_inode(inode, &wbc);
1322}
1323EXPORT_SYMBOL(sync_inode_metadata);
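
sync_inode_metadata() is a thin wrapper: a writeback_control with nr_to_write set to 0 asks for no data pages at all, so sync_inode() writes just the inode, and wait selects WB_SYNC_ALL versus WB_SYNC_NONE. The wrapper modeled in userspace; every type here is a stand-in:

    #include <stdio.h>

    enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    struct writeback_control {
        enum sync_mode sync_mode;
        long nr_to_write;
    };

    struct inode { unsigned long i_ino; };

    static int sync_inode(struct inode *inode, struct writeback_control *wbc)
    {
        printf("inode %lu: %s, data pages allowed: %ld\n", inode->i_ino,
               wbc->sync_mode == WB_SYNC_ALL ? "WB_SYNC_ALL" : "WB_SYNC_NONE",
               wbc->nr_to_write);
        return 0;
    }

    static int sync_inode_metadata(struct inode *inode, int wait)
    {
        struct writeback_control wbc = {
            .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
            .nr_to_write = 0,   /* metadata only */
        };

        return sync_inode(inode, &wbc);
    }

    int main(void)
    {
        struct inode ino = { .i_ino = 7 };

        return sync_inode_metadata(&ino, 1);
    }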
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3d..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
4#include <linux/path.h> 4#include <linux/path.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/fs_struct.h> 6#include <linux/fs_struct.h>
7#include "internal.h"
8
9static inline void path_get_longterm(struct path *path)
10{
11 path_get(path);
12 mnt_make_longterm(path->mnt);
13}
14
15static inline void path_put_longterm(struct path *path)
16{
17 mnt_make_shortterm(path->mnt);
18 path_put(path);
19}
7 20
8/* 21/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 22 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -14,12 +27,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
14 struct path old_root; 27 struct path old_root;
15 28
16 spin_lock(&fs->lock); 29 spin_lock(&fs->lock);
30 write_seqcount_begin(&fs->seq);
17 old_root = fs->root; 31 old_root = fs->root;
18 fs->root = *path; 32 fs->root = *path;
19 path_get(path); 33 path_get_longterm(path);
34 write_seqcount_end(&fs->seq);
20 spin_unlock(&fs->lock); 35 spin_unlock(&fs->lock);
21 if (old_root.dentry) 36 if (old_root.dentry)
22 path_put(&old_root); 37 path_put_longterm(&old_root);
23} 38}
24 39
25/* 40/*
@@ -31,13 +46,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
31 struct path old_pwd; 46 struct path old_pwd;
32 47
33 spin_lock(&fs->lock); 48 spin_lock(&fs->lock);
49 write_seqcount_begin(&fs->seq);
34 old_pwd = fs->pwd; 50 old_pwd = fs->pwd;
35 fs->pwd = *path; 51 fs->pwd = *path;
36 path_get(path); 52 path_get_longterm(path);
53 write_seqcount_end(&fs->seq);
37 spin_unlock(&fs->lock); 54 spin_unlock(&fs->lock);
38 55
39 if (old_pwd.dentry) 56 if (old_pwd.dentry)
40 path_put(&old_pwd); 57 path_put_longterm(&old_pwd);
41} 58}
42 59
43void chroot_fs_refs(struct path *old_root, struct path *new_root) 60void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -52,31 +69,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
52 fs = p->fs; 69 fs = p->fs;
53 if (fs) { 70 if (fs) {
54 spin_lock(&fs->lock); 71 spin_lock(&fs->lock);
72 write_seqcount_begin(&fs->seq);
55 if (fs->root.dentry == old_root->dentry 73 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) { 74 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root); 75 path_get_longterm(new_root);
58 fs->root = *new_root; 76 fs->root = *new_root;
59 count++; 77 count++;
60 } 78 }
61 if (fs->pwd.dentry == old_root->dentry 79 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) { 80 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root); 81 path_get_longterm(new_root);
64 fs->pwd = *new_root; 82 fs->pwd = *new_root;
65 count++; 83 count++;
66 } 84 }
85 write_seqcount_end(&fs->seq);
67 spin_unlock(&fs->lock); 86 spin_unlock(&fs->lock);
68 } 87 }
69 task_unlock(p); 88 task_unlock(p);
70 } while_each_thread(g, p); 89 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock); 90 read_unlock(&tasklist_lock);
72 while (count--) 91 while (count--)
73 path_put(old_root); 92 path_put_longterm(old_root);
74} 93}
75 94
76void free_fs_struct(struct fs_struct *fs) 95void free_fs_struct(struct fs_struct *fs)
77{ 96{
78 path_put(&fs->root); 97 path_put_longterm(&fs->root);
79 path_put(&fs->pwd); 98 path_put_longterm(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs); 99 kmem_cache_free(fs_cachep, fs);
81} 100}
82 101
@@ -88,8 +107,10 @@ void exit_fs(struct task_struct *tsk)
88 int kill; 107 int kill;
89 task_lock(tsk); 108 task_lock(tsk);
90 spin_lock(&fs->lock); 109 spin_lock(&fs->lock);
110 write_seqcount_begin(&fs->seq);
91 tsk->fs = NULL; 111 tsk->fs = NULL;
92 kill = !--fs->users; 112 kill = !--fs->users;
113 write_seqcount_end(&fs->seq);
93 spin_unlock(&fs->lock); 114 spin_unlock(&fs->lock);
94 task_unlock(tsk); 115 task_unlock(tsk);
95 if (kill) 116 if (kill)
@@ -105,8 +126,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
105 fs->users = 1; 126 fs->users = 1;
106 fs->in_exec = 0; 127 fs->in_exec = 0;
107 spin_lock_init(&fs->lock); 128 spin_lock_init(&fs->lock);
129 seqcount_init(&fs->seq);
108 fs->umask = old->umask; 130 fs->umask = old->umask;
109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd); 131
132 spin_lock(&old->lock);
133 fs->root = old->root;
134 path_get_longterm(&fs->root);
135 fs->pwd = old->pwd;
136 path_get_longterm(&fs->pwd);
137 spin_unlock(&old->lock);
110 } 138 }
111 return fs; 139 return fs;
112} 140}
@@ -144,6 +172,7 @@ EXPORT_SYMBOL(current_umask);
144struct fs_struct init_fs = { 172struct fs_struct init_fs = {
145 .users = 1, 173 .users = 1,
146 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), 174 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
175 .seq = SEQCNT_ZERO,
147 .umask = 0022, 176 .umask = 0022,
148}; 177};
149 178
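
The seqcount that fs_struct grows here complements, rather than replaces, the spinlock: writers still serialize on fs->lock, but because every update of fs->root and fs->pwd is bracketed by write_seqcount_begin/end, an RCU path walk can read both paths without the lock and simply retry if the counter moved underneath it. The reader/writer protocol in miniature, single-threaded and without the memory barriers the real primitives add:

    #include <stdio.h>

    static unsigned seq;
    static int root_id;

    static void set_root(int id)
    {
        seq++;          /* write_seqcount_begin: counter goes odd */
        root_id = id;
        seq++;          /* write_seqcount_end: counter even again */
    }

    static int read_root(void)
    {
        unsigned start;
        int val;

        do {
            start = seq;        /* read_seqcount_begin */
            val = root_id;
        } while ((start & 1) || seq != start);  /* writer active, retry */

        return val;
    }

    int main(void)
    {
        set_root(1);
        printf("root=%d\n", read_root());
        return 0;
    }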
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
101 object->n_ops++; 101 object->n_ops++;
102 object->n_exclusive++; /* reads and writes must wait */ 102 object->n_exclusive++; /* reads and writes must wait */
103 103
104 if (object->n_ops > 0) { 104 if (object->n_ops > 1) {
105 atomic_inc(&op->usage); 105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops); 106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend); 107 fscache_stat(&fscache_n_op_pend);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f9..85542a7daf40 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
179static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
180 .open = nonseekable_open, 180 .open = nonseekable_open,
181 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
182 .llseek = no_llseek,
182}; 183};
183 184
184static const struct file_operations fuse_ctl_waiting_ops = { 185static const struct file_operations fuse_ctl_waiting_ops = {
185 .open = nonseekable_open, 186 .open = nonseekable_open,
186 .read = fuse_conn_waiting_read, 187 .read = fuse_conn_waiting_read,
188 .llseek = no_llseek,
187}; 189};
188 190
189static const struct file_operations fuse_conn_max_background_ops = { 191static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open, 192 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read, 193 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write, 194 .write = fuse_conn_max_background_write,
195 .llseek = no_llseek,
193}; 196};
194 197
195static const struct file_operations fuse_conn_congestion_threshold_ops = { 198static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open, 199 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read, 200 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write, 201 .write = fuse_conn_congestion_threshold_write,
202 .llseek = no_llseek,
199}; 203};
200 204
201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 205static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
218 if (!inode) 222 if (!inode)
219 return NULL; 223 return NULL;
220 224
225 inode->i_ino = get_next_ino();
221 inode->i_mode = mode; 226 inode->i_mode = mode;
222 inode->i_uid = fc->user_id; 227 inode->i_uid = fc->user_id;
223 inode->i_gid = fc->group_id; 228 inode->i_gid = fc->group_id;
@@ -317,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
317 return 0; 322 return 0;
318} 323}
319 324
320static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, 325static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
321 const char *dev_name, void *raw_data, 326 int flags, const char *dev_name, void *raw_data)
322 struct vfsmount *mnt)
323{ 327{
324 return get_sb_single(fs_type, flags, raw_data, 328 return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
325 fuse_ctl_fill_super, mnt);
326} 329}
327 330
328static void fuse_ctl_kill_sb(struct super_block *sb) 331static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -341,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
341static struct file_system_type fuse_ctl_fs_type = { 344static struct file_system_type fuse_ctl_fs_type = {
342 .owner = THIS_MODULE, 345 .owner = THIS_MODULE,
343 .name = "fusectl", 346 .name = "fusectl",
344 .get_sb = fuse_ctl_get_sb, 347 .mount = fuse_ctl_mount,
345 .kill_sb = fuse_ctl_kill_sb, 348 .kill_sb = fuse_ctl_kill_sb,
346}; 349};
347 350
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278bd..3e87cce5837d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
182 .unlocked_ioctl = cuse_file_ioctl, 182 .unlocked_ioctl = cuse_file_ioctl,
183 .compat_ioctl = cuse_file_compat_ioctl, 183 .compat_ioctl = cuse_file_compat_ioctl,
184 .poll = fuse_file_poll, 184 .poll = fuse_file_poll,
185 .llseek = noop_llseek,
185}; 186};
186 187
187 188
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cde755cca564..cf8d28d1fbad 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
252} 252}
253 253
254void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
255 u64 nodeid, u64 nlookup)
256{
257 forget->forget_one.nodeid = nodeid;
258 forget->forget_one.nlookup = nlookup;
259
260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget;
262 fc->forget_list_tail = forget;
263 wake_up(&fc->waitq);
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
265 spin_unlock(&fc->lock);
266}
267
254static void flush_bg_queue(struct fuse_conn *fc) 268static void flush_bg_queue(struct fuse_conn *fc)
255{ 269{
256 while (fc->active_background < fc->max_background && 270 while (fc->active_background < fc->max_background &&
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
438 } 452 }
439} 453}
440 454
441void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
442{
443 req->isreply = 0;
444 fuse_request_send_nowait(fc, req);
445}
446
447void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 455void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
448{ 456{
449 req->isreply = 1; 457 req->isreply = 1;
@@ -809,11 +817,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
809 int err; 817 int err;
810 struct page *page = *pagep; 818 struct page *page = *pagep;
811 819
812 if (page && zeroing && count < PAGE_SIZE) { 820 if (page && zeroing && count < PAGE_SIZE)
813 void *mapaddr = kmap_atomic(page, KM_USER1); 821 clear_highpage(page);
814 memset(mapaddr, 0, PAGE_SIZE); 822
815 kunmap_atomic(mapaddr, KM_USER1);
816 }
817 while (count) { 823 while (count) {
818 if (cs->write && cs->pipebufs && page) { 824 if (cs->write && cs->pipebufs && page) {
819 return fuse_ref_page(cs, page, offset, count); 825 return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +836,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
830 } 836 }
831 } 837 }
832 if (page) { 838 if (page) {
833 void *mapaddr = kmap_atomic(page, KM_USER1); 839 void *mapaddr = kmap_atomic(page, KM_USER0);
834 void *buf = mapaddr + offset; 840 void *buf = mapaddr + offset;
835 offset += fuse_copy_do(cs, &buf, &count); 841 offset += fuse_copy_do(cs, &buf, &count);
836 kunmap_atomic(mapaddr, KM_USER1); 842 kunmap_atomic(mapaddr, KM_USER0);
837 } else 843 } else
838 offset += fuse_copy_do(cs, NULL, &count); 844 offset += fuse_copy_do(cs, NULL, &count);
839 } 845 }
@@ -898,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
898 return err; 904 return err;
899} 905}
900 906
907static int forget_pending(struct fuse_conn *fc)
908{
909 return fc->forget_list_head.next != NULL;
910}
911
901static int request_pending(struct fuse_conn *fc) 912static int request_pending(struct fuse_conn *fc)
902{ 913{
903 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 914 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
915 forget_pending(fc);
904} 916}
905 917
906/* Wait until a request is available on the pending list */ 918/* Wait until a request is available on the pending list */
@@ -962,6 +974,120 @@ __releases(fc->lock)
962 return err ? err : reqsize; 974 return err ? err : reqsize;
963} 975}
964 976
977static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
978 unsigned max,
979 unsigned *countp)
980{
981 struct fuse_forget_link *head = fc->forget_list_head.next;
982 struct fuse_forget_link **newhead = &head;
983 unsigned count;
984
985 for (count = 0; *newhead != NULL && count < max; count++)
986 newhead = &(*newhead)->next;
987
988 fc->forget_list_head.next = *newhead;
989 *newhead = NULL;
990 if (fc->forget_list_head.next == NULL)
991 fc->forget_list_tail = &fc->forget_list_head;
992
993 if (countp != NULL)
994 *countp = count;
995
996 return head;
997}
998
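
dequeue_forget() is the consumer side of that queue: it detaches up to max entries in one pass by walking a pointer-to-pointer down the list, leaves the remainder queued, and resets the tail to the dummy head once the queue drains. The same walk in a self-contained sketch (invented names, no locking):

#include <stdio.h>
#include <stdlib.h>

struct link { int id; struct link *next; };
struct fifo { struct link head; struct link *tail; };

static struct link *dequeue_batch(struct fifo *f, unsigned max,
				  unsigned *countp)
{
	struct link *batch = f->head.next;
	struct link **newhead = &batch;
	unsigned count;

	/* advance past at most max nodes */
	for (count = 0; *newhead != NULL && count < max; count++)
		newhead = &(*newhead)->next;

	f->head.next = *newhead;	/* remainder stays queued */
	*newhead = NULL;		/* terminate the detached batch */
	if (f->head.next == NULL)
		f->tail = &f->head;	/* drained: reset tail to dummy head */

	if (countp)
		*countp = count;
	return batch;
}

int main(void)
{
	struct fifo f = { .tail = &f.head };
	struct link *l, *next;
	unsigned n;
	int i;

	for (i = 1; i <= 5; i++) {
		l = calloc(1, sizeof(*l));
		if (!l)
			return 1;
		l->id = i;
		f.tail->next = l;
		f.tail = l;
	}

	while ((l = dequeue_batch(&f, 3, &n)) != NULL) {
		printf("batch of %u:", n);
		for (; l; l = next) {
			next = l->next;
			printf(" %d", l->id);
			free(l);
		}
		printf("\n");
	}
	return 0;
}
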
999static int fuse_read_single_forget(struct fuse_conn *fc,
1000 struct fuse_copy_state *cs,
1001 size_t nbytes)
1002__releases(fc->lock)
1003{
1004 int err;
1005 struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1006 struct fuse_forget_in arg = {
1007 .nlookup = forget->forget_one.nlookup,
1008 };
1009 struct fuse_in_header ih = {
1010 .opcode = FUSE_FORGET,
1011 .nodeid = forget->forget_one.nodeid,
1012 .unique = fuse_get_unique(fc),
1013 .len = sizeof(ih) + sizeof(arg),
1014 };
1015
1016 spin_unlock(&fc->lock);
1017 kfree(forget);
1018 if (nbytes < ih.len)
1019 return -EINVAL;
1020
1021 err = fuse_copy_one(cs, &ih, sizeof(ih));
1022 if (!err)
1023 err = fuse_copy_one(cs, &arg, sizeof(arg));
1024 fuse_copy_finish(cs);
1025
1026 if (err)
1027 return err;
1028
1029 return ih.len;
1030}
1031
1032static int fuse_read_batch_forget(struct fuse_conn *fc,
1033 struct fuse_copy_state *cs, size_t nbytes)
1034__releases(fc->lock)
1035{
1036 int err;
1037 unsigned max_forgets;
1038 unsigned count;
1039 struct fuse_forget_link *head;
1040 struct fuse_batch_forget_in arg = { .count = 0 };
1041 struct fuse_in_header ih = {
1042 .opcode = FUSE_BATCH_FORGET,
1043 .unique = fuse_get_unique(fc),
1044 .len = sizeof(ih) + sizeof(arg),
1045 };
1046
1047 if (nbytes < ih.len) {
1048 spin_unlock(&fc->lock);
1049 return -EINVAL;
1050 }
1051
1052 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1053 head = dequeue_forget(fc, max_forgets, &count);
1054 spin_unlock(&fc->lock);
1055
1056 arg.count = count;
1057 ih.len += count * sizeof(struct fuse_forget_one);
1058 err = fuse_copy_one(cs, &ih, sizeof(ih));
1059 if (!err)
1060 err = fuse_copy_one(cs, &arg, sizeof(arg));
1061
1062 while (head) {
1063 struct fuse_forget_link *forget = head;
1064
1065 if (!err) {
1066 err = fuse_copy_one(cs, &forget->forget_one,
1067 sizeof(forget->forget_one));
1068 }
1069 head = forget->next;
1070 kfree(forget);
1071 }
1072
1073 fuse_copy_finish(cs);
1074
1075 if (err)
1076 return err;
1077
1078 return ih.len;
1079}
1080
1081static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1082 size_t nbytes)
1083__releases(fc->lock)
1084{
1085 if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1086 return fuse_read_single_forget(fc, cs, nbytes);
1087 else
1088 return fuse_read_batch_forget(fc, cs, nbytes);
1089}
1090
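
fuse_read_forget() picks the wire format: servers that negotiated a protocol minor below 16 only understand the classic one-per-message FUSE_FORGET, and a queue holding a single entry uses it too, since for one forget the batch framing is actually larger. The arithmetic, using struct layouts that mirror the FUSE ABI (the exact sizes below are illustrative, not normative):

#include <stdio.h>
#include <stdint.h>

struct fuse_in_header {
	uint32_t len; uint32_t opcode; uint64_t unique; uint64_t nodeid;
	uint32_t uid; uint32_t gid; uint32_t pid; uint32_t padding;
};
struct fuse_forget_in { uint64_t nlookup; };
struct fuse_batch_forget_in { uint32_t count; uint32_t dummy; };
struct fuse_forget_one { uint64_t nodeid; uint64_t nlookup; };

int main(void)
{
	unsigned n;

	for (n = 1; n <= 64; n *= 4) {
		size_t single = n * (sizeof(struct fuse_in_header) +
				     sizeof(struct fuse_forget_in));
		size_t batch = sizeof(struct fuse_in_header) +
			       sizeof(struct fuse_batch_forget_in) +
			       n * sizeof(struct fuse_forget_one);
		printf("%2u forgets: %4zu bytes single, %4zu batched\n",
		       n, single, batch);
	}
	return 0;
}

For one forget this prints 48 bytes single versus 64 batched, which is exactly why the single form keeps the single-entry case.
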
965/* 1091/*
966 * Read a single request into the userspace filesystem's buffer. This 1092 * Read a single request into the userspace filesystem's buffer. This
967 * function waits until a request is available, then removes it from 1093 * function waits until a request is available, then removes it from
@@ -1000,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
1000 return fuse_read_interrupt(fc, cs, nbytes, req); 1126 return fuse_read_interrupt(fc, cs, nbytes, req);
1001 } 1127 }
1002 1128
1129 if (forget_pending(fc)) {
1130 if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1131 return fuse_read_forget(fc, cs, nbytes);
1132
1133 if (fc->forget_batch <= -8)
1134 fc->forget_batch = 16;
1135 }
1136
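
The forget_batch counter is a small fairness heuristic: while both forgets and ordinary requests are pending, forgets win as long as the counter is positive; it then goes negative and ordinary requests are served until it reaches -8, at which point it is rearmed to 16. In steady state that interleaves bursts of sixteen forgets with roughly eight or nine regular requests. A simulation of the same logic (the harness is invented; the constants are the ones in the hunk, and the counter is assumed to start at zero since the connection is zero-allocated):

#include <stdio.h>

int main(void)
{
	int forget_batch = 0;
	int i;

	for (i = 0; i < 60; i++) {	/* both queues stay busy */
		if (forget_batch-- > 0) {
			putchar('F');	/* serve a forget */
			continue;
		}
		if (forget_batch <= -8)
			forget_batch = 16;
		putchar('R');		/* serve a regular request */
	}
	putchar('\n');
	return 0;
}
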
1003 req = list_entry(fc->pending.next, struct fuse_req, list); 1137 req = list_entry(fc->pending.next, struct fuse_req, list);
1004 req->state = FUSE_REQ_READING; 1138 req->state = FUSE_REQ_READING;
1005 list_move(&req->list, &fc->io); 1139 list_move(&req->list, &fc->io);
@@ -1092,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1092 if (!fc) 1226 if (!fc)
1093 return -EPERM; 1227 return -EPERM;
1094 1228
1095 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1229 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1096 if (!bufs) 1230 if (!bufs)
1097 return -ENOMEM; 1231 return -ENOMEM;
1098 1232
@@ -1336,12 +1470,7 @@ out_finish:
1336 1470
1337static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1471static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1338{ 1472{
1339 int i; 1473 release_pages(req->pages, req->num_pages, 0);
1340
1341 for (i = 0; i < req->num_pages; i++) {
1342 struct page *page = req->pages[i];
1343 page_cache_release(page);
1344 }
1345} 1474}
1346 1475
1347static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1476static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1633,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1633 if (!fc) 1762 if (!fc)
1634 return -EPERM; 1763 return -EPERM;
1635 1764
1636 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1765 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1637 if (!bufs) 1766 if (!bufs)
1638 return -ENOMEM; 1767 return -ENOMEM;
1639 1768
@@ -1777,6 +1906,8 @@ __acquires(fc->lock)
1777 flush_bg_queue(fc); 1906 flush_bg_queue(fc);
1778 end_requests(fc, &fc->pending); 1907 end_requests(fc, &fc->pending);
1779 end_requests(fc, &fc->processing); 1908 end_requests(fc, &fc->processing);
1909 while (forget_pending(fc))
1910 kfree(dequeue_forget(fc, 1, NULL));
1780} 1911}
1781 1912
1782/* 1913/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482d..bfed8447ed80 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/gfp.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h>
16 16
17#if BITS_PER_LONG >= 64 17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
156 */ 156 */
157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) 157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode = entry->d_inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU)
162 return -ECHILD;
163
164 inode = entry->d_inode;
161 if (inode && is_bad_inode(inode)) 165 if (inode && is_bad_inode(inode))
162 return 0; 166 return 0;
163 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 167 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -165,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
165 struct fuse_entry_out outarg; 169 struct fuse_entry_out outarg;
166 struct fuse_conn *fc; 170 struct fuse_conn *fc;
167 struct fuse_req *req; 171 struct fuse_req *req;
168 struct fuse_req *forget_req; 172 struct fuse_forget_link *forget;
169 struct dentry *parent; 173 struct dentry *parent;
170 u64 attr_version; 174 u64 attr_version;
171 175
@@ -178,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
178 if (IS_ERR(req)) 182 if (IS_ERR(req))
179 return 0; 183 return 0;
180 184
181 forget_req = fuse_get_req(fc); 185 forget = fuse_alloc_forget();
182 if (IS_ERR(forget_req)) { 186 if (!forget) {
183 fuse_put_request(fc, req); 187 fuse_put_request(fc, req);
184 return 0; 188 return 0;
185 } 189 }
@@ -199,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
199 if (!err) { 203 if (!err) {
200 struct fuse_inode *fi = get_fuse_inode(inode); 204 struct fuse_inode *fi = get_fuse_inode(inode);
201 if (outarg.nodeid != get_node_id(inode)) { 205 if (outarg.nodeid != get_node_id(inode)) {
202 fuse_send_forget(fc, forget_req, 206 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
203 outarg.nodeid, 1);
204 return 0; 207 return 0;
205 } 208 }
206 spin_lock(&fc->lock); 209 spin_lock(&fc->lock);
207 fi->nlookup++; 210 fi->nlookup++;
208 spin_unlock(&fc->lock); 211 spin_unlock(&fc->lock);
209 } 212 }
210 fuse_put_request(fc, forget_req); 213 kfree(forget);
211 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 214 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
212 return 0; 215 return 0;
213 216
@@ -259,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
259{ 262{
260 struct fuse_conn *fc = get_fuse_conn_super(sb); 263 struct fuse_conn *fc = get_fuse_conn_super(sb);
261 struct fuse_req *req; 264 struct fuse_req *req;
262 struct fuse_req *forget_req; 265 struct fuse_forget_link *forget;
263 u64 attr_version; 266 u64 attr_version;
264 int err; 267 int err;
265 268
@@ -273,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
273 if (IS_ERR(req)) 276 if (IS_ERR(req))
274 goto out; 277 goto out;
275 278
276 forget_req = fuse_get_req(fc); 279 forget = fuse_alloc_forget();
277 err = PTR_ERR(forget_req); 280 err = -ENOMEM;
278 if (IS_ERR(forget_req)) { 281 if (!forget) {
279 fuse_put_request(fc, req); 282 fuse_put_request(fc, req);
280 goto out; 283 goto out;
281 } 284 }
@@ -301,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
301 attr_version); 304 attr_version);
302 err = -ENOMEM; 305 err = -ENOMEM;
303 if (!*inode) { 306 if (!*inode) {
304 fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 307 fuse_queue_forget(fc, forget, outarg->nodeid, 1);
305 goto out; 308 goto out;
306 } 309 }
307 err = 0; 310 err = 0;
308 311
309 out_put_forget: 312 out_put_forget:
310 fuse_put_request(fc, forget_req); 313 kfree(forget);
311 out: 314 out:
312 return err; 315 return err;
313} 316}
@@ -347,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
347 } 350 }
348 351
349 entry = newent ? newent : entry; 352 entry = newent ? newent : entry;
350 entry->d_op = &fuse_dentry_operations;
351 if (outarg_valid) 353 if (outarg_valid)
352 fuse_change_entry_timeout(entry, &outarg); 354 fuse_change_entry_timeout(entry, &outarg);
353 else 355 else
@@ -374,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
374 struct inode *inode; 376 struct inode *inode;
375 struct fuse_conn *fc = get_fuse_conn(dir); 377 struct fuse_conn *fc = get_fuse_conn(dir);
376 struct fuse_req *req; 378 struct fuse_req *req;
377 struct fuse_req *forget_req; 379 struct fuse_forget_link *forget;
378 struct fuse_create_in inarg; 380 struct fuse_create_in inarg;
379 struct fuse_open_out outopen; 381 struct fuse_open_out outopen;
380 struct fuse_entry_out outentry; 382 struct fuse_entry_out outentry;
@@ -388,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
388 if (flags & O_DIRECT) 390 if (flags & O_DIRECT)
389 return -EINVAL; 391 return -EINVAL;
390 392
391 forget_req = fuse_get_req(fc); 393 forget = fuse_alloc_forget();
392 if (IS_ERR(forget_req)) 394 if (!forget)
393 return PTR_ERR(forget_req); 395 return -ENOMEM;
394 396
395 req = fuse_get_req(fc); 397 req = fuse_get_req(fc);
396 err = PTR_ERR(req); 398 err = PTR_ERR(req);
@@ -448,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
448 if (!inode) { 450 if (!inode) {
449 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 451 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
450 fuse_sync_release(ff, flags); 452 fuse_sync_release(ff, flags);
451 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 453 fuse_queue_forget(fc, forget, outentry.nodeid, 1);
452 return -ENOMEM; 454 return -ENOMEM;
453 } 455 }
454 fuse_put_request(fc, forget_req); 456 kfree(forget);
455 d_instantiate(entry, inode); 457 d_instantiate(entry, inode);
456 fuse_change_entry_timeout(entry, &outentry); 458 fuse_change_entry_timeout(entry, &outentry);
457 fuse_invalidate_attr(dir); 459 fuse_invalidate_attr(dir);
@@ -469,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
469 out_put_request: 471 out_put_request:
470 fuse_put_request(fc, req); 472 fuse_put_request(fc, req);
471 out_put_forget_req: 473 out_put_forget_req:
472 fuse_put_request(fc, forget_req); 474 kfree(forget);
473 return err; 475 return err;
474} 476}
475 477
@@ -483,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
483 struct fuse_entry_out outarg; 485 struct fuse_entry_out outarg;
484 struct inode *inode; 486 struct inode *inode;
485 int err; 487 int err;
486 struct fuse_req *forget_req; 488 struct fuse_forget_link *forget;
487 489
488 forget_req = fuse_get_req(fc); 490 forget = fuse_alloc_forget();
489 if (IS_ERR(forget_req)) { 491 if (!forget) {
490 fuse_put_request(fc, req); 492 fuse_put_request(fc, req);
491 return PTR_ERR(forget_req); 493 return -ENOMEM;
492 } 494 }
493 495
494 memset(&outarg, 0, sizeof(outarg)); 496 memset(&outarg, 0, sizeof(outarg));
@@ -515,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
515 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 517 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
516 &outarg.attr, entry_attr_timeout(&outarg), 0); 518 &outarg.attr, entry_attr_timeout(&outarg), 0);
517 if (!inode) { 519 if (!inode) {
518 fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 520 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
519 return -ENOMEM; 521 return -ENOMEM;
520 } 522 }
521 fuse_put_request(fc, forget_req); 523 kfree(forget);
522 524
523 if (S_ISDIR(inode->i_mode)) { 525 if (S_ISDIR(inode->i_mode)) {
524 struct dentry *alias; 526 struct dentry *alias;
@@ -541,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
541 return 0; 543 return 0;
542 544
543 out_put_forget_req: 545 out_put_forget_req:
544 fuse_put_request(fc, forget_req); 546 kfree(forget);
545 return err; 547 return err;
546} 548}
547 549
@@ -981,12 +983,15 @@ static int fuse_access(struct inode *inode, int mask)
981 * access request is sent. Execute permission is still checked 983 * access request is sent. Execute permission is still checked
982 * locally based on file mode. 984 * locally based on file mode.
983 */ 985 */
984static int fuse_permission(struct inode *inode, int mask) 986static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
985{ 987{
986 struct fuse_conn *fc = get_fuse_conn(inode); 988 struct fuse_conn *fc = get_fuse_conn(inode);
987 bool refreshed = false; 989 bool refreshed = false;
988 int err = 0; 990 int err = 0;
989 991
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
990 if (!fuse_allow_task(fc, current)) 995 if (!fuse_allow_task(fc, current))
991 return -EACCES; 996 return -EACCES;
992 997
@@ -1001,7 +1006,7 @@ static int fuse_permission(struct inode *inode, int mask)
1001 } 1006 }
1002 1007
1003 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
1004 err = generic_permission(inode, mask, NULL); 1009 err = generic_permission(inode, mask, flags, NULL);
1005 1010
1006 /* If permission is denied, try to refresh file 1011 /* If permission is denied, try to refresh file
1007 attributes. This is also needed, because the root 1012 attributes. This is also needed, because the root
@@ -1009,7 +1014,8 @@ static int fuse_permission(struct inode *inode, int mask)
1009 if (err == -EACCES && !refreshed) { 1014 if (err == -EACCES && !refreshed) {
1010 err = fuse_do_getattr(inode, NULL, NULL); 1015 err = fuse_do_getattr(inode, NULL, NULL);
1011 if (!err) 1016 if (!err)
1012 err = generic_permission(inode, mask, NULL); 1017 err = generic_permission(inode, mask,
1018 flags, NULL);
1013 } 1019 }
1014 1020
1015 /* Note: the opposite of the above test does not 1021 /* Note: the opposite of the above test does not
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123f..95da1bc1c826 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h>
16 17
17static const struct file_operations fuse_direct_io_file_operations; 18static const struct file_operations fuse_direct_io_file_operations;
18 19
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
134void fuse_finish_open(struct inode *inode, struct file *file) 135void fuse_finish_open(struct inode *inode, struct file *file)
135{ 136{
136 struct fuse_file *ff = file->private_data; 137 struct fuse_file *ff = file->private_data;
138 struct fuse_conn *fc = get_fuse_conn(inode);
137 139
138 if (ff->open_flags & FOPEN_DIRECT_IO) 140 if (ff->open_flags & FOPEN_DIRECT_IO)
139 file->f_op = &fuse_direct_io_file_operations; 141 file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
141 invalidate_inode_pages2(inode->i_mapping); 143 invalidate_inode_pages2(inode->i_mapping);
142 if (ff->open_flags & FOPEN_NONSEEKABLE) 144 if (ff->open_flags & FOPEN_NONSEEKABLE)
143 nonseekable_open(inode, file); 145 nonseekable_open(inode, file);
146 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
147 struct fuse_inode *fi = get_fuse_inode(inode);
148
149 spin_lock(&fc->lock);
150 fi->attr_version = ++fc->attr_version;
151 i_size_write(inode, 0);
152 spin_unlock(&fc->lock);
153 fuse_invalidate_attr(inode);
154 }
144} 155}
145 156
146int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 157int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
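
The new block in fuse_finish_open() handles atomic O_TRUNC: when the server truncates as part of the open, the kernel zeroes its cached i_size under fc->lock and bumps attr_version, so attribute replies computed before the truncate can no longer overwrite the new size. The version guard in isolation (a sketch with invented names, not the kernel code path):

#include <stdio.h>
#include <stdint.h>

struct cached_attr {
	uint64_t version;	/* bumped on every local change */
	uint64_t size;
};

/* Apply a reply that was computed when the cache was at `seen`. */
static void apply_reply(struct cached_attr *a, uint64_t seen,
			uint64_t new_size)
{
	if (a->version != seen) {
		printf("stale reply (v%llu vs v%llu), dropped\n",
		       (unsigned long long)seen,
		       (unsigned long long)a->version);
		return;
	}
	a->size = new_size;
}

int main(void)
{
	struct cached_attr a = { .version = 1, .size = 4096 };
	uint64_t seen = a.version;	/* getattr request sent here */

	a.version++;			/* local O_TRUNC: size = 0 */
	a.size = 0;

	apply_reply(&a, seen, 4096);	/* reply from before the truncate */
	printf("size = %llu\n", (unsigned long long)a.size);
	return 0;
}
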
@@ -1618,6 +1629,94 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1618} 1629}
1619 1630
1620/* 1631/*
1632 * CUSE servers compiled on 32bit broke on 64bit kernels because the
1633 * ABI was defined to be 'struct iovec' which is different on 32bit
1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply.
1636 */
1637static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count,
1639 bool is_compat)
1640{
1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) {
1643 struct compat_iovec *ciov = src;
1644 unsigned i;
1645
1646 /*
1647 * With this interface a 32bit server cannot support
1648 * non-compat (i.e. ones coming from 64bit apps) ioctl
1649 * requests
1650 */
1651 if (!is_compat)
1652 return -EINVAL;
1653
1654 for (i = 0; i < count; i++) {
1655 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1656 dst[i].iov_len = ciov[i].iov_len;
1657 }
1658 return 0;
1659 }
1660#endif
1661
1662 if (count * sizeof(struct iovec) != transferred)
1663 return -EIO;
1664
1665 memcpy(dst, src, transferred);
1666 return 0;
1667}
1668
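
fuse_copy_ioctl_iovec_old() disambiguates purely by size: an iovec is 8 bytes in 32-bit userspace and 16 bytes in 64-bit userspace, so count * sizeof() identifies which layout the server wrote. The same test in miniature (a hypothetical standalone demo, not kernel code):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct compat_iovec32 { uint32_t iov_base; uint32_t iov_len; };
struct iovec64 { uint64_t iov_base; uint64_t iov_len; };

static const char *classify(size_t transferred, unsigned count)
{
	if (count * sizeof(struct compat_iovec32) == transferred)
		return "32-bit server reply";
	if (count * sizeof(struct iovec64) == transferred)
		return "native 64-bit reply";
	return "corrupt reply";
}

int main(void)
{
	printf("%s\n", classify(24, 3));	/* 3 * 8  -> 32-bit  */
	printf("%s\n", classify(48, 3));	/* 3 * 16 -> 64-bit  */
	printf("%s\n", classify(40, 3));	/* neither -> -EIO   */
	return 0;
}
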
1669/* Make sure iov_length() won't overflow */
1670static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1671{
1672 size_t n;
1673 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1674
1675 for (n = 0; n < count; n++, iov++) {
1676 if (iov->iov_len > (size_t) max)
1677 return -ENOMEM;
1678 max -= iov->iov_len;
1679 }
1680 return 0;
1681}
1682
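
fuse_verify_ioctl_iov() guards iov_length() against overflow by checking each server-supplied length against a decrementing budget instead of summing, because a sum of attacker-controlled size_t values can wrap while the per-entry comparison cannot. The difference in a standalone demo (the 16 MiB cap is an invented stand-in for the real per-request limit):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define MAX_TOTAL (16u << 20)	/* assumed cap for this demo */

static int verify(const size_t *lens, size_t count)
{
	uint32_t budget = MAX_TOTAL;
	size_t n;

	for (n = 0; n < count; n++) {
		if (lens[n] > (size_t)budget)
			return -1;	/* would exceed the cap */
		budget -= lens[n];	/* never underflows: checked above */
	}
	return 0;
}

int main(void)
{
	/* Two huge lengths whose naive sum wraps to a small number. */
	size_t evil[] = { SIZE_MAX / 2 + 1, SIZE_MAX / 2 + 1 };
	size_t sum = evil[0] + evil[1];	/* wraps to 0 */

	printf("naive sum: %zu (wrapped!)\n", sum);
	printf("budget check: %s\n", verify(evil, 2) ? "rejected" : "accepted");
	return 0;
}
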
1683static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1684 void *src, size_t transferred, unsigned count,
1685 bool is_compat)
1686{
1687 unsigned i;
1688 struct fuse_ioctl_iovec *fiov = src;
1689
1690 if (fc->minor < 16) {
1691 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1692 count, is_compat);
1693 }
1694
1695 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1696 return -EIO;
1697
1698 for (i = 0; i < count; i++) {
1699 /* Did the server supply an inappropriate value? */
1700 if (fiov[i].base != (unsigned long) fiov[i].base ||
1701 fiov[i].len != (unsigned long) fiov[i].len)
1702 return -EIO;
1703
1704 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1705 dst[i].iov_len = (size_t) fiov[i].len;
1706
1707#ifdef CONFIG_COMPAT
1708 if (is_compat &&
1709 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1710 (compat_size_t) dst[i].iov_len != fiov[i].len))
1711 return -EIO;
1712#endif
1713 }
1714
1715 return 0;
1716}
1717
1718
1719/*
1621 * For ioctls, there is no generic way to determine how much memory 1720 * For ioctls, there is no generic way to determine how much memory
1622 * needs to be read and/or written. Furthermore, ioctls are allowed 1721 * needs to be read and/or written. Furthermore, ioctls are allowed
1623 * to dereference the passed pointer, so the parameter requires deep 1722 * to dereference the passed pointer, so the parameter requires deep
@@ -1677,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1677 struct fuse_ioctl_out outarg; 1776 struct fuse_ioctl_out outarg;
1678 struct fuse_req *req = NULL; 1777 struct fuse_req *req = NULL;
1679 struct page **pages = NULL; 1778 struct page **pages = NULL;
1680 struct page *iov_page = NULL; 1779 struct iovec *iov_page = NULL;
1681 struct iovec *in_iov = NULL, *out_iov = NULL; 1780 struct iovec *in_iov = NULL, *out_iov = NULL;
1682 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1781 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1683 size_t in_size, out_size, transferred; 1782 size_t in_size, out_size, transferred;
1684 int err; 1783 int err;
1685 1784
1785#if BITS_PER_LONG == 32
1786 inarg.flags |= FUSE_IOCTL_32BIT;
1787#else
1788 if (flags & FUSE_IOCTL_COMPAT)
1789 inarg.flags |= FUSE_IOCTL_32BIT;
1790#endif
1791
1686 /* assume all the iovs returned by client always fits in a page */ 1792 /* assume all the iovs returned by client always fits in a page */
1687 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1793 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1688 1794
1689 err = -ENOMEM; 1795 err = -ENOMEM;
1690 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1796 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1691 iov_page = alloc_page(GFP_KERNEL); 1797 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1692 if (!pages || !iov_page) 1798 if (!pages || !iov_page)
1693 goto out; 1799 goto out;
1694 1800
@@ -1697,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1697 * RETRY from server is not allowed. 1803 * RETRY from server is not allowed.
1698 */ 1804 */
1699 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1805 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1700 struct iovec *iov = page_address(iov_page); 1806 struct iovec *iov = iov_page;
1701 1807
1702 iov->iov_base = (void __user *)arg; 1808 iov->iov_base = (void __user *)arg;
1703 iov->iov_len = _IOC_SIZE(cmd); 1809 iov->iov_len = _IOC_SIZE(cmd);
@@ -1778,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1778 1884
1779 /* did it ask for retry? */ 1885 /* did it ask for retry? */
1780 if (outarg.flags & FUSE_IOCTL_RETRY) { 1886 if (outarg.flags & FUSE_IOCTL_RETRY) {
1781 char *vaddr; 1887 void *vaddr;
1782 1888
1783 /* no retry if in restricted mode */ 1889 /* no retry if in restricted mode */
1784 err = -EIO; 1890 err = -EIO;
@@ -1798,18 +1904,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1904 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1799 goto out; 1905 goto out;
1800 1906
1801 err = -EIO;
1802 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1803 goto out;
1804
1805 /* okay, copy in iovs and retry */
1806 vaddr = kmap_atomic(pages[0], KM_USER0); 1907 vaddr = kmap_atomic(pages[0], KM_USER0);
1807 memcpy(page_address(iov_page), vaddr, transferred); 1908 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1909 transferred, in_iovs + out_iovs,
1910 (flags & FUSE_IOCTL_COMPAT) != 0);
1808 kunmap_atomic(vaddr, KM_USER0); 1911 kunmap_atomic(vaddr, KM_USER0);
1912 if (err)
1913 goto out;
1809 1914
1810 in_iov = page_address(iov_page); 1915 in_iov = iov_page;
1811 out_iov = in_iov + in_iovs; 1916 out_iov = in_iov + in_iovs;
1812 1917
1918 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1919 if (err)
1920 goto out;
1921
1922 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1923 if (err)
1924 goto out;
1925
1813 goto retry; 1926 goto retry;
1814 } 1927 }
1815 1928
@@ -1821,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1821 out: 1934 out:
1822 if (req) 1935 if (req)
1823 fuse_put_request(fc, req); 1936 fuse_put_request(fc, req);
1824 if (iov_page) 1937 free_page((unsigned long) iov_page);
1825 __free_page(iov_page);
1826 while (num_pages) 1938 while (num_pages)
1827 __free_page(pages[--num_pages]); 1939 __free_page(pages[--num_pages]);
1828 kfree(pages); 1940 kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f102..ae5744a2f9e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
53extern unsigned max_user_bgreq; 53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh; 54extern unsigned max_user_congthresh;
55 55
56/* One forget request */
57struct fuse_forget_link {
58 struct fuse_forget_one forget_one;
59 struct fuse_forget_link *next;
60};
61
56/** FUSE inode */ 62/** FUSE inode */
57struct fuse_inode { 63struct fuse_inode {
58 /** Inode data */ 64 /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
66 u64 nlookup; 72 u64 nlookup;
67 73
68 /** The request used for sending the FORGET message */ 74 /** The request used for sending the FORGET message */
69 struct fuse_req *forget_req; 75 struct fuse_forget_link *forget;
70 76
71 /** Time in jiffies until the file attributes are valid */ 77 /** Time in jiffies until the file attributes are valid */
72 u64 i_time; 78 u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
255 261
256 /** Data for asynchronous requests */ 262 /** Data for asynchronous requests */
257 union { 263 union {
258 struct fuse_forget_in forget_in;
259 struct { 264 struct {
260 struct fuse_release_in in; 265 struct fuse_release_in in;
261 struct path path; 266 struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
369 /** Pending interrupts */ 374 /** Pending interrupts */
370 struct list_head interrupts; 375 struct list_head interrupts;
371 376
377 /** Queue of pending forgets */
378 struct fuse_forget_link forget_list_head;
379 struct fuse_forget_link *forget_list_tail;
380
381 /** Batching of FORGET requests (positive indicates FORGET batch) */
382 int forget_batch;
383
372 /** Flag indicating if connection is blocked. This will be 384 /** Flag indicating if connection is blocked. This will be
373 the case before the INIT reply is received, and if there 385 the case before the INIT reply is received, and if there
374 are too many outstanding background requests */ 386 are too many outstanding background requests */
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
543/** 555/**
544 * Send FORGET command 556 * Send FORGET command
545 */ 557 */
546void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 558void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
547 u64 nodeid, u64 nlookup); 559 u64 nodeid, u64 nlookup);
560
561struct fuse_forget_link *fuse_alloc_forget(void);
548 562
549/** 563/**
550 * Initialize READ or READDIR request 564 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
656void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 670void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
657 671
658/** 672/**
659 * Send a request with no reply
660 */
661void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
662
663/**
664 * Send a request in the background 673 * Send a request in the background
665 */ 674 */
666void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 675void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index da9e6e11374c..9e3f68cc1bd1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget(void)
75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77}
78
74static struct inode *fuse_alloc_inode(struct super_block *sb) 79static struct inode *fuse_alloc_inode(struct super_block *sb)
75{ 80{
76 struct inode *inode; 81 struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
90 INIT_LIST_HEAD(&fi->queued_writes); 95 INIT_LIST_HEAD(&fi->queued_writes);
91 INIT_LIST_HEAD(&fi->writepages); 96 INIT_LIST_HEAD(&fi->writepages);
92 init_waitqueue_head(&fi->page_waitq); 97 init_waitqueue_head(&fi->page_waitq);
93 fi->forget_req = fuse_request_alloc(); 98 fi->forget = fuse_alloc_forget();
94 if (!fi->forget_req) { 99 if (!fi->forget) {
95 kmem_cache_free(fuse_inode_cachep, inode); 100 kmem_cache_free(fuse_inode_cachep, inode);
96 return NULL; 101 return NULL;
97 } 102 }
@@ -99,27 +104,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
99 return inode; 104 return inode;
100} 105}
101 106
102static void fuse_destroy_inode(struct inode *inode) 107static void fuse_i_callback(struct rcu_head *head)
103{ 108{
104 struct fuse_inode *fi = get_fuse_inode(inode); 109 struct inode *inode = container_of(head, struct inode, i_rcu);
105 BUG_ON(!list_empty(&fi->write_files)); 110 INIT_LIST_HEAD(&inode->i_dentry);
106 BUG_ON(!list_empty(&fi->queued_writes));
107 if (fi->forget_req)
108 fuse_request_free(fi->forget_req);
109 kmem_cache_free(fuse_inode_cachep, inode); 111 kmem_cache_free(fuse_inode_cachep, inode);
110} 112}
111 113
112void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 114static void fuse_destroy_inode(struct inode *inode)
113 u64 nodeid, u64 nlookup)
114{ 115{
115 struct fuse_forget_in *inarg = &req->misc.forget_in; 116 struct fuse_inode *fi = get_fuse_inode(inode);
116 inarg->nlookup = nlookup; 117 BUG_ON(!list_empty(&fi->write_files));
117 req->in.h.opcode = FUSE_FORGET; 118 BUG_ON(!list_empty(&fi->queued_writes));
118 req->in.h.nodeid = nodeid; 119 kfree(fi->forget);
119 req->in.numargs = 1; 120 call_rcu(&inode->i_rcu, fuse_i_callback);
120 req->in.args[0].size = sizeof(struct fuse_forget_in);
121 req->in.args[0].value = inarg;
122 fuse_request_send_noreply(fc, req);
123} 121}
124 122
125static void fuse_evict_inode(struct inode *inode) 123static void fuse_evict_inode(struct inode *inode)
@@ -129,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
129 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 if (inode->i_sb->s_flags & MS_ACTIVE) {
130 struct fuse_conn *fc = get_fuse_conn(inode); 128 struct fuse_conn *fc = get_fuse_conn(inode);
131 struct fuse_inode *fi = get_fuse_inode(inode); 129 struct fuse_inode *fi = get_fuse_inode(inode);
132 fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 130 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
133 fi->forget_req = NULL; 131 fi->forget = NULL;
134 } 132 }
135} 133}
136 134
@@ -534,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
534 INIT_LIST_HEAD(&fc->interrupts); 532 INIT_LIST_HEAD(&fc->interrupts);
535 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
536 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
535 fc->forget_list_tail = &fc->forget_list_head;
537 atomic_set(&fc->num_waiting, 0); 536 atomic_set(&fc->num_waiting, 0);
538 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 537 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
539 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; 538 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -618,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
618 goto out_iput; 617 goto out_iput;
619 618
620 entry = d_obtain_alias(inode); 619 entry = d_obtain_alias(inode);
621 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { 620 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
622 entry->d_op = &fuse_dentry_operations;
623 fuse_invalidate_entry_cache(entry); 621 fuse_invalidate_entry_cache(entry);
624 }
625 622
626 return entry; 623 return entry;
627 624
@@ -720,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
720 } 717 }
721 718
722 parent = d_obtain_alias(inode); 719 parent = d_obtain_alias(inode);
723 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { 720 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
724 parent->d_op = &fuse_dentry_operations;
725 fuse_invalidate_entry_cache(parent); 721 fuse_invalidate_entry_cache(parent);
726 }
727 722
728 return parent; 723 return parent;
729} 724}
@@ -990,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
990 iput(root); 985 iput(root);
991 goto err_put_conn; 986 goto err_put_conn;
992 } 987 }
988 /* only now - we want root dentry with NULL ->d_op */
989 sb->s_d_op = &fuse_dentry_operations;
993 990
994 init_req = fuse_request_alloc(); 991 init_req = fuse_request_alloc();
995 if (!init_req) 992 if (!init_req)
@@ -1041,11 +1038,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1041 return err; 1038 return err;
1042} 1039}
1043 1040
1044static int fuse_get_sb(struct file_system_type *fs_type, 1041static struct dentry *fuse_mount(struct file_system_type *fs_type,
1045 int flags, const char *dev_name, 1042 int flags, const char *dev_name,
1046 void *raw_data, struct vfsmount *mnt) 1043 void *raw_data)
1047{ 1044{
1048 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 1045 return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
1049} 1046}
1050 1047
1051static void fuse_kill_sb_anon(struct super_block *sb) 1048static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1065,17 +1062,16 @@ static struct file_system_type fuse_fs_type = {
1065 .owner = THIS_MODULE, 1062 .owner = THIS_MODULE,
1066 .name = "fuse", 1063 .name = "fuse",
1067 .fs_flags = FS_HAS_SUBTYPE, 1064 .fs_flags = FS_HAS_SUBTYPE,
1068 .get_sb = fuse_get_sb, 1065 .mount = fuse_mount,
1069 .kill_sb = fuse_kill_sb_anon, 1066 .kill_sb = fuse_kill_sb_anon,
1070}; 1067};
1071 1068
1072#ifdef CONFIG_BLOCK 1069#ifdef CONFIG_BLOCK
1073static int fuse_get_sb_blk(struct file_system_type *fs_type, 1070static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
1074 int flags, const char *dev_name, 1071 int flags, const char *dev_name,
1075 void *raw_data, struct vfsmount *mnt) 1072 void *raw_data)
1076{ 1073{
1077 return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super, 1074 return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
1078 mnt);
1079} 1075}
1080 1076
1081static void fuse_kill_sb_blk(struct super_block *sb) 1077static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1094,7 +1090,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
1094static struct file_system_type fuseblk_fs_type = { 1090static struct file_system_type fuseblk_fs_type = {
1095 .owner = THIS_MODULE, 1091 .owner = THIS_MODULE,
1096 .name = "fuseblk", 1092 .name = "fuseblk",
1097 .get_sb = fuse_get_sb_blk, 1093 .mount = fuse_mount_blk,
1098 .kill_sb = fuse_kill_sb_blk, 1094 .kill_sb = fuse_kill_sb_blk,
1099 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1095 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
1100}; 1096};
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a693..06c48a891832 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode)
190} 190}
191 191
192int 192int
193generic_check_acl(struct inode *inode, int mask) 193generic_check_acl(struct inode *inode, int mask, unsigned int flags)
194{ 194{
195 struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 195 if (flags & IPERM_FLAG_RCU) {
196 196 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
197 if (acl) { 197 return -ECHILD;
198 int error = posix_acl_permission(inode, acl, mask); 198 } else {
199 posix_acl_release(acl); 199 struct posix_acl *acl;
200 return error; 200
201 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
202 if (acl) {
203 int error = posix_acl_permission(inode, acl, mask);
204 posix_acl_release(acl);
205 return error;
206 }
201 } 207 }
202 return -EAGAIN; 208 return -EAGAIN;
203} 209}
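
This is the pattern repeated across the permission and revalidation hooks in this series (fuse_dentry_revalidate's LOOKUP_RCU check, fuse_permission's and gfs2_check_acl's IPERM_FLAG_RCU checks, and the RCU branch above): a callee that would need to block under RCU path walk returns -ECHILD, and the VFS retries in the slower reference-walk mode where sleeping is allowed. The retry contract in a self-contained sketch (the "cache" and its state are invented for the demo):

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

static bool cache_is_hot = false;	/* pretend shared state */

static int lookup_rcu(const char *name, int *result)
{
	(void)name;
	if (!cache_is_hot)
		return -ECHILD;	/* would need to sleep: bail out */
	*result = 42;
	return 0;
}

static int lookup_ref(const char *name, int *result)
{
	(void)name;
	cache_is_hot = true;	/* may block, take locks, do I/O */
	*result = 42;
	return 0;
}

static int lookup(const char *name, int *result)
{
	int err = lookup_rcu(name, result);

	if (err == -ECHILD)	/* retry in the slow, sleepable mode */
		err = lookup_ref(name, result);
	return err;
}

int main(void)
{
	int v, err = lookup("file", &v);

	printf("err=%d value=%d\n", err, err ? 0 : v);
	return 0;
}
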
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943d..7118f1a780a9 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
75 * Returns: errno 75 * Returns: errno
76 */ 76 */
77 77
78int gfs2_check_acl(struct inode *inode, int mask) 78int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
79{ 79{
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU)
84 return -ECHILD;
85
83 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
84 if (IS_ERR(acl)) 87 if (IS_ERR(acl))
85 return PTR_ERR(acl); 88 return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39ea..a93907c8159b 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES 25
18 18
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern const struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -615,10 +615,9 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
616 int alloc_required; 616 int alloc_required;
617 int error = 0; 617 int error = 0;
618 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len;
622 struct page *page; 621 struct page *page;
623 622
624 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 623 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -663,6 +662,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 rblocks += RES_STATFS + RES_QUOTA; 662 rblocks += RES_STATFS + RES_QUOTA;
664 if (&ip->i_inode == sdp->sd_rindex) 663 if (&ip->i_inode == sdp->sd_rindex)
665 rblocks += 2 * RES_STATFS; 664 rblocks += 2 * RES_STATFS;
665 if (alloc_required)
666 rblocks += gfs2_rg_blocks(al);
666 667
667 error = gfs2_trans_begin(sdp, rblocks, 668 error = gfs2_trans_begin(sdp, rblocks,
668 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -689,20 +690,18 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
689 } 690 }
690 691
691prepare_write: 692prepare_write:
692 error = block_prepare_write(page, from, to, gfs2_block_map); 693 error = __block_write_begin(page, from, len, gfs2_block_map);
693out: 694out:
694 if (error == 0) 695 if (error == 0)
695 return 0; 696 return 0;
696 697
697 page_cache_release(page); 698 page_cache_release(page);
698 699
699 /* 700 gfs2_trans_end(sdp);
700 * XXX(truncate): the call below should probably be replaced with
701 * a call to the gfs2-specific truncate blocks helper to actually
702 * release disk blocks..
703 */
704 if (pos + len > ip->i_inode.i_size) 701 if (pos + len > ip->i_inode.i_size)
705 truncate_setsize(&ip->i_inode, ip->i_inode.i_size); 702 gfs2_trim_blocks(&ip->i_inode);
703 goto out_trans_fail;
704
706out_endtrans: 705out_endtrans:
707 gfs2_trans_end(sdp); 706 gfs2_trans_end(sdp);
708out_trans_fail: 707out_trans_fail:
@@ -802,10 +801,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
802 page_cache_release(page); 801 page_cache_release(page);
803 802
804 if (copied) { 803 if (copied) {
805 if (inode->i_size < to) { 804 if (inode->i_size < to)
806 i_size_write(inode, to); 805 i_size_write(inode, to);
807 ip->i_disksize = inode->i_size;
808 }
809 gfs2_dinode_out(ip, di); 806 gfs2_dinode_out(ip, di);
810 mark_inode_dirty(inode); 807 mark_inode_dirty(inode);
811 } 808 }
@@ -876,8 +873,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
876 873
877 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 874 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
878 if (ret > 0) { 875 if (ret > 0) {
879 if (inode->i_size > ip->i_disksize)
880 ip->i_disksize = inode->i_size;
881 gfs2_dinode_out(ip, dibh->b_data); 876 gfs2_dinode_out(ip, dibh->b_data);
882 mark_inode_dirty(inode); 877 mark_inode_dirty(inode);
883 } 878 }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if the @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -764,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
764 int metadata; 763 int metadata;
765 unsigned int revokes = 0; 764 unsigned int revokes = 0;
766 int x; 765 int x;
767 int error; 766 int error = 0;
768 767
769 if (!*top) 768 if (!*top)
770 sm->sm_first = 0; 769 sm->sm_first = 0;
@@ -781,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
781 if (metadata) 780 if (metadata)
782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
783 782
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 783 if (ip != GFS2_I(sdp->sd_rindex))
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
785 else if (!sdp->sd_rgrps)
786 error = gfs2_ri_update(ip);
787
785 if (error) 788 if (error)
786 return error; 789 return error;
787 790
@@ -880,88 +883,20 @@ out_rg_gunlock:
880out_rlist: 883out_rlist:
881 gfs2_rlist_free(&rlist); 884 gfs2_rlist_free(&rlist);
882out: 885out:
883 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 886 if (ip != GFS2_I(sdp->sd_rindex))
887 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
884 return error; 888 return error;
885} 889}
886 890
887/** 891/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 892 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 893 *
958 * This is partly borrowed from ext3. 894 * This is partly borrowed from ext3.
959 */ 895 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 896static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 897{
962 struct inode *inode = mapping->host; 898 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 899 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 900 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 901 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 902 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +958,11 @@ unlock:
1023 return err; 958 return err;
1024} 959}
1025 960
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 961static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 962{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 963 struct gfs2_inode *ip = GFS2_I(inode);
964 struct gfs2_sbd *sdp = GFS2_SB(inode);
965 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 966 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 967 int journaled = gfs2_is_jdata(ip);
1031 int error; 968 int error;
@@ -1039,31 +976,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 976 if (error)
1040 goto out; 977 goto out;
1041 978
979 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
980
1042 if (gfs2_is_stuffed(ip)) { 981 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode); 982 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 983 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 984 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 985 error = gfs2_block_truncate_page(mapping, newsize);
1055 986 if (error)
1056 if (!error) { 987 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 988 }
989 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 990 }
1064 991
1065 brelse(dibh); 992 i_size_write(inode, newsize);
993 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
994 gfs2_dinode_out(ip, dibh->b_data);
1066 995
996 truncate_pagecache(inode, oldsize, newsize);
997out_brelse:
998 brelse(dibh);
1067out: 999out:
1068 gfs2_trans_end(sdp); 1000 gfs2_trans_end(sdp);
1069 return error; 1001 return error;
@@ -1123,7 +1055,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1055 if (error)
1124 goto out; 1056 goto out;
1125 1057
1126 if (!ip->i_disksize) { 1058 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1059 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1060 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1061 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1075,154 @@ out:
1143 1075
1144/** 1076/**
1145 * do_shrink - make a file smaller 1077 * do_shrink - make a file smaller
1146 * @ip: the inode 1078 * @inode: the inode
1147 * @size: the size to make the file 1079 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1080 * @newsize: the size to make the file
1149 * 1081 *
1150 * Called with an exclusive lock on @ip. 1082 * Called with an exclusive lock on @inode. The @newsize must
1083 * be equal to or smaller than the current inode size.
1151 * 1084 *
1152 * Returns: errno 1085 * Returns: errno
1153 */ 1086 */
1154 1087
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1088static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1089{
1090 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1091 int error;
1158 1092
1159 error = trunc_start(ip, size); 1093 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1094 if (error < 0)
1161 return error; 1095 return error;
1162 if (error > 0) 1096 if (gfs2_is_stuffed(ip))
1163 return 0; 1097 return 0;
1164 1098
1165 error = trunc_dealloc(ip, size); 1099 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1100 if (error == 0)
1167 error = trunc_end(ip); 1101 error = trunc_end(ip);
1168 1102
1169 return error; 1103 return error;
1170} 1104}
1171 1105
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1106void gfs2_trim_blocks(struct inode *inode)
1173{ 1107{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1108 u64 size = inode->i_size;
1109 int ret;
1110
1111 ret = do_shrink(inode, size, size);
1112 WARN_ON(ret != 0);
1113}
1114
1115/**
1116 * do_grow - Touch and update inode size
1117 * @inode: The inode
1118 * @size: The new size
1119 *
1120 * This function updates the timestamps on the inode and
1121 * may also increase the size of the inode. This function
1122 * must not be called with @size any smaller than the current
1123 * inode size.
1124 *
1125 * Although it is not strictly required to unstuff files here,
1126 * earlier versions of GFS2 have a bug in the stuffed file reading
1127 * code which will result in a buffer overrun if the size is larger
1128 * than the max stuffed file size. In order to prevent this from
1129 * occurring, such files are unstuffed, but in other cases we can
1130 * just update the inode size directly.
1131 *
1132 * Returns: 0 on success, or -ve on error
1133 */
1134
1135static int do_grow(struct inode *inode, u64 size)
1136{
1137 struct gfs2_inode *ip = GFS2_I(inode);
1138 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1139 struct buffer_head *dibh;
1140 struct gfs2_alloc *al = NULL;
1176 int error; 1141 int error;
1177 1142
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1143 if (gfs2_is_stuffed(ip) &&
1144 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1145 al = gfs2_alloc_get(ip);
1146 if (al == NULL)
1147 return -ENOMEM;
1148
1149 error = gfs2_quota_lock_check(ip);
1150 if (error)
1151 goto do_grow_alloc_put;
1152
1153 al->al_requested = 1;
1154 error = gfs2_inplace_reserve(ip);
1155 if (error)
1156 goto do_grow_qunlock;
1157 }
1158
1159 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1160 if (error)
1180 return error; 1161 goto do_grow_release;
1181 1162
1182 down_write(&ip->i_rw_mutex); 1163 if (al) {
1164 error = gfs2_unstuff_dinode(ip, NULL);
1165 if (error)
1166 goto do_end_trans;
1167 }
1183 1168
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1169 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1170 if (error)
1186 goto do_touch_out; 1171 goto do_end_trans;
1187 1172
1173 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1174 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1175 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1176 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1177 brelse(dibh);
1192 1178
1193do_touch_out: 1179do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1180 gfs2_trans_end(sdp);
1181do_grow_release:
1182 if (al) {
1183 gfs2_inplace_release(ip);
1184do_grow_qunlock:
1185 gfs2_quota_unlock(ip);
1186do_grow_alloc_put:
1187 gfs2_alloc_put(ip);
1188 }
1196 return error; 1189 return error;
1197} 1190}
1198 1191
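
Note the unwind at the end of the rewritten do_grow(): the error labels live inside the if (al) block, so the quota lock and block reservation taken only on the allocation path are released only on that path, while the common path falls straight through both frees. Jumping to a label inside an if is legal C; a generic sketch of the shape (an invented example, with malloc standing in for the quota and reservation steps):

#include <stdio.h>
#include <stdlib.h>

static int do_work(int need_alloc)
{
	void *quota = NULL, *reservation = NULL;
	int err = 0;

	if (need_alloc) {
		quota = malloc(16);		/* stands in for the quota lock */
		if (!quota)
			return -1;
		reservation = malloc(16);	/* stands in for the reservation */
		if (!reservation) {
			err = -1;
			goto out_unlock_quota;
		}
	}

	printf("common-path work (need_alloc=%d)\n", need_alloc);

	if (need_alloc) {
		free(reservation);
out_unlock_quota:
		free(quota);
	}
	return err;
}

int main(void)
{
	do_work(0);
	do_work(1);
	return 0;
}
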
1199/** 1192/**
1200 * gfs2_truncatei - make a file a given size 1193 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1194 * @inode: the inode
1202 * @size: the size to make the file 1195 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1196 *
1205 * The file size can grow, shrink, or stay the same size. 1197 * The file size can grow, shrink, or stay the same size. This
1198 * is called holding i_mutex and an exclusive glock on the inode
1199 * in question.
1206 * 1200 *
1207 * Returns: errno 1201 * Returns: errno
1208 */ 1202 */
1209 1203
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1204int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1205{
1212 int error; 1206 int ret;
1207 u64 oldsize;
1213 1208
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1209 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1210
1217 if (size > ip->i_disksize) 1211 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1212 if (ret)
1219 else if (size < ip->i_disksize) 1213 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1214
1225 return error; 1215 oldsize = inode->i_size;
1216 if (newsize >= oldsize)
1217 return do_grow(inode, newsize);
1218
1219 return do_shrink(inode, oldsize, newsize);
1226} 1220}
1227 1221
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1222int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1223{
1230 int error; 1224 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1225 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1226 if (!error)
1233 error = trunc_end(ip); 1227 error = trunc_end(ip);
1234 return error; 1228 return error;
@@ -1269,7 +1263,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1269 1263
1270 shift = sdp->sd_sb.sb_bsize_shift; 1264 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1265 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1266 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1267 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1268 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1269 if (lblock_stop > end_of_file)
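
gfs2_write_alloc_required() now reads the size via i_size_read() and rounds it up to whole filesystem blocks with the usual (size + bsize - 1) >> shift idiom before comparing block indices rather than byte offsets. A quick standalone check of the rounding (4KiB blocks assumed):

#include <stdio.h>

int main(void)
{
	unsigned shift = 12;			/* 4096-byte blocks */
	unsigned long long bsize = 1ULL << shift;
	unsigned long long size = 5000;		/* file size in bytes */

	/* Round up: 5000 bytes occupy 2 blocks, not 5000 >> 12 == 1. */
	unsigned long long end_of_file = (size + bsize - 1) >> shift;
	printf("end_of_file = %llu blocks\n", end_of_file);
	return 0;
}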
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..4a456338b873 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/gfs2_ondisk.h> 13#include <linux/gfs2_ondisk.h>
14#include <linux/namei.h>
14#include <linux/crc32.h> 15#include <linux/crc32.h>
15 16
16#include "gfs2.h" 17#include "gfs2.h"
@@ -34,22 +35,30 @@
34 35
35static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) 36static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
36{ 37{
37 struct dentry *parent = dget_parent(dentry); 38 struct dentry *parent;
38 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); 39 struct gfs2_sbd *sdp;
39 struct gfs2_inode *dip = GFS2_I(parent->d_inode); 40 struct gfs2_inode *dip;
40 struct inode *inode = dentry->d_inode; 41 struct inode *inode;
41 struct gfs2_holder d_gh; 42 struct gfs2_holder d_gh;
42 struct gfs2_inode *ip = NULL; 43 struct gfs2_inode *ip = NULL;
43 int error; 44 int error;
44 int had_lock = 0; 45 int had_lock = 0;
45 46
47 if (nd->flags & LOOKUP_RCU)
48 return -ECHILD;
49
50 parent = dget_parent(dentry);
51 sdp = GFS2_SB(parent->d_inode);
52 dip = GFS2_I(parent->d_inode);
53 inode = dentry->d_inode;
54
46 if (inode) { 55 if (inode) {
47 if (is_bad_inode(inode)) 56 if (is_bad_inode(inode))
48 goto invalid; 57 goto invalid;
49 ip = GFS2_I(inode); 58 ip = GFS2_I(inode);
50 } 59 }
51 60
52 if (sdp->sd_args.ar_localcaching) 61 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 62 goto valid;
54 63
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 64 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
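
The LOOKUP_RCU bail-out added at the top of gfs2_drevalidate() follows the standard contract for d_revalidate under rcu-walk: a filesystem that may sleep while revalidating (gfs2 takes glocks below) returns -ECHILD so the VFS retries in ref-walk mode. The dget_parent() call is likewise deferred until after the check, since taking a reference is itself not allowed in rcu-walk. The general shape, as a kernel-context sketch rather than a standalone program:

/* Sketch only: the generic rcu-walk bail-out for the 2.6.38-era
 * d_revalidate prototype (dentry + nameidata). */
static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;		/* we may sleep below: force ref-walk */

	/* ... safe to take references and sleeping locks from here ... */
	return 1;			/* dentry still valid */
}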
@@ -100,13 +109,14 @@ fail:
100 return 0; 109 return 0;
101} 110}
102 111
103static int gfs2_dhash(struct dentry *dentry, struct qstr *str) 112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *str)
104{ 114{
105 str->hash = gfs2_disk_hash(str->name, str->len); 115 str->hash = gfs2_disk_hash(str->name, str->len);
106 return 0; 116 return 0;
107} 117}
108 118
109static int gfs2_dentry_delete(struct dentry *dentry) 119static int gfs2_dentry_delete(const struct dentry *dentry)
110{ 120{
111 struct gfs2_inode *ginode; 121 struct gfs2_inode *ginode;
112 122
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
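
Several hunks in this file repeat the same consistency check: an exhash directory of depth i_depth stores 1 << i_depth little-endian u64 leaf pointers as its data, so the inode size must equal hsize * sizeof(u64) exactly; any other value means the dinode is corrupt. The arithmetic, as a standalone check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned depth = 9;			/* example i_depth */
	uint64_t hsize = 1ULL << depth;		/* hash-table slots */
	uint64_t expect = hsize * sizeof(uint64_t);

	/* A depth-9 table is 512 slots * 8 bytes = 4096 bytes. */
	printf("depth %u -> %llu slots, i_size must be %llu bytes\n",
	       depth, (unsigned long long)hsize, (unsigned long long)expect);
	return 0;
}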
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1057 u64 *buf; 1061 u64 *buf;
1058 u64 *from, *to; 1062 u64 *from, *to;
1059 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1060 int x; 1065 int x;
1061 int error = 0; 1066 int error = 0;
1062 1067
1063 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1064 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1065 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1066 return -EIO; 1071 return -EIO;
1067 } 1072 }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1072 if (!buf) 1077 if (!buf)
1073 return -ENOMEM; 1078 return -ENOMEM;
1074 1079
1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1077 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1078 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1375 unsigned depth = 0;
1371 1376
1372 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1374 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1375 return -EIO; 1380 return -EIO;
1376 } 1381 }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1789 int error = 0;
1785 1790
1786 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1788 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1789 return -EIO; 1794 return -EIO;
1790 } 1795 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..9023db8184f9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,29 +126,14 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot; 129 return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
130 struct dentry *dentry;
131
132 /*
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops;
141 return dentry;
142} 130}
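
gfs2_get_parent() used to build a qstr for ".." on every call (the removed XXX comment even suggested caching it); the patch replaces that with the shared gfs2_qdot/gfs2_qdotdot objects declared in dir.c. Their initialization site is not part of these hunks; presumably they are filled in once on the module init path with something like the following (hypothetical placement, using the real gfs2_str2qstr helper seen in the removed lines):

/* Sketch: one-time setup of the shared qstrs, assumed to run from
 * module init; the exact location is not shown in this diff. */
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;

static void gfs2_init_names(void)
{
	gfs2_str2qstr(&gfs2_qdot, ".");
	gfs2_str2qstr(&gfs2_qdotdot, "..");
}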
143 131
144static struct dentry *gfs2_get_dentry(struct super_block *sb, 132static struct dentry *gfs2_get_dentry(struct super_block *sb,
145 struct gfs2_inum_host *inum) 133 struct gfs2_inum_host *inum)
146{ 134{
147 struct gfs2_sbd *sdp = sb->s_fs_info; 135 struct gfs2_sbd *sdp = sb->s_fs_info;
148 struct gfs2_holder i_gh;
149 struct inode *inode; 136 struct inode *inode;
150 struct dentry *dentry;
151 int error;
152 137
153 inode = gfs2_ilookup(sb, inum->no_addr); 138 inode = gfs2_ilookup(sb, inum->no_addr);
154 if (inode) { 139 if (inode) {
@@ -159,52 +144,13 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
159 goto out_inode; 144 goto out_inode;
160 } 145 }
161 146
162 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 147 inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
163 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 148 GFS2_BLKST_DINODE);
164 if (error) 149 if (IS_ERR(inode))
165 return ERR_PTR(error); 150 return ERR_CAST(inode);
166
167 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
168 if (error)
169 goto fail;
170
171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
172 if (IS_ERR(inode)) {
173 error = PTR_ERR(inode);
174 goto fail;
175 }
176
177 error = gfs2_inode_refresh(GFS2_I(inode));
178 if (error) {
179 iput(inode);
180 goto fail;
181 }
182
183 /* Pick up the works we bypass in gfs2_inode_lookup */
184 if (inode->i_state & I_NEW)
185 gfs2_set_iop(inode);
186
187 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
188 iput(inode);
189 goto fail;
190 }
191
192 error = -EIO;
193 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
194 iput(inode);
195 goto fail;
196 }
197
198 gfs2_glock_dq_uninit(&i_gh);
199 151
200out_inode: 152out_inode:
201 dentry = d_obtain_alias(inode); 153 return d_obtain_alias(inode);
202 if (!IS_ERR(dentry))
203 dentry->d_op = &gfs2_dops;
204 return dentry;
205fail:
206 gfs2_glock_dq_uninit(&i_gh);
207 return ERR_PTR(error);
208} 154}
209 155
210static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 156static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/falloc.h>
23#include <linux/swap.h>
22#include <linux/crc32.h> 24#include <linux/crc32.h>
23#include <linux/writeback.h> 25#include <linux/writeback.h>
24#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -241,7 +243,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
241 !capable(CAP_LINUX_IMMUTABLE)) 243 !capable(CAP_LINUX_IMMUTABLE))
242 goto out; 244 goto out;
243 if (!IS_IMMUTABLE(inode)) { 245 if (!IS_IMMUTABLE(inode)) {
244 error = gfs2_permission(inode, MAY_WRITE); 246 error = gfs2_permission(inode, MAY_WRITE, 0);
245 if (error) 247 if (error)
246 goto out; 248 goto out;
247 } 249 }
@@ -382,8 +384,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 rblocks = RES_DINODE + ind_blocks; 384 rblocks = RES_DINODE + ind_blocks;
383 if (gfs2_is_jdata(ip)) 385 if (gfs2_is_jdata(ip))
384 rblocks += data_blocks ? data_blocks : 1; 386 rblocks += data_blocks ? data_blocks : 1;
385 if (ind_blocks || data_blocks) 387 if (ind_blocks || data_blocks) {
386 rblocks += RES_STATFS + RES_QUOTA; 388 rblocks += RES_STATFS + RES_QUOTA;
389 rblocks += gfs2_rg_blocks(al);
390 }
387 ret = gfs2_trans_begin(sdp, rblocks, 0); 391 ret = gfs2_trans_begin(sdp, rblocks, 0);
388 if (ret) 392 if (ret)
389 goto out_trans_fail; 393 goto out_trans_fail;
@@ -491,7 +495,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
491 goto fail; 495 goto fail;
492 496
493 if (!(file->f_flags & O_LARGEFILE) && 497 if (!(file->f_flags & O_LARGEFILE) &&
494 ip->i_disksize > MAX_NON_LFS) { 498 i_size_read(inode) > MAX_NON_LFS) {
495 error = -EOVERFLOW; 499 error = -EOVERFLOW;
496 goto fail_gunlock; 500 goto fail_gunlock;
497 } 501 }
@@ -608,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
608 return generic_file_aio_write(iocb, iov, nr_segs, pos); 612 return generic_file_aio_write(iocb, iov, nr_segs, pos);
609} 613}
610 614
615static void empty_write_end(struct page *page, unsigned from,
616 unsigned to)
617{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619
620 page_zero_new_buffers(page, from, to);
621 flush_dcache_page(page);
622 mark_page_accessed(page);
623
624 if (!gfs2_is_writeback(ip))
625 gfs2_page_add_databufs(ip, page, from, to);
626
627 block_commit_write(page, from, to);
628}
629
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
631{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error;
635
636 if (!page_has_buffers(page)) {
637 error = __block_write_begin(page, from, to - from, gfs2_block_map);
638 if (unlikely(error))
639 return error;
640
641 empty_write_end(page, from, to);
642 return 0;
643 }
644
645 bh = head = page_buffers(page);
646 next = end = 0;
647 while (next < from) {
648 next += bh->b_size;
649 bh = bh->b_this_page;
650 }
651 start = next;
652 do {
653 next += bh->b_size;
654 if (buffer_mapped(bh)) {
655 if (end) {
656 error = __block_write_begin(page, start, end - start,
657 gfs2_block_map);
658 if (unlikely(error))
659 return error;
660 empty_write_end(page, start, end);
661 end = 0;
662 }
663 start = next;
664 }
665 else
666 end = next;
667 bh = bh->b_this_page;
668 } while (next < to);
669
670 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error))
673 return error;
674 empty_write_end(page, start, end);
675 }
676
677 return 0;
678}
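
write_empty_blocks() batches consecutive unmapped buffers into one __block_write_begin() call per run, flushing a pending run whenever it meets a mapped buffer and once more after the loop. The run-merging is the fiddly part; here is a small userspace model of it, with an int array of mapped flags standing in for the page's buffer ring (uniform buffer size assumed, and the caller keeps [from,to) within the array):

#include <stdio.h>

#define BSZ 512		/* model buffer size */

static void write_runs(const int *mapped, unsigned from, unsigned to)
{
	unsigned start, end = 0, next = 0;
	int i = 0;

	while (next < from) {		/* skip buffers wholly before 'from' */
		next += BSZ;
		i++;
	}
	start = next;
	do {
		next += BSZ;
		if (mapped[i]) {
			if (end) {	/* flush the pending unmapped run */
				printf("write [%u,%u)\n", start, end);
				end = 0;
			}
			start = next;
		} else {
			end = next;	/* extend the current run */
		}
		i++;
	} while (next < to);

	if (end)			/* final run reaches 'to' */
		printf("write [%u,%u)\n", start, end);
}

int main(void)
{
	int mapped[8] = { 1, 0, 0, 1, 0, 1, 1, 0 };

	/* Prints [512,1536), [2048,2560) and [3584,4096). */
	write_runs(mapped, 512, 4096);
	return 0;
}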
679
680static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
681 int mode)
682{
683 struct gfs2_inode *ip = GFS2_I(inode);
684 struct buffer_head *dibh;
685 int error;
686 u64 start = offset >> PAGE_CACHE_SHIFT;
687 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
688 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
689 pgoff_t curr;
690 struct page *page;
691 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
692 unsigned int from, to;
693
694 if (!end_offset)
695 end_offset = PAGE_CACHE_SIZE;
696
697 error = gfs2_meta_inode_buffer(ip, &dibh);
698 if (unlikely(error))
699 goto out;
700
701 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
702
703 if (gfs2_is_stuffed(ip)) {
704 error = gfs2_unstuff_dinode(ip, NULL);
705 if (unlikely(error))
706 goto out;
707 }
708
709 curr = start;
710 offset = start << PAGE_CACHE_SHIFT;
711 from = start_offset;
712 to = PAGE_CACHE_SIZE;
713 while (curr <= end) {
714 page = grab_cache_page_write_begin(inode->i_mapping, curr,
715 AOP_FLAG_NOFS);
716 if (unlikely(!page)) {
717 error = -ENOMEM;
718 goto out;
719 }
720
721 if (curr == end)
722 to = end_offset;
723 error = write_empty_blocks(page, from, to);
724 if (!error && offset + to > inode->i_size &&
725 !(mode & FALLOC_FL_KEEP_SIZE)) {
726 i_size_write(inode, offset + to);
727 }
728 unlock_page(page);
729 page_cache_release(page);
730 if (error)
731 goto out;
732 curr++;
733 offset += PAGE_CACHE_SIZE;
734 from = 0;
735 }
736
737 gfs2_dinode_out(ip, dibh->b_data);
738 mark_inode_dirty(inode);
739
740 brelse(dibh);
741
742out:
743 return error;
744}
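
fallocate_chunk() converts the byte range into page-cache coordinates: start and end page indices plus intra-page offsets, with the special case that a range ending exactly on a page boundary gets end_offset fixed up from 0 to a full page. A worked userspace version of that arithmetic (4KiB pages assumed, mirroring the PAGE_CACHE_* macros):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	uint64_t offset = 5000, len = 10000;

	uint64_t start = offset >> PAGE_SHIFT;			/* first page index */
	unsigned from = offset & ~PAGE_MASK;			/* offset within it */
	uint64_t end = (offset + len - 1) >> PAGE_SHIFT;	/* last page index */
	unsigned to = (offset + len) & ~PAGE_MASK;		/* end within last page */

	if (!to)		/* range ends exactly on a page boundary */
		to = PAGE_SIZE;

	/* Bytes 5000..14999 span pages 1..3: write [904,4096) on page 1,
	 * the whole of page 2, and [0,2712) on page 3. */
	printf("pages %llu..%llu, first from=%u, last to=%u\n",
	       (unsigned long long)start, (unsigned long long)end, from, to);
	return 0;
}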
745
746static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
747 unsigned int *data_blocks, unsigned int *ind_blocks)
748{
749 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
750 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
751 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
752
753 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
754 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
755 max_data -= tmp;
756 }
757 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
758 so it might end up with fewer data blocks */
759 if (max_data <= *data_blocks)
760 return;
761 *data_blocks = max_data;
762 *ind_blocks = max_blocks - max_data;
763 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
764 if (*len > max) {
765 *len = max;
766 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
767 }
768}
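
calc_max_reserv() estimates how much of a reservation can be data: starting from the rgrp's free blocks minus a small fixed allowance per tree height, it repeatedly peels off the indirect blocks needed to address what remains, approximating gfs2_write_calc_reserv() in reverse, as the comment says. The loop is easier to see with concrete numbers; a standalone model using roughly the 4KiB-block geometry (483 direct pointers in a dinode, 509 pointers per indirect block; both are assumptions of this sketch):

#include <stdio.h>

int main(void)
{
	unsigned max_blocks = 65536;	/* free blocks in the resource group */
	unsigned max_height = 10;	/* stand-in for sdp->sd_max_height */
	unsigned diptrs = 483;		/* direct pointers in the dinode */
	unsigned inptrs = 509;		/* pointers per indirect block */

	unsigned max_data = max_blocks - 3 * (max_height - 1);
	unsigned tmp;

	/* Peel off indirect blocks until the rest fits behind the dinode. */
	for (tmp = max_data; tmp > diptrs;) {
		tmp = (tmp + inptrs - 1) / inptrs;	/* DIV_ROUND_UP */
		max_data -= tmp;
	}

	printf("%u of %u blocks can be data (%u set aside for metadata)\n",
	       max_data, max_blocks, max_blocks - max_data);
	return 0;
}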
769
770static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
771 loff_t len)
772{
773 struct inode *inode = file->f_path.dentry->d_inode;
774 struct gfs2_sbd *sdp = GFS2_SB(inode);
775 struct gfs2_inode *ip = GFS2_I(inode);
776 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
777 loff_t bytes, max_bytes;
778 struct gfs2_alloc *al;
779 int error;
780 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
781 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
782
783 /* We only support the FALLOC_FL_KEEP_SIZE mode */
784 if (mode & ~FALLOC_FL_KEEP_SIZE)
785 return -EOPNOTSUPP;
786
787 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
788 sdp->sd_sb.sb_bsize_shift;
789
790 len = next - offset;
791 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
792 if (!bytes)
793 bytes = UINT_MAX;
794
795 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
796 error = gfs2_glock_nq(&ip->i_gh);
797 if (unlikely(error))
798 goto out_uninit;
799
800 if (!gfs2_write_alloc_required(ip, offset, len))
801 goto out_unlock;
802
803 while (len > 0) {
804 if (len < bytes)
805 bytes = len;
806 al = gfs2_alloc_get(ip);
807 if (!al) {
808 error = -ENOMEM;
809 goto out_unlock;
810 }
811
812 error = gfs2_quota_lock_check(ip);
813 if (error)
814 goto out_alloc_put;
815
816retry:
817 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
818
819 al->al_requested = data_blocks + ind_blocks;
820 error = gfs2_inplace_reserve(ip);
821 if (error) {
822 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
823 bytes >>= 1;
824 goto retry;
825 }
826 goto out_qunlock;
827 }
828 max_bytes = bytes;
829 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
830 al->al_requested = data_blocks + ind_blocks;
831
832 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
833 RES_RG_HDR + gfs2_rg_blocks(al);
834 if (gfs2_is_jdata(ip))
835 rblocks += data_blocks ? data_blocks : 1;
836
837 error = gfs2_trans_begin(sdp, rblocks,
838 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
839 if (error)
840 goto out_trans_fail;
841
842 error = fallocate_chunk(inode, offset, max_bytes, mode);
843 gfs2_trans_end(sdp);
844
845 if (error)
846 goto out_trans_fail;
847
848 len -= max_bytes;
849 offset += max_bytes;
850 gfs2_inplace_release(ip);
851 gfs2_quota_unlock(ip);
852 gfs2_alloc_put(ip);
853 }
854 goto out_unlock;
855
856out_trans_fail:
857 gfs2_inplace_release(ip);
858out_qunlock:
859 gfs2_quota_unlock(ip);
860out_alloc_put:
861 gfs2_alloc_put(ip);
862out_unlock:
863 gfs2_glock_dq(&ip->i_gh);
864out_uninit:
865 gfs2_holder_uninit(&ip->i_gh);
866 return error;
867}
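
The allocation loop in gfs2_fallocate() reserves space one chunk at a time and, on -ENOSPC, halves the chunk and retries until it is down to a single filesystem block. A compact standalone model of that retry policy, with a fake reserve() standing in for gfs2_inplace_reserve():

#include <errno.h>
#include <stdio.h>

#define BSIZE 4096UL

/* Fake reservation: pretend only requests of 64KiB or less succeed. */
static int reserve(unsigned long bytes)
{
	return bytes <= 16 * BSIZE ? 0 : -ENOSPC;
}

int main(void)
{
	unsigned long len = 1UL << 20;		/* 1MiB left to allocate */
	unsigned long bytes = 1UL << 19;	/* initial chunk size */

	while (len > 0) {
		if (len < bytes)
			bytes = len;
retry:
		if (reserve(bytes)) {
			if (bytes > BSIZE) {	/* halve and retry */
				bytes >>= 1;
				goto retry;
			}
			fprintf(stderr, "out of space\n");
			return 1;
		}
		printf("reserved %lu bytes\n", bytes);
		len -= bytes;		/* bytes stays halved, as in gfs2 */
	}
	return 0;
}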
868
611#ifdef CONFIG_GFS2_FS_LOCKING_DLM 869#ifdef CONFIG_GFS2_FS_LOCKING_DLM
612 870
613/** 871/**
@@ -620,6 +878,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
620 * cluster; until we do, disable leases (by just returning -EINVAL), 878 * cluster; until we do, disable leases (by just returning -EINVAL),
621 * unless the administrator has requested purely local locking. 879 * unless the administrator has requested purely local locking.
622 * 880 *
881 * Locking: called under lock_flocks
882 *
623 * Returns: errno 883 * Returns: errno
624 */ 884 */
625 885
@@ -761,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
761 .splice_read = generic_file_splice_read, 1021 .splice_read = generic_file_splice_read,
762 .splice_write = generic_file_splice_write, 1022 .splice_write = generic_file_splice_write,
763 .setlease = gfs2_setlease, 1023 .setlease = gfs2_setlease,
1024 .fallocate = gfs2_fallocate,
764}; 1025};
765 1026
766const struct file_operations gfs2_dir_fops = { 1027const struct file_operations gfs2_dir_fops = {
@@ -771,6 +1032,7 @@ const struct file_operations gfs2_dir_fops = {
771 .fsync = gfs2_fsync, 1032 .fsync = gfs2_fsync,
772 .lock = gfs2_lock, 1033 .lock = gfs2_lock,
773 .flock = gfs2_flock, 1034 .flock = gfs2_flock,
1035 .llseek = default_llseek,
774}; 1036};
775 1037
776#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 1038#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -789,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
789 .splice_read = generic_file_splice_read, 1051 .splice_read = generic_file_splice_read,
790 .splice_write = generic_file_splice_write, 1052 .splice_write = generic_file_splice_write,
791 .setlease = generic_setlease, 1053 .setlease = generic_setlease,
1054 .fallocate = gfs2_fallocate,
792}; 1055};
793 1056
794const struct file_operations gfs2_dir_fops_nolock = { 1057const struct file_operations gfs2_dir_fops_nolock = {
@@ -797,5 +1060,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
797 .open = gfs2_open, 1060 .open = gfs2_open,
798 .release = gfs2_close, 1061 .release = gfs2_close,
799 .fsync = gfs2_fsync, 1062 .fsync = gfs2_fsync,
1063 .llseek = default_llseek,
800}; 1064};
801 1065
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
441 else 441 else
442 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
443 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
444 446
445 gl->gl_state = new_state; 447 gl->gl_state = new_state;
446 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -539,21 +541,6 @@ out_locked:
539 spin_unlock(&gl->gl_spin); 541 spin_unlock(&gl->gl_spin);
540} 542}
541 543
542static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
543 unsigned int req_state,
544 unsigned int flags)
545{
546 int ret = LM_OUT_ERROR;
547
548 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
549 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
550
551 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
552 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
553 req_state, flags);
554 return ret;
555}
556
557/** 544/**
558 * do_xmote - Calls the DLM to change the state of a lock 545 * do_xmote - Calls the DLM to change the state of a lock
559 * @gl: The lock state 546 * @gl: The lock state
@@ -573,13 +560,14 @@ __acquires(&gl->gl_spin)
573 560
574 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | 561 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
575 LM_FLAG_PRIORITY); 562 LM_FLAG_PRIORITY);
576 BUG_ON(gl->gl_state == target); 563 GLOCK_BUG_ON(gl, gl->gl_state == target);
577 BUG_ON(gl->gl_state == gl->gl_target); 564 GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
578 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && 565 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
579 glops->go_inval) { 566 glops->go_inval) {
580 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 567 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
581 do_error(gl, 0); /* Fail queued try locks */ 568 do_error(gl, 0); /* Fail queued try locks */
582 } 569 }
570 gl->gl_req = target;
583 spin_unlock(&gl->gl_spin); 571 spin_unlock(&gl->gl_spin);
584 if (glops->go_xmote_th) 572 if (glops->go_xmote_th)
585 glops->go_xmote_th(gl); 573 glops->go_xmote_th(gl);
@@ -592,15 +580,17 @@ __acquires(&gl->gl_spin)
592 gl->gl_state == LM_ST_DEFERRED) && 580 gl->gl_state == LM_ST_DEFERRED) &&
593 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 581 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
594 lck_flags |= LM_FLAG_TRY_1CB; 582 lck_flags |= LM_FLAG_TRY_1CB;
595 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
596 583
597 if (!(ret & LM_OUT_ASYNC)) { 584 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
598 finish_xmote(gl, ret); 585 /* lock_dlm */
586 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
587 GLOCK_BUG_ON(gl, ret);
588 } else { /* lock_nolock */
589 finish_xmote(gl, target);
599 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 590 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
600 gfs2_glock_put(gl); 591 gfs2_glock_put(gl);
601 } else {
602 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
603 } 592 }
593
604 spin_lock(&gl->gl_spin); 594 spin_lock(&gl->gl_spin);
605} 595}
606 596
@@ -684,21 +674,20 @@ static void delete_work_func(struct work_struct *work)
684{ 674{
685 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); 675 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
686 struct gfs2_sbd *sdp = gl->gl_sbd; 676 struct gfs2_sbd *sdp = gl->gl_sbd;
687 struct gfs2_inode *ip = NULL; 677 struct gfs2_inode *ip;
688 struct inode *inode; 678 struct inode *inode;
689 u64 no_addr = 0; 679 u64 no_addr = gl->gl_name.ln_number;
680
681 ip = gl->gl_object;
682 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
690 683
691 spin_lock(&gl->gl_spin);
692 ip = (struct gfs2_inode *)gl->gl_object;
693 if (ip) 684 if (ip)
694 no_addr = ip->i_no_addr;
695 spin_unlock(&gl->gl_spin);
696 if (ip) {
697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 685 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
698 if (inode) { 686 else
699 d_prune_aliases(inode); 687 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
700 iput(inode); 688 if (inode && !IS_ERR(inode)) {
701 } 689 d_prune_aliases(inode);
690 iput(inode);
702 } 691 }
703 gfs2_glock_put(gl); 692 gfs2_glock_put(gl);
704} 693}
@@ -950,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
950 939
951void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 940void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
952{ 941{
942 struct va_format vaf;
953 va_list args; 943 va_list args;
954 944
955 va_start(args, fmt); 945 va_start(args, fmt);
946
956 if (seq) { 947 if (seq) {
957 struct gfs2_glock_iter *gi = seq->private; 948 struct gfs2_glock_iter *gi = seq->private;
958 vsprintf(gi->string, fmt, args); 949 vsprintf(gi->string, fmt, args);
959 seq_printf(seq, gi->string); 950 seq_printf(seq, gi->string);
960 } else { 951 } else {
961 printk(KERN_ERR " "); 952 vaf.fmt = fmt;
962 vprintk(fmt, args); 953 vaf.va = &args;
954
955 printk(KERN_ERR " %pV", &vaf);
963 } 956 }
957
964 va_end(args); 958 va_end(args);
965} 959}
966 960
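
The else branch now forwards the varargs through a struct va_format and the kernel's %pV format specifier instead of issuing two printk calls, so the prefix and the caller's format come out as a single message. The idiom in isolation (kernel-context sketch; %pV has no userspace equivalent):

/* Sketch: forwarding printf-style varargs via %pV (kernel only). */
static void log_with_prefix(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_ERR "prefix: %pV", &vaf);
	va_end(args);
}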
@@ -1012,6 +1006,7 @@ fail:
1012 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1006 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1013 insert_pt = &gh2->gh_list; 1007 insert_pt = &gh2->gh_list;
1014 } 1008 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags);
1015 if (likely(insert_pt == NULL)) { 1010 if (likely(insert_pt == NULL)) {
1016 list_add_tail(&gh->gh_list, &gl->gl_holders); 1011 list_add_tail(&gh->gh_list, &gl->gl_holders);
1017 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1305,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1310 1305
1311 gfs2_glock_hold(gl); 1306 gfs2_glock_hold(gl);
1312 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1307 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1313 if (time_before(now, holdtime)) 1308 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1314 delay = holdtime - now; 1309 if (time_before(now, holdtime))
1315 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1310 delay = holdtime - now;
1316 delay = gl->gl_ops->go_min_hold_time; 1311 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1312 delay = gl->gl_ops->go_min_hold_time;
1313 }
1317 1314
1318 spin_lock(&gl->gl_spin); 1315 spin_lock(&gl->gl_spin);
1319 handle_callback(gl, state, delay); 1316 handle_callback(gl, state, delay);
@@ -1357,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
1357 * @gl: Pointer to the glock 1354 * @gl: Pointer to the glock
1358 * @ret: The return value from the dlm 1355 * @ret: The return value from the dlm
1359 * 1356 *
1357 * The gl_reply field is under the gl_spin lock so that it is ok
1358 * to use a bitfield shared with other glock state fields.
1360 */ 1359 */
1361 1360
1362void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1361void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1363{ 1362{
1364 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1363 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1365 1364
1365 spin_lock(&gl->gl_spin);
1366 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1367 1367
1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1369 spin_lock(&gl->gl_spin);
1370 if (gfs2_should_freeze(gl)) { 1369 if (gfs2_should_freeze(gl)) {
1371 set_bit(GLF_FROZEN, &gl->gl_flags); 1370 set_bit(GLF_FROZEN, &gl->gl_flags);
1372 spin_unlock(&gl->gl_spin); 1371 spin_unlock(&gl->gl_spin);
1373 return; 1372 return;
1374 } 1373 }
1375 spin_unlock(&gl->gl_spin);
1376 } 1374 }
1375
1376 spin_unlock(&gl->gl_spin);
1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1378 smp_wmb();
1378 gfs2_glock_hold(gl); 1379 gfs2_glock_hold(gl);
1379 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1380 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1380 gfs2_glock_put(gl); 1381 gfs2_glock_put(gl);
@@ -1512,7 +1513,7 @@ static void clear_glock(struct gfs2_glock *gl)
1512 spin_unlock(&lru_lock); 1513 spin_unlock(&lru_lock);
1513 1514
1514 spin_lock(&gl->gl_spin); 1515 spin_lock(&gl->gl_spin);
1515 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1516 if (gl->gl_state != LM_ST_UNLOCKED)
1516 handle_callback(gl, LM_ST_UNLOCKED, 0); 1517 handle_callback(gl, LM_ST_UNLOCKED, 0);
1517 spin_unlock(&gl->gl_spin); 1518 spin_unlock(&gl->gl_spin);
1518 gfs2_glock_hold(gl); 1519 gfs2_glock_hold(gl);
@@ -1622,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1622static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1623static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1623{ 1624{
1624 struct task_struct *gh_owner = NULL; 1625 struct task_struct *gh_owner = NULL;
1625 char buffer[KSYM_SYMBOL_LEN];
1626 char flags_buf[32]; 1626 char flags_buf[32];
1627 1627
1628 sprint_symbol(buffer, gh->gh_ip);
1629 if (gh->gh_owner_pid) 1628 if (gh->gh_owner_pid)
1630 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1629 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1631 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", 1630 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
1632 state2str(gh->gh_state), 1631 state2str(gh->gh_state),
1633 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), 1632 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1634 gh->gh_error, 1633 gh->gh_error,
1635 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1634 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1636 gh_owner ? gh_owner->comm : "(ended)", buffer); 1635 gh_owner ? gh_owner->comm : "(ended)",
1636 (void *)gh->gh_ip);
1637 return 0; 1637 return 0;
1638} 1638}
1639 1639
@@ -1660,6 +1660,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1660 *p++ = 'I'; 1660 *p++ = 'I';
1661 if (test_bit(GLF_FROZEN, gflags)) 1661 if (test_bit(GLF_FROZEN, gflags))
1662 *p++ = 'F'; 1662 *p++ = 'F';
1663 if (test_bit(GLF_QUEUED, gflags))
1664 *p++ = 'q';
1663 *p = 0; 1665 *p = 0;
1664 return buf; 1666 return buf;
1665} 1667}
@@ -1776,10 +1778,13 @@ int __init gfs2_glock_init(void)
1776 } 1778 }
1777#endif 1779#endif
1778 1780
1779 glock_workqueue = create_workqueue("glock_workqueue"); 1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1780 if (IS_ERR(glock_workqueue)) 1783 if (IS_ERR(glock_workqueue))
1781 return PTR_ERR(glock_workqueue); 1784 return PTR_ERR(glock_workqueue);
1782 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE,
1787 0);
1783 if (IS_ERR(gfs2_delete_workqueue)) { 1788 if (IS_ERR(gfs2_delete_workqueue)) {
1784 destroy_workqueue(glock_workqueue); 1789 destroy_workqueue(glock_workqueue);
1785 return PTR_ERR(gfs2_delete_workqueue); 1790 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
87#define GL_ASYNC 0x00000040 87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080 88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100 89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400 90#define GL_NOCACHE 0x00000400
92 91
93/* 92/*
94 * lm_lock() and lm_async_cb return flags 93 * lm_async_cb return flags
95 * 94 *
96 * LM_OUT_ST_MASK 95 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value. 96 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
99 * LM_OUT_CANCELED 98 * LM_OUT_CANCELED
100 * The lock request was canceled. 99 * The lock request was canceled.
101 * 100 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */ 101 */
106 102
107#define LM_OUT_ST_MASK 0x00000003 103#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008 104#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080 105#define LM_OUT_ERROR 0x00000004
110#define LM_OUT_ERROR 0x00000100
111 106
112/* 107/*
113 * lm_recovery_done() messages 108 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
128 unsigned int req_state, unsigned int flags); 123 unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens; 125 const match_table_t *lm_tokens;
131}; 126};
132 127
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
138
139#define GL_ASYNC 0x00000040
140#define GL_EXACT 0x00000080
141#define GL_SKIP 0x00000100
142#define GL_NOCACHE 0x00000400
143
144#define GLR_TRYFAILED 13
145
146extern struct workqueue_struct *gfs2_delete_workqueue; 128extern struct workqueue_struct *gfs2_delete_workqueue;
147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 129static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
148{ 130{
@@ -212,10 +194,12 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
212int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 194int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
213void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 195void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
214void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 196void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
197
198__attribute__ ((format(printf, 2, 3)))
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 199void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 200
217/** 201/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 202 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 203 * @gl: the glock
220 * @state: the state we're requesting 204 * @state: the state we're requesting
221 * @flags: the modifier flags 205 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
262 const struct gfs2_inode *ip = gl->gl_object; 262 const struct gfs2_inode *ip = gl->gl_object;
263 if (ip == NULL) 263 if (ip == NULL)
264 return 0; 264 return 0;
265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", 265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
266 (unsigned long long)ip->i_no_formal_ino, 266 (unsigned long long)ip->i_no_formal_ino,
267 (unsigned long long)ip->i_no_addr, 267 (unsigned long long)ip->i_no_addr,
268 IF2DT(ip->i_inode.i_mode), ip->i_flags, 268 IF2DT(ip->i_inode.i_mode), ip->i_flags,
269 (unsigned int)ip->i_diskflags, 269 (unsigned int)ip->i_diskflags,
270 (unsigned long long)ip->i_inode.i_size, 270 (unsigned long long)i_size_read(&ip->i_inode));
271 (unsigned long long)ip->i_disksize);
272 return 0; 271 return 0;
273} 272}
274 273
@@ -326,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
326 325
327 if (gl->gl_state != LM_ST_UNLOCKED && 326 if (gl->gl_state != LM_ST_UNLOCKED &&
328 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
329 flush_workqueue(gfs2_delete_workqueue);
330 gfs2_meta_syncfs(sdp); 328 gfs2_meta_syncfs(sdp);
331 gfs2_log_shutdown(sdp); 329 gfs2_log_shutdown(sdp);
332 } 330 }
@@ -453,7 +451,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
453 [LM_TYPE_META] = &gfs2_meta_glops, 451 [LM_TYPE_META] = &gfs2_meta_glops,
454 [LM_TYPE_INODE] = &gfs2_inode_glops, 452 [LM_TYPE_INODE] = &gfs2_inode_glops,
455 [LM_TYPE_RGRP] = &gfs2_rgrp_glops, 453 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
456 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
457 [LM_TYPE_IOPEN] = &gfs2_iopen_glops, 454 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
458 [LM_TYPE_FLOCK] = &gfs2_flock_glops, 455 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
459 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, 456 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..a79790c06275 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kobject.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
@@ -196,6 +197,7 @@ enum {
196 GLF_REPLY_PENDING = 9, 197 GLF_REPLY_PENDING = 9,
197 GLF_INITIAL = 10, 198 GLF_INITIAL = 10,
198 GLF_FROZEN = 11, 199 GLF_FROZEN = 11,
200 GLF_QUEUED = 12,
199}; 201};
200 202
201struct gfs2_glock { 203struct gfs2_glock {
@@ -206,12 +208,14 @@ struct gfs2_glock {
206 208
207 spinlock_t gl_spin; 209 spinlock_t gl_spin;
208 210
209 unsigned int gl_state; 211 /* State fields protected by gl_spin */
210 unsigned int gl_target; 212 unsigned int gl_state:2, /* Current state */
211 unsigned int gl_reply; 213 gl_target:2, /* Target state */
214 gl_demote_state:2, /* State requested by remote node */
215 gl_req:2, /* State in last dlm request */
216 gl_reply:8; /* Last reply from the dlm */
217
212 unsigned int gl_hash; 218 unsigned int gl_hash;
213 unsigned int gl_req;
214 unsigned int gl_demote_state; /* state requested by remote node */
215 unsigned long gl_demote_time; /* time of first demote request */ 219 unsigned long gl_demote_time; /* time of first demote request */
216 struct list_head gl_holders; 220 struct list_head gl_holders;
217 221
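
Packing gl_state, gl_target, gl_demote_state, gl_req and gl_reply into adjacent bitfields shrinks struct gfs2_glock, but every store to one of them is a read-modify-write of storage shared with the others, which is only safe because all writers hold gl_spin (the glock.c hunk above makes the same point for gl_reply). A userspace illustration of the packing, with the field widths from the patch (the exact layout is compiler-dependent):

#include <stdio.h>

struct glock_state {
	unsigned int state:2;		/* current state */
	unsigned int target:2;		/* target state */
	unsigned int demote_state:2;	/* state requested by remote node */
	unsigned int req:2;		/* state in last dlm request */
	unsigned int reply:8;		/* last reply from the dlm */
};

int main(void)
{
	struct glock_state gs = { .state = 3, .reply = 0xab };

	/* All five fields share one 32-bit unit here, so writing gs.req
	 * rewrites the word that also holds gs.reply: hence the lock. */
	printf("sizeof = %zu, state=%u reply=0x%x\n",
	       sizeof(gs), gs.state, gs.reply);
	return 0;
}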
@@ -267,7 +271,6 @@ struct gfs2_inode {
267 u64 i_no_formal_ino; 271 u64 i_no_formal_ino;
268 u64 i_generation; 272 u64 i_generation;
269 u64 i_eattr; 273 u64 i_eattr;
270 loff_t i_disksize;
271 unsigned long i_flags; /* GIF_... */ 274 unsigned long i_flags; /* GIF_... */
272 struct gfs2_glock *i_gl; /* Move into i_gh? */ 275 struct gfs2_glock *i_gl; /* Move into i_gh? */
273 struct gfs2_holder i_iopen_gh; 276 struct gfs2_holder i_iopen_gh;
@@ -416,11 +419,8 @@ struct gfs2_args {
416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 419 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 420 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
418 unsigned int ar_spectator:1; /* Don't get a journal */ 421 unsigned int ar_spectator:1; /* Don't get a journal */
419 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
420 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ 422 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
421 unsigned int ar_localcaching:1; /* Local caching */
422 unsigned int ar_debug:1; /* Oops on errors */ 423 unsigned int ar_debug:1; /* Oops on errors */
423 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
424 unsigned int ar_posix_acl:1; /* Enable posix acls */ 424 unsigned int ar_posix_acl:1; /* Enable posix acls */
425 unsigned int ar_quota:2; /* off/account/on */ 425 unsigned int ar_quota:2; /* off/account/on */
426 unsigned int ar_suiddir:1; /* suiddir support */ 426 unsigned int ar_suiddir:1; /* suiddir support */
@@ -497,7 +497,7 @@ struct gfs2_sb_host {
497 */ 497 */
498 498
499struct lm_lockstruct { 499struct lm_lockstruct {
500 unsigned int ls_jid; 500 int ls_jid;
501 unsigned int ls_first; 501 unsigned int ls_first;
502 unsigned int ls_first_done; 502 unsigned int ls_first_done;
503 unsigned int ls_nodir; 503 unsigned int ls_nodir;
@@ -572,6 +572,7 @@ struct gfs2_sbd {
572 struct list_head sd_rindex_mru_list; 572 struct list_head sd_rindex_mru_list;
573 struct gfs2_rgrpd *sd_rindex_forward; 573 struct gfs2_rgrpd *sd_rindex_forward;
574 unsigned int sd_rgrps; 574 unsigned int sd_rgrps;
575 unsigned int sd_max_rg_data;
575 576
576 /* Journal index stuff */ 577 /* Journal index stuff */
577 578
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,60 +73,15 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74} 74}
75 75
76struct gfs2_skip_data {
77 u64 no_addr;
78 int skipped;
79};
80
81static int iget_skip_test(struct inode *inode, void *opaque)
82{
83 struct gfs2_inode *ip = GFS2_I(inode);
84 struct gfs2_skip_data *data = opaque;
85
86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1;
89 return 0;
90 }
91 return 1;
92 }
93 return 0;
94}
95
96static int iget_skip_set(struct inode *inode, void *opaque)
97{
98 struct gfs2_inode *ip = GFS2_I(inode);
99 struct gfs2_skip_data *data = opaque;
100
101 if (data->skipped)
102 return 1;
103 inode->i_ino = (unsigned long)(data->no_addr);
104 ip->i_no_addr = data->no_addr;
105 return 0;
106}
107
108static struct inode *gfs2_iget_skip(struct super_block *sb,
109 u64 no_addr)
110{
111 struct gfs2_skip_data data;
112 unsigned long hash = (unsigned long)no_addr;
113
114 data.no_addr = no_addr;
115 data.skipped = 0;
116 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117}
118
119/** 76/**
120 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * gfs2_set_iop - Sets inode operations
121 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * @inode: The inode with correct i_mode filled in
122 * with NFS code path since its get_dentry routine doesn't have the relevant
123 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
124 * segment inside gfs2_inode_lookup code needs to get moved around.
125 * 79 *
126 * Clears I_NEW as well. 80 * GFS2 lookup code fills in vfs inode contents based on info obtained
127 **/ 81 * from directory entry inside gfs2_inode_lookup().
82 */
128 83
129void gfs2_set_iop(struct inode *inode) 84static void gfs2_set_iop(struct inode *inode)
130{ 85{
131 struct gfs2_sbd *sdp = GFS2_SB(inode); 86 struct gfs2_sbd *sdp = GFS2_SB(inode);
132 umode_t mode = inode->i_mode; 87 umode_t mode = inode->i_mode;
@@ -149,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
149 inode->i_op = &gfs2_file_iops; 104 inode->i_op = &gfs2_file_iops;
150 init_special_inode(inode, inode->i_mode, inode->i_rdev); 105 init_special_inode(inode, inode->i_mode, inode->i_rdev);
151 } 106 }
152
153 unlock_new_inode(inode);
154} 107}
155 108
156/** 109/**
@@ -162,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
162 * Returns: A VFS inode, or an error 115 * Returns: A VFS inode, or an error
163 */ 116 */
164 117
165struct inode *gfs2_inode_lookup(struct super_block *sb, 118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
166 unsigned int type, 119 u64 no_addr, u64 no_formal_ino)
167 u64 no_addr,
168 u64 no_formal_ino)
169{ 120{
170 struct inode *inode; 121 struct inode *inode;
171 struct gfs2_inode *ip; 122 struct gfs2_inode *ip;
@@ -195,141 +146,80 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
195 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 146 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
196 if (unlikely(error)) 147 if (unlikely(error))
197 goto fail_iopen; 148 goto fail_iopen;
198 ip->i_iopen_gh.gh_gl->gl_object = ip;
199 149
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
200 gfs2_glock_put(io_gl); 151 gfs2_glock_put(io_gl);
201 io_gl = NULL; 152 io_gl = NULL;
202 153
203 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
204 goto gfs2_nfsbypass;
205
206 inode->i_mode = DT2IF(type);
207
208 /*
209 * We must read the inode in order to work out its type in
210 * this case. Note that this doesn't happen often as we normally
211 * know the type beforehand. This code path only occurs during
212 * unlinked inode recovery (where it is safe to do this glock,
213 * which is not true in the general case).
214 */
215 if (type == DT_UNKNOWN) { 154 if (type == DT_UNKNOWN) {
216 struct gfs2_holder gh; 155 /* Inode glock must be locked already */
217 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 156 error = gfs2_inode_refresh(GFS2_I(inode));
218 if (unlikely(error)) 157 if (error)
219 goto fail_glock; 158 goto fail_refresh;
220 /* Inode is now uptodate */ 159 } else {
221 gfs2_glock_dq_uninit(&gh); 160 inode->i_mode = DT2IF(type);
222 } 161 }
223 162
224 gfs2_set_iop(inode); 163 gfs2_set_iop(inode);
164 unlock_new_inode(inode);
225 } 165 }
226 166
227gfs2_nfsbypass:
228 return inode; 167 return inode;
229fail_glock: 168
230 gfs2_glock_dq(&ip->i_iopen_gh); 169fail_refresh:
170 ip->i_iopen_gh.gh_gl->gl_object = NULL;
171 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
231fail_iopen: 172fail_iopen:
232 if (io_gl) 173 if (io_gl)
233 gfs2_glock_put(io_gl); 174 gfs2_glock_put(io_gl);
234fail_put: 175fail_put:
235 if (inode->i_state & I_NEW)
236 ip->i_gl->gl_object = NULL;
176 ip->i_gl->gl_object = NULL;
237 gfs2_glock_put(ip->i_gl); 177 gfs2_glock_put(ip->i_gl);
238fail: 178fail:
239 if (inode->i_state & I_NEW)
240 iget_failed(inode);
241 else
242 iput(inode);
179 iget_failed(inode);
243 return ERR_PTR(error); 180 return ERR_PTR(error);
244} 181}
245 182
246/**
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
183struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
184 u64 *no_formal_ino, unsigned int blktype)
258{ 185{
259 struct gfs2_sbd *sdp;
260 struct gfs2_inode *ip;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode;
186 struct super_block *sb = sdp->sd_vfs;
187 struct gfs2_holder i_gh;
188 struct inode *inode;
189 int error;
265 190
266 inode = gfs2_iget_skip(sb, no_addr);
267
268 if (!inode)
269 return;
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
191 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
192 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
193 if (error)
194 return ERR_PTR(error);
280 195
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
282 if (unlikely(error))
283 goto fail;
284 ip->i_gl->gl_object = ip;
285
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287 if (unlikely(error))
288 goto fail_put;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295
296 ip->i_iopen_gh.gh_gl->gl_object = ip;
297 gfs2_glock_put(io_gl);
298 io_gl = NULL;
196 error = gfs2_check_blk_type(sdp, no_addr, blktype);
197 if (error)
198 goto fail;
299 199
300 inode->i_mode = DT2IF(DT_UNKNOWN);
200 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
201 if (IS_ERR(inode))
202 goto fail;
301 203
302 /*
303 * We must read the inode in order to work out its type in
304 * this case. Note that this doesn't happen often as we normally
305 * know the type beforehand. This code path only occurs during
306 * unlinked inode recovery (where it is safe to do this glock,
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
204 /* Two extra checks for NFS only */
205 if (no_formal_ino) {
206 error = -ESTALE;
207 if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
208 goto fail_iput;
313 209
314 /* Inode is now uptodate */
315 gfs2_glock_dq_uninit(&gh);
316 gfs2_set_iop(inode);
210 error = -EIO;
211 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
212 goto fail_iput;
317 213
318 /* The iput will cause it to be deleted. */
319 iput(inode);
320 return;
214 error = 0;
215 }
321 216
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
327fail_put:
328 ip->i_gl->gl_object = NULL;
329 gfs2_glock_put(ip->i_gl);
330fail: 217fail:
331 iget_failed(inode);
332 return;
218 gfs2_glock_dq_uninit(&i_gh);
219 return error ? ERR_PTR(error) : inode;
220fail_iput:
221 iput(inode);
222 goto fail;
333} 223}
334 224
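
Note: gfs2_lookup_by_inum() above replaces gfs2_process_unlinked_inode() and is what NFS file-handle resolution now goes through: take the inode glock shared, verify the block type, look the inode up, then apply two NFS-only checks so a stale handle gets -ESTALE rather than the wrong inode. A minimal userspace sketch of that verification order follows; the struct and names are stand-ins, not kernel definitions.

#include <errno.h>
#include <stdint.h>

struct ino_stub {
        uint64_t formal_ino;    /* generation-like number from the dinode */
        int      is_system;     /* analogue of GFS2_DIF_SYSTEM */
};

/* Returns 0 if an NFS handle may be resolved to this inode. */
static int verify_handle(const struct ino_stub *ino,
                         const uint64_t *wanted_formal_ino)
{
        if (!wanted_formal_ino)
                return 0;               /* not an NFS lookup */
        if (ino->formal_ino != *wanted_formal_ino)
                return -ESTALE;         /* the handle outlived the inode */
        if (ino->is_system)
                return -EIO;            /* never export internal system files */
        return 0;
}
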
335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 225static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -359,8 +249,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
359 * to do that. 249 * to do that.
360 */ 250 */
361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 251 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
362 ip->i_disksize = be64_to_cpu(str->di_size);
363 i_size_write(&ip->i_inode, ip->i_disksize);
252 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
364 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 253 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
365 atime.tv_sec = be64_to_cpu(str->di_atime); 254 atime.tv_sec = be64_to_cpu(str->di_atime);
366 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 255 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
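
Note: this hunk is part of removing the GFS2-private i_disksize copy of the file size; the VFS i_size is now the only copy, written with i_size_write() and read with i_size_read(). Those helpers exist because a 64-bit size cannot be loaded atomically on 32-bit SMP. A simplified userspace model of the seqcount idea behind them, for illustration only (not the kernel implementation):

#include <stdint.h>

struct isize { volatile uint32_t seq; uint64_t size; };

static uint64_t isize_read(struct isize *s)
{
        uint32_t seq;
        uint64_t v;
        do {
                seq = s->seq;           /* odd = writer in progress */
                v = s->size;
        } while ((seq & 1) || seq != s->seq);   /* retry on torn read */
        return v;
}

static void isize_write(struct isize *s, uint64_t v)
{
        s->seq++;                       /* enter write section (seq odd) */
        s->size = v;
        s->seq++;                       /* leave it (seq even again) */
}
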
@@ -592,7 +481,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
592 } 481 }
593 482
594 if (!is_root) { 483 if (!is_root) {
595 error = gfs2_permission(dir, MAY_EXEC); 484 error = gfs2_permission(dir, MAY_EXEC, 0);
596 if (error) 485 if (error)
597 goto out; 486 goto out;
598 } 487 }
@@ -622,7 +511,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
622{ 511{
623 int error; 512 int error;
624 513
625 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 514 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
626 if (error) 515 if (error)
627 return error; 516 return error;
628 517
@@ -999,17 +888,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
999 if (error) 888 if (error)
1000 return error; 889 return error;
1001 890
1002 if ((attr->ia_valid & ATTR_SIZE) &&
1003 attr->ia_size != i_size_read(inode)) {
1004 error = vmtruncate(inode, attr->ia_size);
1005 if (error)
1006 return error;
1007 }
1008
1009 setattr_copy(inode, attr); 891 setattr_copy(inode, attr);
1010 mark_inode_dirty(inode); 892 mark_inode_dirty(inode);
1011
1012 gfs2_assert_warn(GFS2_SB(inode), !error);
1013 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 893 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1014 gfs2_dinode_out(ip, dibh->b_data); 894 gfs2_dinode_out(ip, dibh->b_data);
1015 brelse(dibh); 895 brelse(dibh);
@@ -1055,7 +935,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1055 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 935 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1056 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 936 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1057 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 937 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1058 str->di_size = cpu_to_be64(ip->i_disksize); 938 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
1059 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 939 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1060 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 940 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1061 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 941 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +965,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1085 (unsigned long long)ip->i_no_formal_ino); 965 (unsigned long long)ip->i_no_formal_ino);
1086 printk(KERN_INFO " no_addr = %llu\n", 966 printk(KERN_INFO " no_addr = %llu\n",
1087 (unsigned long long)ip->i_no_addr); 967 (unsigned long long)ip->i_no_addr);
1088 printk(KERN_INFO " i_disksize = %llu\n", 968 printk(KERN_INFO " i_size = %llu\n",
1089 (unsigned long long)ip->i_disksize); 969 (unsigned long long)i_size_read(&ip->i_inode));
1090 printk(KERN_INFO " blocks = %llu\n", 970 printk(KERN_INFO " blocks = %llu\n",
1091 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 971 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1092 printk(KERN_INFO " i_goal = %llu\n", 972 printk(KERN_INFO " i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip, 19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state, 20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size); 21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
23 unsigned int from, unsigned int to);
22extern void gfs2_set_aops(struct inode *inode); 24extern void gfs2_set_aops(struct inode *inode);
23 25
24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 26static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,11 +82,25 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
80 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); 82 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
81} 83}
82 84
85static inline int gfs2_check_internal_file_size(struct inode *inode,
86 u64 minsize, u64 maxsize)
87{
88 u64 size = i_size_read(inode);
89 if (size < minsize || size > maxsize)
90 goto err;
91 if (size & ((1 << inode->i_blkbits) - 1))
92 goto err;
93 return 0;
94err:
95 gfs2_consist_inode(GFS2_I(inode));
96 return -EIO;
97}
83 98
84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino);
87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); 101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
102 u64 *no_formal_ino,
103 unsigned int blktype);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 104extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 105
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 106extern int gfs2_inode_refresh(struct gfs2_inode *ip);
@@ -96,7 +112,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
96extern struct inode *gfs2_createi(struct gfs2_holder *ghs, 112extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
97 const struct qstr *name, 113 const struct qstr *name,
98 unsigned int mode, dev_t dev); 114 unsigned int mode, dev_t dev);
99extern int gfs2_permission(struct inode *inode, int mask); 115extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
100extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 116extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
101extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 117extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
102extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 118extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
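
Note: the new gfs2_check_internal_file_size() inline folds the repeated "sane size for an internal file" checks into one place: the size must lie in [minsize, maxsize] and be a multiple of the block size (the (1 << i_blkbits) - 1 mask). Typical use, mirroring the gfs2_quota_init() hunk later in this diff:

/* Reject a quota-change file that is empty, larger than 64MB,
 * or not a multiple of the filesystem block size. */
if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
        return -EIO;
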
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
42 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
43 goto out; 43 goto out;
44 case -EAGAIN: /* Try lock fails */ 44 case -EAGAIN: /* Try lock fails */
45 case -EDEADLK: /* Deadlock detected */
45 goto out; 46 goto out;
46 case -EINVAL: /* Invalid */ 47 case -ETIMEDOUT: /* Canceled due to timeout */
47 case -ENOMEM: /* Out of memory */
48 ret |= LM_OUT_ERROR; 48 ret |= LM_OUT_ERROR;
49 goto out; 49 goto out;
50 case 0: /* Success */ 50 case 0: /* Success */
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
146 return lkf; 146 return lkf;
147} 147}
148 148
149static unsigned int gdlm_lock(struct gfs2_glock *gl, 149static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
150 unsigned int req_state, unsigned int flags) 150 unsigned int flags)
151{ 151{
152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
153 int error;
154 int req; 153 int req;
155 u32 lkf; 154 u32 lkf;
156 155
157 gl->gl_req = req_state;
158 req = make_mode(req_state); 156 req = make_mode(req_state);
159 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); 157 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
160 158
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
162 * Submit the actual lock request. 160 * Submit the actual lock request.
163 */ 161 */
164 162
165 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, 163 return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
166 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
167 if (error == -EAGAIN)
168 return 0;
169 if (error)
170 return LM_OUT_ERROR;
171 return LM_OUT_ASYNC;
172} 165}
173 166
174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
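
Note: gdlm_lock() now propagates dlm_lock()'s return value directly. A nonzero return means the request was never submitted; on success the grant, failure, or cancel is always delivered through gdlm_ast(), so the old LM_OUT_ASYNC/LM_OUT_ERROR translation layer becomes dead weight. Sketch of a caller under the new convention (the error handler is a hypothetical placeholder, not a GFS2 function):

int error = gdlm_lock(gl, LM_ST_EXCLUSIVE, 0);
if (error)
        handle_submit_failure(gl, error);   /* hypothetical helper */
/* else: the result arrives asynchronously via gdlm_ast() */
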
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ac750bd31a6f..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
592 lh->lh_hash = cpu_to_be32(hash); 592 lh->lh_hash = cpu_to_be32(hash);
593 593
594 bh->b_end_io = end_buffer_write_sync; 594 bh->b_end_io = end_buffer_write_sync;
595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
596 goto skip_barrier;
597 get_bh(bh); 595 get_bh(bh);
598 submit_bh(WRITE_BARRIER | REQ_META, bh);
599 wait_on_buffer(bh);
600 if (buffer_eopnotsupp(bh)) {
601 clear_buffer_eopnotsupp(bh);
602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
605 lock_buffer(bh);
606skip_barrier:
607 get_bh(bh);
608 submit_bh(WRITE_SYNC | REQ_META, bh);
609 wait_on_buffer(bh);
610 }
596 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
597 submit_bh(WRITE_SYNC | REQ_META, bh);
598 else
599 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
600 wait_on_buffer(bh);
601
611 if (!buffer_uptodate(bh)) 602 if (!buffer_uptodate(bh))
612 gfs2_io_error_bh(sdp, bh); 603 gfs2_io_error_bh(sdp, bh);
613 brelse(bh); 604 brelse(bh);
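
Note: the removed block was the old "try a barrier, fall back on -EOPNOTSUPP" dance. With the 2.6.37 block layer, flush/FUA requests degrade transparently on devices that cannot honour them, so the log header is now written exactly once. For reference, assuming the 2.6.37 definitions:

/* WRITE_FLUSH_FUA == WRITE_SYNC | REQ_FLUSH | REQ_FUA
 * REQ_FLUSH: flush the device write cache before this request.
 * REQ_FUA:   the write itself reaches stable media before completion.
 * Devices lacking either capability complete the request anyway, so
 * the buffer_eopnotsupp() retry above is no longer needed. */
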
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..ebef7ab6e17e 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
24#include "glock.h" 24#include "glock.h"
25#include "quota.h" 25#include "quota.h"
26#include "recovery.h" 26#include "recovery.h"
27#include "dir.h"
27 28
28static struct shrinker qd_shrinker = { 29static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 30 .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
78{ 79{
79 int error; 80 int error;
80 81
82 gfs2_str2qstr(&gfs2_qdot, ".");
83 gfs2_str2qstr(&gfs2_qdotdot, "..");
84
81 error = gfs2_sys_init(); 85 error = gfs2_sys_init();
82 if (error) 86 if (error)
83 return error; 87 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
140 144
141 error = -ENOMEM; 145 error = -ENOMEM;
142 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 WQ_NON_REENTRANT | WQ_RESCUER, 0); 147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
144 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
145 goto fail_wq; 149 goto fail_wq;
146 150
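
Note: WQ_RESCUER was folded into WQ_MEM_RECLAIM, which guarantees the recovery workqueue can make forward progress under memory pressure via a dedicated rescuer thread; WQ_FREEZEABLE parks it across suspend. The equivalent call, with max_active 0 meaning the default concurrency limit:

struct workqueue_struct *wq;

wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
if (!wq)
        return -ENOMEM;     /* caller unwinds, as in the hunk above */
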
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
55 * activity, but those code paths have their own higher-level 55 * activity, but those code paths have their own higher-level
56 * throttling. 56 * throttling.
57 */ 57 */
58 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 58 if (wbc->sync_mode != WB_SYNC_NONE) {
59 lock_buffer(bh); 59 lock_buffer(bh);
60 } else if (!trylock_buffer(bh)) { 60 } else if (!trylock_buffer(bh)) {
61 redirty_page_for_writepage(wbc, page); 61 redirty_page_for_writepage(wbc, page);
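
Note: wbc->nonblocking was removed from the writeback path in this kernel series, so WB_SYNC_NONE alone now marks best-effort writeback. The surviving idiom: block on the buffer lock only when the writeback must complete, otherwise skip and let a later pass retry.

if (wbc->sync_mode != WB_SYNC_NONE) {
        lock_buffer(bh);                /* integrity writeback: must wait */
} else if (!trylock_buffer(bh)) {
        redirty_page_for_writepage(wbc, page);
        /* page stays dirty; a later WB_SYNC_ALL pass will catch it */
}
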
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..777927ce6f79 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
38#define DO 0 38#define DO 0
39#define UNDO 1 39#define UNDO 1
40 40
41static const u32 gfs2_old_fs_formats[] = {
42 0
43};
44
45static const u32 gfs2_old_multihost_formats[] = {
46 0
47};
48
49/** 41/**
50 * gfs2_tune_init - Fill a gfs2_tune structure with default values 42 * gfs2_tune_init - Fill a gfs2_tune structure with default values
51 * @gt: tune 43 * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
135 127
136static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 128static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
137{ 129{
138 unsigned int x;
139
140 if (sb->sb_magic != GFS2_MAGIC || 130 if (sb->sb_magic != GFS2_MAGIC ||
141 sb->sb_type != GFS2_METATYPE_SB) { 131 sb->sb_type != GFS2_METATYPE_SB) {
142 if (!silent) 132 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
150 sb->sb_multihost_format == GFS2_FORMAT_MULTI) 140 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
151 return 0; 141 return 0;
152 142
153 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
143 fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
154 for (x = 0; gfs2_old_fs_formats[x]; x++)
155 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
156 break;
157
158 if (!gfs2_old_fs_formats[x]) {
159 printk(KERN_WARNING
160 "GFS2: code version (%u, %u) is incompatible "
161 "with ondisk format (%u, %u)\n",
162 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
163 sb->sb_fs_format, sb->sb_multihost_format);
164 printk(KERN_WARNING
165 "GFS2: I don't know how to upgrade this FS\n");
166 return -EINVAL;
167 }
168 }
169 144
170 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
145 return -EINVAL;
171 for (x = 0; gfs2_old_multihost_formats[x]; x++)
172 if (gfs2_old_multihost_formats[x] ==
173 sb->sb_multihost_format)
174 break;
175
176 if (!gfs2_old_multihost_formats[x]) {
177 printk(KERN_WARNING
178 "GFS2: code version (%u, %u) is incompatible "
179 "with ondisk format (%u, %u)\n",
180 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
181 sb->sb_fs_format, sb->sb_multihost_format);
182 printk(KERN_WARNING
183 "GFS2: I don't know how to upgrade this FS\n");
184 return -EINVAL;
185 }
186 }
187
188 if (!sdp->sd_args.ar_upgrade) {
189 printk(KERN_WARNING
190 "GFS2: code version (%u, %u) is incompatible "
191 "with ondisk format (%u, %u)\n",
192 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
193 sb->sb_fs_format, sb->sb_multihost_format);
194 printk(KERN_INFO
195 "GFS2: Use the \"upgrade\" mount option to upgrade "
196 "the FS\n");
197 printk(KERN_INFO "GFS2: See the manual for more details\n");
198 return -EINVAL;
199 }
200
201 return 0;
202} 146}
203 147
204static void end_bio_io_page(struct bio *bio, int error) 148static void end_bio_io_page(struct bio *bio, int error)
@@ -496,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
496 iput(inode); 440 iput(inode);
497 return -ENOMEM; 441 return -ENOMEM;
498 } 442 }
499 dentry->d_op = &gfs2_dops;
500 *dptr = dentry; 443 *dptr = dentry;
501 return 0; 444 return 0;
502} 445}
@@ -586,7 +529,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
586 529
587 prev_db = 0; 530 prev_db = 0;
588 531
589 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { 532 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
590 bh.b_state = 0; 533 bh.b_state = 0;
591 bh.b_blocknr = 0; 534 bh.b_blocknr = 0;
592 bh.b_size = 1 << ip->i_inode.i_blkbits; 535 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +965,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1022 if (!strcmp("lock_nolock", proto)) { 965 if (!strcmp("lock_nolock", proto)) {
1023 lm = &nolock_ops; 966 lm = &nolock_ops;
1024 sdp->sd_args.ar_localflocks = 1; 967 sdp->sd_args.ar_localflocks = 1;
1025 sdp->sd_args.ar_localcaching = 1;
1026#ifdef CONFIG_GFS2_FS_LOCKING_DLM 968#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1027 } else if (!strcmp("lock_dlm", proto)) { 969 } else if (!strcmp("lock_dlm", proto)) {
1028 lm = &gfs2_dlm_ops; 970 lm = &gfs2_dlm_ops;
@@ -1113,8 +1055,6 @@ static int gfs2_journalid_wait(void *word)
1113 1055
1114static int wait_on_journal(struct gfs2_sbd *sdp) 1056static int wait_on_journal(struct gfs2_sbd *sdp)
1115{ 1057{
1116 if (sdp->sd_args.ar_spectator)
1117 return 0;
1118 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1058 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1119 return 0; 1059 return 0;
1120 1060
@@ -1165,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1165 1105
1166 sb->s_magic = GFS2_MAGIC; 1106 sb->s_magic = GFS2_MAGIC;
1167 sb->s_op = &gfs2_super_ops; 1107 sb->s_op = &gfs2_super_ops;
1108 sb->s_d_op = &gfs2_dops;
1168 sb->s_export_op = &gfs2_export_ops; 1109 sb->s_export_op = &gfs2_export_ops;
1169 sb->s_xattr = gfs2_xattr_handlers; 1110 sb->s_xattr = gfs2_xattr_handlers;
1170 sb->s_qcop = &gfs2_quotactl_ops; 1111 sb->s_qcop = &gfs2_quotactl_ops;
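
Note: sb->s_d_op is new VFS infrastructure: set once at mount time, it is applied to every dentry the VFS allocates for this superblock. That is why the per-lookup assignments in gfs2_lookup() and gfs2_lookup_root() are deleted elsewhere in this diff.

/* One line at mount time... */
sb->s_d_op = &gfs2_dops;
/* ...replaces every per-dentry assignment of the form
 *      dentry->d_op = &gfs2_dops;
 * which is now dead code. */
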
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1217 if (error) 1158 if (error)
1218 goto fail_sb; 1159 goto fail_sb;
1219 1160
1161 /*
1162 * If user space has failed to join the cluster or some similar
1163 * failure has occurred, then the journal id will contain a
1164 * negative (error) number. This will then be returned to the
1165 * caller (of the mount syscall). We do this even for spectator
1166 * mounts (which just write a jid of 0 to indicate "ok" even though
1167 * the jid is unused in the spectator case)
1168 */
1169 if (sdp->sd_lockstruct.ls_jid < 0) {
1170 error = sdp->sd_lockstruct.ls_jid;
1171 sdp->sd_lockstruct.ls_jid = 0;
1172 goto fail_sb;
1173 }
1174
1220 error = init_inodes(sdp, DO); 1175 error = init_inodes(sdp, DO);
1221 if (error) 1176 if (error)
1222 goto fail_sb; 1177 goto fail_sb;
@@ -1264,7 +1219,6 @@ fail_sb:
1264fail_locking: 1219fail_locking:
1265 init_locking(sdp, &mount_gh, UNDO); 1220 init_locking(sdp, &mount_gh, UNDO);
1266fail_lm: 1221fail_lm:
1267 invalidate_inodes(sb);
1268 gfs2_gl_hash_clear(sdp); 1222 gfs2_gl_hash_clear(sdp);
1269 gfs2_lm_unmount(sdp); 1223 gfs2_lm_unmount(sdp);
1270fail_sys: 1224fail_sys:
@@ -1296,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1296} 1250}
1297 1251
1298/** 1252/**
1299 * gfs2_get_sb - Get the GFS2 superblock 1253 * gfs2_mount - Get the GFS2 superblock
1300 * @fs_type: The GFS2 filesystem type 1254 * @fs_type: The GFS2 filesystem type
1301 * @flags: Mount flags 1255 * @flags: Mount flags
1302 * @dev_name: The name of the device 1256 * @dev_name: The name of the device
1303 * @data: The mount arguments 1257 * @data: The mount arguments
1304 * @mnt: The vfsmnt for this mount
1305 * 1258 *
1306 * Q. Why not use get_sb_bdev() ? 1259 * Q. Why not use get_sb_bdev() ?
1307 * A. We need to select one of two root directories to mount, independent 1260 * A. We need to select one of two root directories to mount, independent
@@ -1310,12 +1263,12 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1310 * Returns: 0 or -ve on error 1263 * Returns: 0 or -ve on error
1311 */ 1264 */
1312 1265
1313static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1266static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1314 const char *dev_name, void *data, struct vfsmount *mnt) 1267 const char *dev_name, void *data)
1315{ 1268{
1316 struct block_device *bdev; 1269 struct block_device *bdev;
1317 struct super_block *s; 1270 struct super_block *s;
1318 fmode_t mode = FMODE_READ; 1271 fmode_t mode = FMODE_READ | FMODE_EXCL;
1319 int error; 1272 int error;
1320 struct gfs2_args args; 1273 struct gfs2_args args;
1321 struct gfs2_sbd *sdp; 1274 struct gfs2_sbd *sdp;
@@ -1323,9 +1276,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1323 if (!(flags & MS_RDONLY)) 1276 if (!(flags & MS_RDONLY))
1324 mode |= FMODE_WRITE; 1277 mode |= FMODE_WRITE;
1325 1278
1326 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1327 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1328 return PTR_ERR(bdev); 1281 return ERR_CAST(bdev);
1329 1282
1330 /* 1283 /*
1331 * once the super is inserted into the list by sget, s_umount 1284 * once the super is inserted into the list by sget, s_umount
@@ -1344,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1344 if (IS_ERR(s)) 1297 if (IS_ERR(s))
1345 goto error_bdev; 1298 goto error_bdev;
1346 1299
1300 if (s->s_root)
1301 blkdev_put(bdev, mode);
1302
1347 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1348 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
1349 args.ar_data = GFS2_DATA_DEFAULT; 1305 args.ar_data = GFS2_DATA_DEFAULT;
@@ -1355,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1355 error = gfs2_mount_args(&args, data); 1311 error = gfs2_mount_args(&args, data);
1356 if (error) { 1312 if (error) {
1357 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1313 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1358 if (s->s_root)
1314 goto error_super;
1359 goto error_super;
1360 deactivate_locked_super(s);
1361 return error;
1362 } 1315 }
1363 1316
1364 if (s->s_root) { 1317 if (s->s_root) {
1365 error = -EBUSY; 1318 error = -EBUSY;
1366 if ((flags ^ s->s_flags) & MS_RDONLY) 1319 if ((flags ^ s->s_flags) & MS_RDONLY)
1367 goto error_super; 1320 goto error_super;
1368 close_bdev_exclusive(bdev, mode);
1369 } else { 1321 } else {
1370 char b[BDEVNAME_SIZE]; 1322 char b[BDEVNAME_SIZE];
1371 1323
@@ -1374,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1374 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 1326 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1375 sb_set_blocksize(s, block_size(bdev)); 1327 sb_set_blocksize(s, block_size(bdev));
1376 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); 1328 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
1377 if (error) {
1378 deactivate_locked_super(s);
1379 return error;
1380 }
1329 if (error)
1330 goto error_super;
1381 s->s_flags |= MS_ACTIVE; 1331 s->s_flags |= MS_ACTIVE;
1382 bdev->bd_super = s; 1332 bdev->bd_super = s;
1383 } 1333 }
1384 1334
1385 sdp = s->s_fs_info; 1335 sdp = s->s_fs_info;
1386 mnt->mnt_sb = s;
1387 if (args.ar_meta) 1336 if (args.ar_meta)
1388 mnt->mnt_root = dget(sdp->sd_master_dir); 1337 return dget(sdp->sd_master_dir);
1389 else 1338 else
1390 mnt->mnt_root = dget(sdp->sd_root_dir); 1339 return dget(sdp->sd_root_dir);
1391 return 0;
1392 1340
1393error_super: 1341error_super:
1394 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error);
1395error_bdev: 1344error_bdev:
1396 close_bdev_exclusive(bdev, mode); 1345 blkdev_put(bdev, mode);
1397 return error; 1346 return ERR_PTR(error);
1398} 1347}
1399 1348
1400static int set_meta_super(struct super_block *s, void *ptr) 1349static int set_meta_super(struct super_block *s, void *ptr)
@@ -1402,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
1402 return -EINVAL; 1351 return -EINVAL;
1403} 1352}
1404 1353
1405static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1354static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1406 const char *dev_name, void *data, struct vfsmount *mnt) 1355 int flags, const char *dev_name, void *data)
1407{ 1356{
1408 struct super_block *s; 1357 struct super_block *s;
1409 struct gfs2_sbd *sdp; 1358 struct gfs2_sbd *sdp;
@@ -1414,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1414 if (error) { 1363 if (error) {
1415 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1364 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1416 dev_name, error); 1365 dev_name, error);
1417 return error; 1366 return ERR_PTR(error);
1418 } 1367 }
1419 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, 1368 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
1420 path.dentry->d_inode->i_sb->s_bdev); 1369 path.dentry->d_inode->i_sb->s_bdev);
1421 path_put(&path); 1370 path_put(&path);
1422 if (IS_ERR(s)) { 1371 if (IS_ERR(s)) {
1423 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1372 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1424 return PTR_ERR(s); 1373 return ERR_CAST(s);
1425 } 1374 }
1426 if ((flags ^ s->s_flags) & MS_RDONLY) { 1375 if ((flags ^ s->s_flags) & MS_RDONLY) {
1427 deactivate_locked_super(s); 1376 deactivate_locked_super(s);
1428 return -EBUSY; 1377 return ERR_PTR(-EBUSY);
1429 } 1378 }
1430 sdp = s->s_fs_info; 1379 sdp = s->s_fs_info;
1431 mnt->mnt_sb = s;
1432 mnt->mnt_root = dget(sdp->sd_master_dir);
1433 return 0;
1380 return dget(sdp->sd_master_dir);
1434} 1381}
1435 1382
1436static void gfs2_kill_sb(struct super_block *sb) 1383static void gfs2_kill_sb(struct super_block *sb)
@@ -1456,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
1456struct file_system_type gfs2_fs_type = { 1403struct file_system_type gfs2_fs_type = {
1457 .name = "gfs2", 1404 .name = "gfs2",
1458 .fs_flags = FS_REQUIRES_DEV, 1405 .fs_flags = FS_REQUIRES_DEV,
1459 .get_sb = gfs2_get_sb, 1406 .mount = gfs2_mount,
1460 .kill_sb = gfs2_kill_sb, 1407 .kill_sb = gfs2_kill_sb,
1461 .owner = THIS_MODULE, 1408 .owner = THIS_MODULE,
1462}; 1409};
@@ -1464,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
1464struct file_system_type gfs2meta_fs_type = { 1411struct file_system_type gfs2meta_fs_type = {
1465 .name = "gfs2meta", 1412 .name = "gfs2meta",
1466 .fs_flags = FS_REQUIRES_DEV, 1413 .fs_flags = FS_REQUIRES_DEV,
1467 .get_sb = gfs2_get_sb_meta, 1414 .mount = gfs2_mount_meta,
1468 .owner = THIS_MODULE, 1415 .owner = THIS_MODULE,
1469}; 1416};
1470 1417
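
Note: the .get_sb to .mount conversion changes the contract: instead of filling in a vfsmount, the filesystem returns the root dentry (or an ERR_PTR). GFS2 open-codes the block-device handling because it must choose between two possible roots (sd_root_dir vs sd_master_dir); a filesystem without that need would just wrap mount_bdev(). Sketch, where my_fill_super is a placeholder:

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name,
                                    void *data)
{
        /* mount_bdev() handles blkdev_get_by_path/sget/fill_super */
        return mount_bdev(fs_type, flags, dev_name, data, my_fill_super);
}
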
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -104,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
104{ 104{
105 struct inode *inode = NULL; 105 struct inode *inode = NULL;
106 106
107 dentry->d_op = &gfs2_dops;
108
109 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 107 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
110 if (inode && IS_ERR(inode)) 108 if (inode && IS_ERR(inode))
111 return ERR_CAST(inode); 109 return ERR_CAST(inode);
@@ -164,7 +162,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
164 if (error) 162 if (error)
165 goto out_child; 163 goto out_child;
166 164
167 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 165 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
168 if (error) 166 if (error)
169 goto out_gunlock; 167 goto out_gunlock;
170 168
@@ -217,7 +215,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_gunlock_q; 215 goto out_gunlock_q;
218 216
219 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 217 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
220 al->al_rgd->rd_length + 218 gfs2_rg_blocks(al) +
221 2 * RES_DINODE + RES_STATFS + 219 2 * RES_DINODE + RES_STATFS +
222 RES_QUOTA, 0); 220 RES_QUOTA, 0);
223 if (error) 221 if (error)
@@ -253,7 +251,7 @@ out_parent:
253 gfs2_holder_uninit(ghs); 251 gfs2_holder_uninit(ghs);
254 gfs2_holder_uninit(ghs + 1); 252 gfs2_holder_uninit(ghs + 1);
255 if (!error) { 253 if (!error) {
256 atomic_inc(&inode->i_count); 254 ihold(inode);
257 d_instantiate(dentry, inode); 255 d_instantiate(dentry, inode);
258 mark_inode_dirty(inode); 256 mark_inode_dirty(inode);
259 } 257 }
@@ -287,7 +285,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
287 if (IS_APPEND(&dip->i_inode)) 285 if (IS_APPEND(&dip->i_inode))
288 return -EPERM; 286 return -EPERM;
289 287
290 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 288 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
291 if (error) 289 if (error)
292 return error; 290 return error;
293 291
@@ -406,7 +404,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
406 404
407 ip = ghs[1].gh_gl->gl_object; 405 ip = ghs[1].gh_gl->gl_object;
408 406
409 ip->i_disksize = size;
410 i_size_write(inode, size); 407 i_size_write(inode, size);
411 408
412 error = gfs2_meta_inode_buffer(ip, &dibh); 409 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +458,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
461 ip = ghs[1].gh_gl->gl_object; 458 ip = ghs[1].gh_gl->gl_object;
462 459
463 ip->i_inode.i_nlink = 2; 460 ip->i_inode.i_nlink = 2;
464 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 461 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
465 ip->i_diskflags |= GFS2_DIF_JDATA; 462 ip->i_diskflags |= GFS2_DIF_JDATA;
466 ip->i_entries = 2; 463 ip->i_entries = 2;
467 464
@@ -470,18 +467,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
470 if (!gfs2_assert_withdraw(sdp, !error)) { 467 if (!gfs2_assert_withdraw(sdp, !error)) {
471 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 468 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
472 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); 469 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
473 struct qstr str;
474 470
475 gfs2_str2qstr(&str, ".");
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 471 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); 472 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
478 dent->de_inum = di->di_num; /* already GFS2 endian */ 473 dent->de_inum = di->di_num; /* already GFS2 endian */
479 dent->de_type = cpu_to_be16(DT_DIR); 474 dent->de_type = cpu_to_be16(DT_DIR);
480 di->di_entries = cpu_to_be32(1); 475 di->di_entries = cpu_to_be32(1);
481 476
482 gfs2_str2qstr(&str, "..");
483 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 477 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
484 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 478 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
485 479
486 gfs2_inum_out(dip, dent); 480 gfs2_inum_out(dip, dent);
487 dent->de_type = cpu_to_be16(DT_DIR); 481 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +516,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
522static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 516static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
523 struct gfs2_inode *ip) 517 struct gfs2_inode *ip)
524{ 518{
525 struct qstr dotname;
526 int error; 519 int error;
527 520
528 if (ip->i_entries != 2) { 521 if (ip->i_entries != 2) {
@@ -539,13 +532,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
539 if (error) 532 if (error)
540 return error; 533 return error;
541 534
542 gfs2_str2qstr(&dotname, ".");
543 error = gfs2_dir_del(ip, &dotname);
535 error = gfs2_dir_del(ip, &gfs2_qdot);
544 if (error) 536 if (error)
545 return error; 537 return error;
546 538
547 gfs2_str2qstr(&dotname, "..");
548 error = gfs2_dir_del(ip, &dotname);
539 error = gfs2_dir_del(ip, &gfs2_qdotdot);
549 if (error) 540 if (error)
550 return error; 541 return error;
551 542
@@ -694,11 +685,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
694 struct inode *dir = &to->i_inode; 685 struct inode *dir = &to->i_inode;
695 struct super_block *sb = dir->i_sb; 686 struct super_block *sb = dir->i_sb;
696 struct inode *tmp; 687 struct inode *tmp;
697 struct qstr dotdot;
698 int error = 0; 688 int error = 0;
699 689
700 gfs2_str2qstr(&dotdot, "..");
701
702 igrab(dir); 690 igrab(dir);
703 691
704 for (;;) { 692 for (;;) {
@@ -711,7 +699,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
711 break; 699 break;
712 } 700 }
713 701
714 tmp = gfs2_lookupi(dir, &dotdot, 1); 702 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
715 if (IS_ERR(tmp)) { 703 if (IS_ERR(tmp)) {
716 error = PTR_ERR(tmp); 704 error = PTR_ERR(tmp);
717 break; 705 break;
@@ -744,7 +732,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
744 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 732 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
745 struct gfs2_inode *nip = NULL; 733 struct gfs2_inode *nip = NULL;
746 struct gfs2_sbd *sdp = GFS2_SB(odir); 734 struct gfs2_sbd *sdp = GFS2_SB(odir);
747 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; 735 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
748 struct gfs2_rgrpd *nrgd; 736 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 737 unsigned int num_gh;
750 int dir_rename = 0; 738 int dir_rename = 0;
@@ -758,6 +746,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 return 0; 746 return 0;
759 } 747 }
760 748
749 error = gfs2_rindex_hold(sdp, &ri_gh);
750 if (error)
751 return error;
761 752
762 if (odip != ndip) { 753 if (odip != ndip) {
763 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 754 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -827,7 +818,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
827 } 818 }
828 } 819 }
829 } else { 820 } else {
830 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); 821 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
831 if (error) 822 if (error)
832 goto out_gunlock; 823 goto out_gunlock;
833 824
@@ -862,7 +853,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
862 /* Check out the dir to be renamed */ 853 /* Check out the dir to be renamed */
863 854
864 if (dir_rename) { 855 if (dir_rename) {
865 error = gfs2_permission(odentry->d_inode, MAY_WRITE); 856 error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
866 if (error) 857 if (error)
867 goto out_gunlock; 858 goto out_gunlock;
868 } 859 }
@@ -887,12 +878,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
887 878
888 al->al_requested = sdp->sd_max_dirres; 879 al->al_requested = sdp->sd_max_dirres;
889 880
890 error = gfs2_inplace_reserve(ndip); 881 error = gfs2_inplace_reserve_ri(ndip);
891 if (error) 882 if (error)
892 goto out_gunlock_q; 883 goto out_gunlock_q;
893 884
894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 885 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
895 al->al_rgd->rd_length + 886 gfs2_rg_blocks(al) +
896 4 * RES_DINODE + 4 * RES_LEAF + 887 4 * RES_DINODE + 4 * RES_LEAF +
897 RES_STATFS + RES_QUOTA + 4, 0); 888 RES_STATFS + RES_QUOTA + 4, 0);
898 if (error) 889 if (error)
@@ -920,9 +911,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
920 } 911 }
921 912
922 if (dir_rename) { 913 if (dir_rename) {
923 struct qstr name;
924 gfs2_str2qstr(&name, "..");
925
926 error = gfs2_change_nlink(ndip, +1); 914 error = gfs2_change_nlink(ndip, +1);
927 if (error) 915 if (error)
928 goto out_end_trans; 916 goto out_end_trans;
@@ -930,7 +918,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
930 if (error) 918 if (error)
931 goto out_end_trans; 919 goto out_end_trans;
932 920
933 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); 921 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
934 if (error) 922 if (error)
935 goto out_end_trans; 923 goto out_end_trans;
936 } else { 924 } else {
@@ -972,6 +960,7 @@ out_gunlock_r:
972 if (r_gh.gh_gl) 960 if (r_gh.gh_gl)
973 gfs2_glock_dq_uninit(&r_gh); 961 gfs2_glock_dq_uninit(&r_gh);
974out: 962out:
963 gfs2_glock_dq_uninit(&ri_gh);
975 return error; 964 return error;
976} 965}
977 966
@@ -990,7 +979,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 979 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
991 struct gfs2_holder i_gh; 980 struct gfs2_holder i_gh;
992 struct buffer_head *dibh; 981 struct buffer_head *dibh;
993 unsigned int x; 982 unsigned int x, size;
994 char *buf; 983 char *buf;
995 int error; 984 int error;
996 985
@@ -1002,7 +991,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1002 return NULL; 991 return NULL;
1003 } 992 }
1004 993
1005 if (!ip->i_disksize) {
994 size = (unsigned int)i_size_read(&ip->i_inode);
995 if (size == 0) {
1006 gfs2_consist_inode(ip); 996 gfs2_consist_inode(ip);
1007 buf = ERR_PTR(-EIO); 997 buf = ERR_PTR(-EIO);
1008 goto out; 998 goto out;
@@ -1014,7 +1004,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1014 goto out; 1004 goto out;
1015 } 1005 }
1016 1006
1017 x = ip->i_disksize + 1; 1007 x = size + 1;
1018 buf = kmalloc(x, GFP_NOFS); 1008 buf = kmalloc(x, GFP_NOFS);
1019 if (!buf) 1009 if (!buf)
1020 buf = ERR_PTR(-ENOMEM); 1010 buf = ERR_PTR(-ENOMEM);
@@ -1047,13 +1037,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1047 * Returns: errno 1037 * Returns: errno
1048 */ 1038 */
1049 1039
1050int gfs2_permission(struct inode *inode, int mask) 1040int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1051{ 1041{
1052 struct gfs2_inode *ip = GFS2_I(inode); 1042 struct gfs2_inode *ip;
1053 struct gfs2_holder i_gh; 1043 struct gfs2_holder i_gh;
1054 int error; 1044 int error;
1055 int unlock = 0; 1045 int unlock = 0;
1056 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049
1050 ip = GFS2_I(inode);
1057 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1058 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1059 if (error) 1053 if (error)
@@ -1064,42 +1058,17 @@ int gfs2_permission(struct inode *inode, int mask)
1064 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) 1058 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
1065 error = -EACCES; 1059 error = -EACCES;
1066 else 1060 else
1067 error = generic_permission(inode, mask, gfs2_check_acl); 1061 error = generic_permission(inode, mask, flags, gfs2_check_acl);
1068 if (unlock) 1062 if (unlock)
1069 gfs2_glock_dq_uninit(&i_gh); 1063 gfs2_glock_dq_uninit(&i_gh);
1070 1064
1071 return error; 1065 return error;
1072} 1066}
1073 1067
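
Note: with RCU-walk path lookup, ->permission() can be called with IPERM_FLAG_RCU set, in a context where sleeping is forbidden. gfs2_permission() may need to take a glock, so it follows the standard pattern: return -ECHILD and let the VFS retry the lookup in ref-walk mode, where sleeping is allowed.

/* Standard bail-out for a ->permission() that may sleep: */
if (flags & IPERM_FLAG_RCU)
        return -ECHILD;         /* VFS falls back to ref-walk */
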
1074/*
1075 * XXX(truncate): the truncate_setsize calls should be moved to the end.
1076 */
1077static int setattr_size(struct inode *inode, struct iattr *attr)
1078{
1079 struct gfs2_inode *ip = GFS2_I(inode);
1080 struct gfs2_sbd *sdp = GFS2_SB(inode);
1081 int error;
1082
1083 if (attr->ia_size != ip->i_disksize) {
1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1085 if (error)
1086 return error;
1087 truncate_setsize(inode, attr->ia_size);
1088 gfs2_trans_end(sdp);
1089 }
1090
1091 error = gfs2_truncatei(ip, attr->ia_size);
1092 if (error && (inode->i_size != ip->i_disksize))
1093 i_size_write(inode, ip->i_disksize);
1094
1095 return error;
1096}
1097
1098static int setattr_chown(struct inode *inode, struct iattr *attr) 1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1099{ 1069{
1100 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
1101 struct gfs2_sbd *sdp = GFS2_SB(inode); 1071 struct gfs2_sbd *sdp = GFS2_SB(inode);
1102 struct buffer_head *dibh;
1103 u32 ouid, ogid, nuid, ngid; 1072 u32 ouid, ogid, nuid, ngid;
1104 int error; 1073 int error;
1105 1074
@@ -1130,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1130 if (error) 1099 if (error)
1131 goto out_gunlock_q; 1100 goto out_gunlock_q;
1132 1101
1133 error = gfs2_meta_inode_buffer(ip, &dibh); 1102 error = gfs2_setattr_simple(ip, attr);
1134 if (error) 1103 if (error)
1135 goto out_end_trans; 1104 goto out_end_trans;
1136 1105
1137 if ((attr->ia_valid & ATTR_SIZE) &&
1138 attr->ia_size != i_size_read(inode)) {
1139 int error;
1140
1141 error = vmtruncate(inode, attr->ia_size);
1142 gfs2_assert_warn(sdp, !error);
1143 }
1144
1145 setattr_copy(inode, attr);
1146 mark_inode_dirty(inode);
1147
1148 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1149 gfs2_dinode_out(ip, dibh->b_data);
1150 brelse(dibh);
1151
1152 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 1106 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1153 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1107 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1154 gfs2_quota_change(ip, -blocks, ouid, ogid); 1108 gfs2_quota_change(ip, -blocks, ouid, ogid);
@@ -1195,7 +1149,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1195 goto out; 1149 goto out;
1196 1150
1197 if (attr->ia_valid & ATTR_SIZE) 1151 if (attr->ia_valid & ATTR_SIZE)
1198 error = setattr_size(inode, attr); 1152 error = gfs2_setattr_size(inode, attr->ia_size);
1199 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1153 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1200 error = setattr_chown(inode, attr); 1154 error = setattr_chown(inode, attr);
1201 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1155 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
631 struct fs_disk_quota *fdq) 631 struct fs_disk_quota *fdq)
632{ 632{
633 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
634 struct gfs2_sbd *sdp = GFS2_SB(inode);
634 struct address_space *mapping = inode->i_mapping; 635 struct address_space *mapping = inode->i_mapping;
635 unsigned long index = loc >> PAGE_CACHE_SHIFT; 636 unsigned long index = loc >> PAGE_CACHE_SHIFT;
636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 637 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,13 +659,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
658 qd->qd_qb.qb_value = qp->qu_value; 659 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) { 660 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 661 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); 662 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
662 qd->qd_qb.qb_warn = qp->qu_warn; 663 qd->qd_qb.qb_warn = qp->qu_warn;
663 } 664 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) { 665 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
666 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
667 } 668 }
669 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
670 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
671 qd->qd_qb.qb_value = qp->qu_value;
672 }
668 } 673 }
669 674
670 /* Write the quota into the quota file on disk */ 675 /* Write the quota into the quota file on disk */
@@ -735,10 +740,8 @@ get_a_page:
735 goto out; 740 goto out;
736 741
737 size = loc + sizeof(struct gfs2_quota); 742 size = loc + sizeof(struct gfs2_quota);
738 if (size > inode->i_size) { 743 if (size > inode->i_size)
739 ip->i_disksize = size;
740 i_size_write(inode, size); 744 i_size_write(inode, size);
741 }
742 inode->i_mtime = inode->i_atime = CURRENT_TIME; 745 inode->i_mtime = inode->i_atime = CURRENT_TIME;
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 746 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_dinode_out(ip, dibh->b_data); 747 gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +820,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
817 goto out_alloc; 820 goto out_alloc;
818 821
819 if (nalloc) 822 if (nalloc)
820 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; 823 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
821 824
822 error = gfs2_trans_begin(sdp, blocks, 0); 825 error = gfs2_trans_begin(sdp, blocks, 0);
823 if (error) 826 if (error)
@@ -1190,18 +1193,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1190int gfs2_quota_init(struct gfs2_sbd *sdp) 1193int gfs2_quota_init(struct gfs2_sbd *sdp)
1191{ 1194{
1192 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1195 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1193 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
1196 u64 size = i_size_read(sdp->sd_qc_inode);
1197 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1194 unsigned int x, slot = 0; 1198 unsigned int x, slot = 0;
1195 unsigned int found = 0; 1199 unsigned int found = 0;
1196 u64 dblock; 1200 u64 dblock;
1197 u32 extlen = 0; 1201 u32 extlen = 0;
1198 int error; 1202 int error;
1199 1203
1200 if (!ip->i_disksize || ip->i_disksize > (64 << 20) || 1204 if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
1201 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1202 gfs2_consist_inode(ip);
1203 return -EIO; 1205 return -EIO;
1204 } 1206
1205 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1207 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1206 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1208 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1207 1209
@@ -1500,9 +1502,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1500 fdq->d_version = FS_DQUOT_VERSION; 1502 fdq->d_version = FS_DQUOT_VERSION;
1501 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1503 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1502 fdq->d_id = id; 1504 fdq->d_id = id;
1503 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1505 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1504 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1506 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1505 fdq->d_bcount = be64_to_cpu(qlvb->qb_value); 1507 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1506 1508
1507 gfs2_glock_dq_uninit(&q_gh); 1509 gfs2_glock_dq_uninit(&q_gh);
1508out: 1510out:
@@ -1511,7 +1513,7 @@ out:
1511} 1513}
1512 1514
1513/* GFS2 only supports a subset of the XFS fields */ 1515/* GFS2 only supports a subset of the XFS fields */
1514#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1516#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1515 1517
1516static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1518static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1517 struct fs_disk_quota *fdq) 1519 struct fs_disk_quota *fdq)
@@ -1569,11 +1571,17 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1569 1571
1570 /* If nothing has changed, this is a no-op */ 1572 /* If nothing has changed, this is a no-op */
1571 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1573 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1572 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) 1574 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1573 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1575 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1576
1574 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1577 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1575 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) 1578 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1576 fdq->d_fieldmask ^= FS_DQ_BHARD; 1579 fdq->d_fieldmask ^= FS_DQ_BHARD;
1580
1581 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
1582 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1583 fdq->d_fieldmask ^= FS_DQ_BCOUNT;
1584
1577 if (fdq->d_fieldmask == 0) 1585 if (fdq->d_fieldmask == 0)
1578 goto out_i; 1586 goto out_i;
1579 1587
@@ -1589,6 +1597,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1589 error = gfs2_inplace_reserve(ip); 1597 error = gfs2_inplace_reserve(ip);
1590 if (error) 1598 if (error)
1591 goto out_alloc; 1599 goto out_alloc;
1600 blocks += gfs2_rg_blocks(al);
1592 } 1601 }
1593 1602
1594 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
@@ -1621,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1621 .get_dqblk = gfs2_get_dqblk, 1630 .get_dqblk = gfs2_get_dqblk,
1622 .set_dqblk = gfs2_set_dqblk, 1631 .set_dqblk = gfs2_set_dqblk,
1623}; 1632};
1624
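
Note: the shifts added throughout quota.c fix a units mismatch: the XFS-style fs_disk_quota interface speaks 512-byte basic blocks, while GFS2 stores limits in filesystem blocks; sd_fsb2bb_shift is (assuming its usual definition) the difference of the two block-size shifts. Worked example for a 4096-byte filesystem block:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        unsigned int fsb2bb = 12 - 9;   /* 4096 = 512 << 3 */
        uint64_t limit_fsb = 1000;      /* stored on disk: fs blocks */
        uint64_t limit_bb  = limit_fsb << fsb2bb;

        assert(limit_bb == 8000);                   /* reported to userspace */
        assert((limit_bb >> fsb2bb) == limit_fsb);  /* and back on set */
        return 0;
}
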
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
455 int ro = 0; 455 int ro = 0;
456 unsigned int pass; 456 unsigned int pass;
457 int error; 457 int error;
458 int jlocked = 0;
458 459
459 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 460 if (sdp->sd_args.ar_spectator ||
461 (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
460 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", 462 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
461 jd->jd_jid); 463 jd->jd_jid);
462
464 jlocked = 1;
463 /* Acquire the journal lock so we can do recovery */ 465 /* Acquire the journal lock so we can do recovery */
464 466
465 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, 467 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
554 jd->jd_jid, t); 556 jd->jd_jid, t);
555 } 557 }
556 558
557 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
558 gfs2_glock_dq_uninit(&ji_gh);
559
560 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 559 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
561 560
562 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 561 if (jlocked) {
562 gfs2_glock_dq_uninit(&ji_gh);
563 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
564 }
564 565
565 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 566 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
566 goto done; 567 goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
568fail_gunlock_tr: 569fail_gunlock_tr:
569 gfs2_glock_dq_uninit(&t_gh); 570 gfs2_glock_dq_uninit(&t_gh);
570fail_gunlock_ji: 571fail_gunlock_ji:
571 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 572 if (jlocked) {
572 gfs2_glock_dq_uninit(&ji_gh); 573 gfs2_glock_dq_uninit(&ji_gh);
573fail_gunlock_j: 574fail_gunlock_j:
574 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
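
Note: recovery now records in jlocked whether it actually took the journal glocks, instead of re-deriving that from the jid comparison at each unlock site; the new spectator case would make the re-derived test wrong. The shape of the idiom:

int jlocked = 0;

if (sdp->sd_args.ar_spectator ||
    jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
        /* ... acquire j_gh and ji_gh ... */
        jlocked = 1;
}
/* ... later, on both the success and error paths ... */
if (jlocked) {
        gfs2_glock_dq_uninit(&ji_gh);
        gfs2_glock_dq_uninit(&j_gh);
}
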
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) 503 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -583,12 +583,14 @@ static int read_rindex_entry(struct gfs2_inode *ip,
583 * Returns: 0 on successful update, error code otherwise 583 * Returns: 0 on successful update, error code otherwise
584 */ 584 */
585 585
586static int gfs2_ri_update(struct gfs2_inode *ip) 586int gfs2_ri_update(struct gfs2_inode *ip)
587{ 587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = i_size_read(inode);
592 struct gfs2_rgrpd *rgd;
593 unsigned int max_data = 0;
592 int error; 594 int error;
593 595
594 do_div(rgrp_count, sizeof(struct gfs2_rindex)); 596 do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,40 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
603 } 605 }
604 } 606 }
605 607
606 sdp->sd_rindex_uptodate = 1;
607 return 0;
608}
609
608 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
609 if (rgd->rd_data > max_data)
610 max_data = rgd->rd_data;
611 sdp->sd_max_rg_data = max_data;
610/**
611 * gfs2_ri_update_special - Pull in a new resource index from the disk
612 *
613 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
614 * In this case we know that we don't have any resource groups in memory yet.
615 *
616 * @ip: pointer to the rindex inode
617 *
618 * Returns: 0 on successful update, error code otherwise
619 */
620static int gfs2_ri_update_special(struct gfs2_inode *ip)
621{
622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 struct inode *inode = &ip->i_inode;
624 struct file_ra_state ra_state;
625 int error;
626
627 file_ra_state_init(&ra_state, inode->i_mapping);
628 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
629 /* Ignore partials */
630 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
631 ip->i_disksize)
632 break;
633 error = read_rindex_entry(ip, &ra_state);
634 if (error) {
635 clear_rgrpdi(sdp);
636 return error;
637 }
638 }
639
640 sdp->sd_rindex_uptodate = 1; 612 sdp->sd_rindex_uptodate = 1;
641 return 0; 613 return 0;
642} 614}
@@ -854,8 +826,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 826 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 827 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 828 nr_sects, GFP_NOFS,
857 BLKDEV_IFL_WAIT | 829 0);
858 BLKDEV_IFL_BARRIER);
859 if (rv) 830 if (rv)
860 goto fail; 831 goto fail;
861 nr_sects = 0; 832 nr_sects = 0;
@@ -869,8 +840,7 @@ start_new_extent:
869 } 840 }
870 } 841 }
871 if (nr_sects) { 842 if (nr_sects) {
872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 843 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
874 if (rv) 844 if (rv)
875 goto fail; 845 goto fail;
876 } 846 }
@@ -953,17 +923,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
953 * The inode, if one has been found, in inode. 923 * The inode, if one has been found, in inode.
954 */ 924 */
955 925
956static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 926static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
957 u64 skip)
958{ 927{
959 u32 goal = 0, block; 928 u32 goal = 0, block;
960 u64 no_addr; 929 u64 no_addr;
961 struct gfs2_sbd *sdp = rgd->rd_sbd; 930 struct gfs2_sbd *sdp = rgd->rd_sbd;
962 unsigned int n; 931 unsigned int n;
932 struct gfs2_glock *gl;
933 struct gfs2_inode *ip;
934 int error;
935 int found = 0;
963 936
964 for(;;) { 937 while (goal < rgd->rd_data) {
965 if (goal >= rgd->rd_data)
966 break;
967 down_write(&sdp->sd_log_flush_lock); 938 down_write(&sdp->sd_log_flush_lock);
968 n = 1; 939 n = 1;
969 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 940 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -980,11 +951,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
980 if (no_addr == skip) 951 if (no_addr == skip)
981 continue; 952 continue;
982 *last_unlinked = no_addr; 953 *last_unlinked = no_addr;
983 return no_addr;
954
955 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
956 if (error)
957 continue;
958
959 /* If the inode is already in cache, we can ignore it here
960 * because the existing inode disposal code will deal with
961 * it when all refs have gone away. Accessing gl_object like
962 * this is not safe in general. Here it is ok because we do
963 * not dereference the pointer, and we only need an approx
964 * answer to whether it is NULL or not.
965 */
966 ip = gl->gl_object;
967
968 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
969 gfs2_glock_put(gl);
970 else
971 found++;
972
973 /* Limit reclaim to sensible number of tasks */
974 if (found > 2*NR_CPUS)
975 return;
984 } 976 }
985 977
986 rgd->rd_flags &= ~GFS2_RDF_CHECK; 978 rgd->rd_flags &= ~GFS2_RDF_CHECK;
987 return 0; 979 return;
988} 980}
989 981
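
Note: try_rgrp_unlink() no longer hands a block back for the caller to iput() while allocation state is held; it queues the glock's gl_delete work instead, so disposal runs from gfs2_delete_workqueue without the rgrp locks. The reference handoff from the hunk above:

/* Take a glock ref; the queued work owns it on success. */
ip = gl->gl_object;
if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
        gfs2_glock_put(gl);     /* in cache or already queued: drop ref */
else
        found++;                /* queued; reclaim capped at 2*NR_CPUS per scan */
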
990/** 982/**
@@ -1065,11 +1057,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
  *
  * Returns: errno
- * unlinked: the block address of an unlinked block to be reclaimed
  */
 
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
-			  u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1079,7 +1069,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 	int loops = 0;
 	int error, rg_locked;
 
-	*unlinked = 0;
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1096,17 +1085,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			/* If the rg came in already locked, there's no
-			   way we can recover from a failed try_rgrp_unlink
-			   because that would require an iput which can only
-			   happen after the rgrp is unlocked. */
-			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
-							    ip->i_no_addr);
+			if (rgd->rd_flags & GFS2_RDF_CHECK)
+				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (*unlinked)
-				return -EAGAIN;
 			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd);
@@ -1135,13 +1117,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
-							    ip->i_no_addr);
+			if (rgd->rd_flags & GFS2_RDF_CHECK)
+				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (*unlinked)
-				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1188,47 +1167,52 @@ out:
  * Returns: errno
  */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+			   char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK, unlinked;
+	u64 last_unlinked = NO_BLOCK;
+	int tries = 0;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
 
-try_again:
-	/* We need to hold the rindex unless the inode we're using is
-	   the rindex itself, in which case it's already held. */
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-		error = gfs2_ri_update_special(ip);
-
-	if (error)
-		return error;
-
-	/* Find an rgrp suitable for allocation. If it encounters any unlinked
-	   dinodes along the way, error will equal -EAGAIN and unlinked will
-	   contains it block address. We then need to look up that inode and
-	   try to free it, and try the allocation again. */
-	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
-	if (error) {
+	if (hold_rindex) {
+		/* We need to hold the rindex unless the inode we're using is
+		   the rindex itself, in which case it's already held. */
 		if (ip != GFS2_I(sdp->sd_rindex))
-			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (error != -EAGAIN)
+			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+		else if (!sdp->sd_rgrps) /* We may not have the rindex read
+					    in, so: */
+			error = gfs2_ri_update(ip);
+		if (error)
 			return error;
+	}
 
-		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-		/* regardless of whether or not gfs2_process_unlinked_inode
-		   was successful, we don't want to repeat it again. */
-		last_unlinked = unlinked;
-		gfs2_log_flush(sdp, NULL);
-		error = 0;
+try_again:
+	do {
+		error = get_local_rgrp(ip, &last_unlinked);
+		/* If there is no space, flushing the log may release some */
+		if (error) {
+			if (ip == GFS2_I(sdp->sd_rindex) &&
+			    !sdp->sd_rindex_uptodate) {
+				error = gfs2_ri_update(ip);
+				if (error)
+					return error;
+				goto try_again;
+			}
+			gfs2_log_flush(sdp, NULL);
+		}
+	} while (error && tries++ < 3);
 
-		goto try_again;
+	if (error) {
+		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
+			gfs2_glock_dq_uninit(&al->al_ri_gh);
+		return error;
 	}
+
 	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
@@ -1257,7 +1241,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex))
+	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
@@ -1496,11 +1480,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd;
 	u32 goal, blk;
 	u64 block;
 	int error;
 
+	/* Only happens if there is a bug in gfs2, return something distinctive
+	 * to ensure that it is noticed.
+	 */
+	if (al == NULL)
+		return -ECANCELED;
+
+	rgd = al->al_rgd;
+
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
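For reference, the reworked reservation path above is a bounded retry loop: look for a resource group with space and, if none is found, flush the log (which may release blocks) and try again up to three times. A minimal standalone sketch of that control flow; find_space() and flush_log() are illustrative stand-ins for get_local_rgrp() and gfs2_log_flush(), not real GFS2 symbols:

#include <stdio.h>

/* Illustrative stand-ins for get_local_rgrp() and gfs2_log_flush(). */
static int find_space(int attempt)
{
	/* Pretend space only appears after one log flush. */
	return attempt < 1 ? -1 : 0;
}

static void flush_log(void)
{
	printf("flushing log to release space\n");
}

int main(void)
{
	int tries = 0;
	int error;

	do {
		error = find_space(tries);
		/* If there is no space, flushing the log may release some */
		if (error)
			flush_log();
	} while (error && tries++ < 3);

	printf(error ? "allocation failed\n" : "allocation ok\n");
	return error ? 1 : 0;
}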
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,13 +39,16 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-				  unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+				  char *file, unsigned int line);
 #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
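The two macros wrap the same function so every caller's file and line are recorded in the allocation for debugging; gfs2_inplace_reserve_ri() merely passes hold_rindex = 0 for the rindex inode itself. A hedged userspace sketch of the wrapper technique (reserve_i() is a placeholder, not the real function):

#include <stdio.h>

/* Hypothetical analogue of gfs2_inplace_reserve_i(): the macros below
 * forward the call site's file and line so failures can be attributed. */
static int reserve_i(int hold_rindex, const char *file, unsigned int line)
{
	printf("reserve from %s:%u (hold_rindex=%d)\n", file, line, hold_rindex);
	return 0;
}

#define reserve()    reserve_i(1, __FILE__, __LINE__)
#define reserve_ri() reserve_i(0, __FILE__, __LINE__)

int main(void)
{
	reserve();	/* normal callers take the rindex glock */
	reserve_ri();	/* the rindex inode already holds it */
	return 0;
}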
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
 	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
 	{Opt_ignore_local_fs, "ignore_local_fs"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_spectator = 1;
 			break;
 		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_localflocks:
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			args->ar_localcaching = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			args->ar_upgrade = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u64 size = i_size_read(jd->jd_inode);
 
-	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
 		return -EIO;
-	}
-	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
@@ -857,7 +857,6 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/* Take apart glock structures and buffer lists */
-	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/* Unmount the locking protocol */
 	gfs2_lm_unmount(sdp);
@@ -1129,9 +1128,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
 		return -EINVAL;
 
@@ -1234,16 +1231,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -1345,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	if (error)
 		goto out_truncate;
 
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
@@ -1414,11 +1406,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 	return &ip->i_inode;
 }
 
-static void gfs2_destroy_inode(struct inode *inode)
+static void gfs2_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(gfs2_inode_cachep, inode);
 }
 
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, gfs2_i_callback);
+}
+
 const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
 	.destroy_inode		= gfs2_destroy_inode,
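The gfs2_destroy_inode() change above is the generic RCU-delayed inode freeing pattern: instead of returning the object to the slab immediately, the free is deferred with call_rcu() so lockless dentry and inode lookups can never see a recycled inode. Sketched generically below with placeholder foo_* names (foo_inode_cachep is assumed to be the filesystem's inode kmem_cache):

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	/* Runs after a grace period: no RCU reader can still hold the inode. */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foo_inode_cachep, inode);
}

static void foo_destroy_inode(struct inode *inode)
{
	/* Defer the actual free until outstanding RCU readers are done. */
	call_rcu(&inode->i_rcu, foo_i_callback);
}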
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 
 	if (gltype > LM_TYPE_JOURNAL)
 		return -EINVAL;
-	glops = gfs2_glops_list[gltype];
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+		glops = &gfs2_trans_glops;
+	else
+		glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
 	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 {
-	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+	return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
 }
 
 static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-	unsigned jid;
+	int jid;
 	int rv;
 
-	rv = sscanf(buf, "%u", &jid);
+	rv = sscanf(buf, "%d", &jid);
 	if (rv != 1)
 		return -EINVAL;
 
 	spin_lock(&sdp->sd_jindex_spin);
 	rv = -EINVAL;
-	if (sdp->sd_args.ar_spectator)
-		goto out;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto out;
 	rv = -EBUSY;
-	if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
 		goto out;
+	rv = 0;
+	if (sdp->sd_args.ar_spectator && jid > 0)
+		rv = jid = -EINVAL;
 	sdp->sd_lockstruct.ls_jid = jid;
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
-	rv = 0;
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
 	return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
 	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
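jid_store() above now reads the journal id as a signed integer and, on a spectator mount, accepts only a non-positive id, since a spectator owns no journal. A small standalone sketch of the same parse-and-validate shape (store_jid() is illustrative, not the kernel function):

#include <stdio.h>

#define EINVAL 22

/* Hypothetical analogue of jid_store(): parse a signed jid and
 * reject a real journal id when the mount is a spectator. */
static int store_jid(const char *buf, int spectator)
{
	int jid;

	if (sscanf(buf, "%d", &jid) != 1)
		return -EINVAL;
	if (spectator && jid > 0)
		return -EINVAL;
	printf("journal id set to %d\n", jid);
	return 0;
}

int main(void)
{
	store_jid("0", 1);		/* ok: spectator with jid 0 */
	return store_jid("3", 1) < 0;	/* rejected: spectator with jid > 0 */
}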
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
 	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
-	{(1UL << GLF_FROZEN),			"F" })
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" })
 
 #ifndef NUMPTY
 #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
 #define RES_JDATA	1
 #define RES_DATA	1
 #define RES_LEAF	1
+#define RES_RG_HDR	1
 #define RES_RG_BIT	2
 #define RES_EATTR	1
 #define RES_STATFS	1
 #define RES_QUOTA	2
 
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	return (al->al_requested < al->al_rgd->rd_length)?
+	       al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes);
 
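gfs2_rg_blocks() bounds a transaction's resource-group reservation: the requested blocks plus one header block, but never more than the whole resource group, since one allocation cannot dirty more of the rgrp than exists (the xattr.c hunk below uses it in exactly that way). A standalone sketch of the computation, with illustrative field names mirroring al_requested and rd_length:

#include <stdio.h>

struct alloc {
	unsigned int requested;	/* blocks the caller asked for */
	unsigned int rg_length;	/* total blocks in the resource group */
};

/* Mirror of gfs2_rg_blocks(): requested blocks plus the rg header,
 * bounded by the whole resource group. */
static unsigned int rg_blocks(const struct alloc *al)
{
	return (al->requested < al->rg_length) ?
	       al->requested + 1 : al->rg_length;
}

int main(void)
{
	struct alloc small = { .requested = 8,    .rg_length = 1024 };
	struct alloc huge  = { .requested = 4096, .rg_length = 1024 };

	printf("%u %u\n", rg_blocks(&small), rg_blocks(&huge)); /* 9 1024 */
	return 0;
}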
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
+				 blks + gfs2_rg_blocks(al) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
@@ -1296,10 +1296,8 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
-	struct buffer_head *dibh;
 	int error;
 
 	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 	if (error)
 		return error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_trans_end;
-
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		gfs2_assert_warn(GFS2_SB(inode), !error);
-	}
-
-	setattr_copy(inode, attr);
-	mark_inode_dirty(inode);
-
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-
-out_trans_end:
+	error = gfs2_setattr_simple(ip, attr);
 	gfs2_trans_end(sdp);
 	return error;
 }
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
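The bfind.c change is part of the mechanical semaphore-to-mutex conversion running through these hfs patches: the tree lock is used strictly for mutual exclusion, so a struct mutex documents the intent and gains lockdep coverage. The one-to-one mapping, sketched with the kernel APIs as they appear in the diff:

#include <linux/mutex.h>

struct example {
	struct mutex tree_lock;		/* was: struct semaphore tree_lock; */
};

static void example_use(struct example *e)
{
	mutex_init(&e->tree_lock);	/* was: init_MUTEX(&e->tree_lock); */
	mutex_lock(&e->tree_lock);	/* was: down(&e->tree_lock); */
	/* ... critical section ... */
	mutex_unlock(&e->tree_lock);	/* was: up(&e->tree_lock); */
}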
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	/* Set the correct compare function */
 	tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41b..afa66aaa2237 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	int res;
 
-	dentry->d_op = &hfs_dentry_operations;
-
 	hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
 	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
 	res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..ad97c2d58287 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
 	u16 blockoffset;
 
 	int fs_div;
-
-	struct hlist_head rsrc_inodes;
 };
 
 #define HFS_FLG_BITMAP_DIRTY	0
@@ -215,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
 
-extern int hfs_hash_dentry(struct dentry *, struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
+		struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+extern int hfs_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 /* trans.c */
 extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
@@ -254,17 +256,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
 	sb->s_dirt = 1;
 }
 
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-	while (buffer_locked(bh)) {
-		wait_on_buffer(bh);
-	}
-	if (buffer_dirty(bh)) {
-		ll_rw_block(WRITE, 1, &bh);
-		wait_on_buffer(bh);
-	}
-}
-
 #define sb_bread512(sb, sec, data) ({		\
 	struct buffer_head *__bh;		\
 	sector_t __block;			\
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	HFS_I(inode)->rsrc_inode = dir;
 	HFS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+	hlist_add_fake(&inode->i_hash);
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
 		mdb->drLsMod = hfs_mtime();
 
 		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
 	}
 
 	return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
 		HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
 		mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
 	}
 
 	if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af79428..495a976a3cc9 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
 /*
  * Hash a string to an integer in a case-independent way
  */
-int hfs_hash_dentry(struct dentry *dentry, struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *this)
 {
 	const unsigned char *name = this->name;
 	unsigned int hash, len = this->len;
@@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
  * Test for equality of two strings in the HFS filename character ordering.
  * return 1 on failure and 0 on success
  */
-int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	const unsigned char *n1, *n2;
-	int len;
 
-	len = s1->len;
 	if (len >= HFS_NAMELEN) {
-		if (s2->len < HFS_NAMELEN)
+		if (name->len < HFS_NAMELEN)
 			return 1;
 		len = HFS_NAMELEN;
-	} else if (len != s2->len)
+	} else if (len != name->len)
 		return 1;
 
-	n1 = s1->name;
-	n2 = s2->name;
+	n1 = str;
+	n2 = name->name;
 	while (len--) {
 		if (caseorder[*n1++] != caseorder[*n2++])
 			return 1;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34235d4bf08b..1b55f704fb22 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -172,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
 	return i ? &i->vfs_inode : NULL;
 }
 
-static void hfs_destroy_inode(struct inode *inode)
+static void hfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
 }
 
+static void hfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hfs_i_callback);
+}
+
 static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
@@ -385,8 +387,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
@@ -427,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!root_inode)
 		goto bail_no_root;
 
+	sb->s_d_op = &hfs_dentry_operations;
 	res = -ENOMEM;
 	sb->s_root = d_alloc_root(root_inode);
 	if (!sb->s_root)
 		goto bail_iput;
 
-	sb->s_root->d_op = &hfs_dentry_operations;
-
 	/* everything's okay */
 	return 0;
 
@@ -446,17 +447,16 @@ bail:
 	return res;
 }
 
-static int hfs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 
 static struct file_system_type hfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hfs",
-	.get_sb		= hfs_get_sb,
+	.mount		= hfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219aa..19cf291eb91f 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
  * This file contains the code to do various system dependent things.
  */
 
+#include <linux/namei.h>
 #include "hfs_fs.h"
 
 /* dentry case-handling: just lowercase everything */
 
 static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	int diff;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
 	if(!inode)
 		return 1;
 
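The sysdep.c hunk shows the standard RCU-walk opt-out for d_revalidate(): under LOOKUP_RCU the dentry is being walked locklessly, so an implementation that may block or take references returns -ECHILD and the VFS retries in ref-walk mode. The bare pattern (foo_d_revalidate() is a placeholder name):

static int foo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	/* Cannot block or grab references during lockless path walk;
	 * ask the VFS to fall back to ref-walk mode. */
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

	/* ... normal, possibly-blocking revalidation ... */
	return 1;
}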
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..5d799c13205f 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,8 +22,9 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 		return -ENOMEM;
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
-	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+		tree->cnid, __builtin_return_address(0));
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -31,8 +32,9 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
-	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+		fd->tree->cnid, __builtin_return_address(0));
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
@@ -52,6 +54,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +73,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -75,6 +85,7 @@ done:
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+fail:
 	return res;
 }
 
@@ -198,6 +209,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
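The keylen == 0 checks added throughout bfind.c turn a corrupt on-disk key length (hfs_brec_keylen() now returns 0 for oversized keys, per the brec.c hunk further down) into -EINVAL rather than reading with a bogus length. The defensive shape, as a standalone sketch with an assumed bound:

#include <stdio.h>

#define EINVAL 22
#define MAX_KEYLEN 516	/* assumed bound, analogous to max_key_len + 2 */

/* Hypothetical reader: validate a length field before trusting it. */
static int read_record(unsigned int keylen)
{
	if (keylen == 0 || keylen > MAX_KEYLEN)
		return -EINVAL;	/* corrupt metadata, refuse to read */
	printf("reading %u key bytes\n", keylen);
	return 0;
}

int main(void)
{
	read_record(32);		/* ok */
	return read_record(0) < 0;	/* rejected */
}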
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..1cad80c789cb 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -15,8 +15,10 @@
 
 #define PAGE_CACHE_BITS	(PAGE_CACHE_SIZE * 8)
 
-int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
+int hfsplus_block_allocate(struct super_block *sb, u32 size,
+		u32 offset, u32 *max)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -29,8 +31,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
 		start = size;
@@ -150,16 +152,17 @@ done:
 	set_page_dirty(page);
 	kunmap(page);
 	*max = offset + (curr - pptr) * 32 + i - start;
-	HFSPLUS_SB(sb).free_blocks -= *max;
+	sbi->free_blocks -= *max;
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -172,11 +175,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
 	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
-	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+	if ((offset + count) > sbi->total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
@@ -224,9 +227,9 @@ done:
 out:
 	set_page_dirty(page);
 	kunmap(page);
-	HFSPLUS_SB(sb).free_blocks += len;
+	sbi->free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 
 	return 0;
 }
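The bitmap routines stop borrowing the allocation file's i_mutex and take a dedicated alloc_mutex from the superblock info, so the lock now means exactly "bitmap update in progress". A pthread-based sketch of giving the allocator its own lock (names are illustrative):

#include <pthread.h>
#include <stdio.h>

struct sb_info {
	pthread_mutex_t alloc_mutex;	/* guards the allocation bitmap only */
	unsigned int free_blocks;
};

static void block_free(struct sb_info *sbi, unsigned int count)
{
	pthread_mutex_lock(&sbi->alloc_mutex);
	sbi->free_blocks += count;	/* bitmap/bookkeeping update */
	pthread_mutex_unlock(&sbi->alloc_mutex);
}

int main(void)
{
	struct sb_info sbi = { PTHREAD_MUTEX_INITIALIZER, 100 };

	block_free(&sbi, 8);
	printf("free blocks: %u\n", sbi.free_blocks);
	return 0;
}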
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 29da6574ba77..1c42cc5b899f 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 {
 	__be16 data;
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_read(node, &data, off, 2);
 	return be16_to_cpu(data);
 }
@@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
 {
 	u8 data;
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_read(node, &data, off, 1);
 	return data;
 }
@@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
 {
 	__be16 v = cpu_to_be16(data);
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_write(node, &v, off, 2);
 }
 
@@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			dst_page--;
 		}
 		src -= len;
-		memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len);
+		memmove(kmap(*dst_page) + src,
+			kmap(*src_page) + src, len);
 		kunmap(*src_page);
 		set_page_dirty(*dst_page);
 		kunmap(*dst_page);
@@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 
 		if (src == dst) {
 			l = min(len, (int)PAGE_CACHE_SIZE - src);
-			memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l);
+			memmove(kmap(*dst_page) + src,
+				kmap(*src_page) + src, l);
 			kunmap(*src_page);
 			set_page_dirty(*dst_page);
 			kunmap(*dst_page);
 
 			while ((len -= l) != 0) {
 				l = min(len, (int)PAGE_CACHE_SIZE);
-				memmove(kmap(*++dst_page), kmap(*++src_page), l);
+				memmove(kmap(*++dst_page),
+					kmap(*++src_page), l);
 				kunmap(*src_page);
 				set_page_dirty(*dst_page);
 				kunmap(*dst_page);
@@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			do {
 				src_ptr = kmap(*src_page) + src;
 				dst_ptr = kmap(*dst_page) + dst;
-				if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
+				if (PAGE_CACHE_SIZE - src <
+						PAGE_CACHE_SIZE - dst) {
 					l = PAGE_CACHE_SIZE - src;
 					src = 0;
 					dst += l;
@@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 			return;
 		tmp->next = node->next;
 		cnid = cpu_to_be32(tmp->next);
-		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, next), 4);
 		hfs_bnode_put(tmp);
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_head = node->next;
@@ -351,15 +356,15 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 			return;
 		tmp->prev = node->prev;
 		cnid = cpu_to_be32(tmp->prev);
-		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, prev), 4);
 		hfs_bnode_put(tmp);
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_tail = node->prev;
 
-	// move down?
-	if (!node->prev && !node->next) {
-		printk(KERN_DEBUG "hfs_btree_del_level\n");
-	}
+	/* move down? */
+	if (!node->prev && !node->next)
+		dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
 	if (!node->parent) {
 		tree->root = 0;
 		tree->depth = 0;
@@ -379,16 +384,16 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node "
+				"%d in B*Tree\n",
+			cnid);
 		return NULL;
 	}
 
 	for (node = tree->node_hash[hfs_bnode_hash(cnid)];
-	     node; node = node->next_hash) {
-		if (node->this == cnid) {
+	     node; node = node->next_hash)
+		if (node->this == cnid)
 			return node;
-		}
-	}
 	return NULL;
 }
 
@@ -402,7 +407,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node "
+				"%d in B*Tree\n",
+			cnid);
 		return NULL;
 	}
 
@@ -429,7 +436,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	} else {
 		spin_unlock(&tree->hash_lock);
 		kfree(node);
-		wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
+		wait_event(node2->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node2->flags));
 		return node2;
 	}
 	spin_unlock(&tree->hash_lock);
@@ -483,7 +491,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 	if (node) {
 		hfs_bnode_get(node);
 		spin_unlock(&tree->hash_lock);
-		wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
+		wait_event(node->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node->flags));
 		if (test_bit(HFS_BNODE_ERROR, &node->flags))
 			goto node_error;
 		return node;
@@ -497,7 +506,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 	if (!test_bit(HFS_BNODE_NEW, &node->flags))
 		return node;
 
-	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
+			node->page_offset);
 	node->prev = be32_to_cpu(desc->prev);
 	node->next = be32_to_cpu(desc->next);
 	node->num_recs = be16_to_cpu(desc->num_recs);
@@ -556,11 +566,13 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-	//int i;
+#if 0
+	int i;
 
-	//for (i = 0; i < node->tree->pages_per_bnode; i++)
-	//	if (node->page[i])
-	//		page_cache_release(node->page[i]);
+	for (i = 0; i < node->tree->pages_per_bnode; i++)
+		if (node->page[i])
+			page_cache_release(node->page[i]);
+#endif
 	kfree(node);
 }
 
@@ -607,7 +619,8 @@ void hfs_bnode_get(struct hfs_bnode *node)
 	if (node) {
 		atomic_inc(&node->refcnt);
 		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 	}
 }
 
@@ -619,7 +632,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		int i;
 
 		dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 		BUG_ON(!atomic_read(&node->refcnt));
 		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
 			return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2312de34bd42 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -39,13 +39,17 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 	    !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
 		retval = node->tree->max_key_len + 2;
 	} else {
-		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
+		recoff = hfs_bnode_read_u16(node,
+			node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
-			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
-			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			printk(KERN_ERR "hfs: keylen %d too large\n",
+				retval);
+			retval = 0;
+		}
 	}
 	return retval;
 }
@@ -81,7 +85,8 @@ again:
 	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
 	end_off = hfs_bnode_read_u16(node, end_rec_off);
 	end_rec_off -= 2;
-	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+		rec, size, end_off, end_rec_off);
 	if (size > end_rec_off - end_off) {
 		if (new_node)
 			panic("not enough room!\n");
@@ -96,7 +101,9 @@ again:
 	}
 	node->num_recs++;
 	/* write new last offset */
-	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
 	hfs_bnode_write_u16(node, end_rec_off, end_off + size);
 	data_off = end_off;
 	data_rec_off = end_rec_off + 2;
@@ -148,7 +155,8 @@ skip:
 	if (tree->attributes & HFS_TREE_VARIDXKEYS)
 		key_len = be16_to_cpu(fd->search_key->key_len) + 2;
 	else {
-		fd->search_key->key_len = cpu_to_be16(tree->max_key_len);
+		fd->search_key->key_len =
+			cpu_to_be16(tree->max_key_len);
 		key_len = tree->max_key_len + 2;
 	}
 	goto again;
@@ -177,7 +185,8 @@ again:
 		mark_inode_dirty(tree->inode);
 	}
 	hfs_bnode_dump(node);
-	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+		fd->record, fd->keylength + fd->entrylength);
 	if (!--node->num_recs) {
 		hfs_bnode_unlink(node);
 		if (!node->parent)
@@ -191,7 +200,9 @@ again:
 		__hfs_brec_find(node, fd);
 		goto again;
 	}
-	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
 
 	if (rec_off == end_off)
 		goto skip;
@@ -216,7 +227,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
 	struct hfs_btree *tree;
-	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode *node, *new_node, *next_node;
 	struct hfs_bnode_desc node_desc;
 	int num_recs, new_rec_off, new_off, old_rec_off;
 	int data_start, data_end, size;
@@ -235,6 +246,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	new_node->type = node->type;
 	new_node->height = node->height;
 
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
 	size = tree->node_size / 2 - node->num_recs * 2 - 14;
 	old_rec_off = tree->node_size - 4;
 	num_recs = 1;
@@ -248,6 +270,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 			/* panic? */
 			hfs_bnode_put(node);
 			hfs_bnode_put(new_node);
+			if (next_node)
+				hfs_bnode_put(next_node);
 			return ERR_PTR(-ENOSPC);
 		}
 
@@ -302,8 +326,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
 	/* update next bnode header */
-	if (new_node->next) {
-		struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+	if (next_node) {
 		next_node->prev = new_node->this;
 		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
 		node_desc.prev = cpu_to_be32(next_node->prev);
@@ -349,7 +372,8 @@ again:
349 newkeylen = hfs_bnode_read_u16(node, 14) + 2; 372 newkeylen = hfs_bnode_read_u16(node, 14) + 2;
350 else 373 else
351 fd->keylength = newkeylen = tree->max_key_len + 2; 374 fd->keylength = newkeylen = tree->max_key_len + 2;
352 dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); 375 dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
376 rec, fd->keylength, newkeylen);
353 377
354 rec_off = tree->node_size - (rec + 2) * 2; 378 rec_off = tree->node_size - (rec + 2) * 2;
355 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; 379 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -360,7 +384,7 @@ again:
360 end_off = hfs_bnode_read_u16(parent, end_rec_off); 384 end_off = hfs_bnode_read_u16(parent, end_rec_off);
361 if (end_rec_off - end_off < diff) { 385 if (end_rec_off - end_off < diff) {
362 386
363 printk(KERN_DEBUG "hfs: splitting index node...\n"); 387 dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
364 fd->bnode = parent; 388 fd->bnode = parent;
365 new_node = hfs_bnode_split(fd); 389 new_node = hfs_bnode_split(fd);
366 if (IS_ERR(new_node)) 390 if (IS_ERR(new_node))
@@ -368,7 +392,8 @@ again:
368 parent = fd->bnode; 392 parent = fd->bnode;
369 rec = fd->record; 393 rec = fd->record;
370 rec_off = tree->node_size - (rec + 2) * 2; 394 rec_off = tree->node_size - (rec + 2) * 2;
371 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; 395 end_rec_off = tree->node_size -
396 (parent->num_recs + 1) * 2;
372 } 397 }
373 } 398 }
374 399
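
The hunks above also demote the unconditional printk(KERN_DEBUG ...) in the index-node split path to dprint(DBG_BNODE_MOD, ...), which is compiled against a debug mask. A self-contained sketch of such a mask-gated macro; the mask value and the fprintf backend are assumptions for the demo, the kernel's dprint() is configured elsewhere:

#include <stdio.h>

#define DBG_BNODE_MOD	0x00000001
#define DBG_EXTENT	0x00000002

/* Assumed to be set from a module parameter or build-time option. */
static unsigned int debug_mask = DBG_BNODE_MOD;

#define dprint(flg, fmt, ...)						\
	do {								\
		if (debug_mask & (flg))					\
			fprintf(stderr, fmt, ##__VA_ARGS__);		\
	} while (0)

int main(void)
{
	dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");	/* shown */
	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", 16UL, 100u, 8u); /* gated off */
	return 0;
}
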
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..21023d9f8ff3 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
30 if (!tree) 30 if (!tree)
31 return NULL; 31 return NULL;
32 32
33 init_MUTEX(&tree->tree_lock); 33 mutex_init(&tree->tree_lock);
34 spin_lock_init(&tree->hash_lock); 34 spin_lock_init(&tree->hash_lock);
35 tree->sb = sb; 35 tree->sb = sb;
36 tree->cnid = id; 36 tree->cnid = id;
@@ -39,13 +39,20 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
39 goto free_tree; 39 goto free_tree;
40 tree->inode = inode; 40 tree->inode = inode;
41 41
42 if (!HFSPLUS_I(tree->inode)->first_blocks) {
43 printk(KERN_ERR
44 "hfs: invalid btree extent records (0 size).\n");
45 goto free_inode;
46 }
47
42 mapping = tree->inode->i_mapping; 48 mapping = tree->inode->i_mapping;
43 page = read_mapping_page(mapping, 0, NULL); 49 page = read_mapping_page(mapping, 0, NULL);
44 if (IS_ERR(page)) 50 if (IS_ERR(page))
45 goto free_tree; 51 goto free_inode;
46 52
47 /* Load the header */ 53 /* Load the header */
48 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 54 head = (struct hfs_btree_header_rec *)(kmap(page) +
55 sizeof(struct hfs_bnode_desc));
49 tree->root = be32_to_cpu(head->root); 56 tree->root = be32_to_cpu(head->root);
50 tree->leaf_count = be32_to_cpu(head->leaf_count); 57 tree->leaf_count = be32_to_cpu(head->leaf_count);
51 tree->leaf_head = be32_to_cpu(head->leaf_head); 58 tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -57,40 +64,72 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
57 tree->max_key_len = be16_to_cpu(head->max_key_len); 64 tree->max_key_len = be16_to_cpu(head->max_key_len);
58 tree->depth = be16_to_cpu(head->depth); 65 tree->depth = be16_to_cpu(head->depth);
59 66
60 /* Set the correct compare function */ 67 /* Verify the tree and set the correct compare function */
61 if (id == HFSPLUS_EXT_CNID) { 68 switch (id) {
69 case HFSPLUS_EXT_CNID:
70 if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
71 printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
72 tree->max_key_len);
73 goto fail_page;
74 }
75 if (tree->attributes & HFS_TREE_VARIDXKEYS) {
76 printk(KERN_ERR "hfs: invalid extent btree flag\n");
77 goto fail_page;
78 }
79
62 tree->keycmp = hfsplus_ext_cmp_key; 80 tree->keycmp = hfsplus_ext_cmp_key;
63 } else if (id == HFSPLUS_CAT_CNID) { 81 break;
64 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 82 case HFSPLUS_CAT_CNID:
83 if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
84 printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
85 tree->max_key_len);
86 goto fail_page;
87 }
88 if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
89 printk(KERN_ERR "hfs: invalid catalog btree flag\n");
90 goto fail_page;
91 }
92
93 if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
65 (head->key_type == HFSPLUS_KEY_BINARY)) 94 (head->key_type == HFSPLUS_KEY_BINARY))
66 tree->keycmp = hfsplus_cat_bin_cmp_key; 95 tree->keycmp = hfsplus_cat_bin_cmp_key;
67 else { 96 else {
68 tree->keycmp = hfsplus_cat_case_cmp_key; 97 tree->keycmp = hfsplus_cat_case_cmp_key;
69 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; 98 set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
70 } 99 }
71 } else { 100 break;
101 default:
72 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 102 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
73 goto fail_page; 103 goto fail_page;
74 } 104 }
75 105
106 if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
107 printk(KERN_ERR "hfs: invalid btree flag\n");
108 goto fail_page;
109 }
110
76 size = tree->node_size; 111 size = tree->node_size;
77 if (!is_power_of_2(size)) 112 if (!is_power_of_2(size))
78 goto fail_page; 113 goto fail_page;
79 if (!tree->node_count) 114 if (!tree->node_count)
80 goto fail_page; 115 goto fail_page;
116
81 tree->node_size_shift = ffs(size) - 1; 117 tree->node_size_shift = ffs(size) - 1;
82 118
83 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 119 tree->pages_per_bnode =
120 (tree->node_size + PAGE_CACHE_SIZE - 1) >>
121 PAGE_CACHE_SHIFT;
84 122
85 kunmap(page); 123 kunmap(page);
86 page_cache_release(page); 124 page_cache_release(page);
87 return tree; 125 return tree;
88 126
89 fail_page: 127 fail_page:
90 tree->inode->i_mapping->a_ops = &hfsplus_aops;
91 page_cache_release(page); 128 page_cache_release(page);
92 free_tree: 129 free_inode:
130 tree->inode->i_mapping->a_ops = &hfsplus_aops;
93 iput(tree->inode); 131 iput(tree->inode);
132 free_tree:
94 kfree(tree); 133 kfree(tree);
95 return NULL; 134 return NULL;
96} 135}
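
The rewritten hfs_btree_open() above refuses to mount a tree whose header contradicts its CNID: the extents tree must have a fixed key length and no VARIDXKEYS flag, the catalog tree must have variable keys, and both must carry BIGKEYS. A compilable sketch of that validation; the constants here are illustrative stand-ins, the real values live in the hfsplus headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { EXT_CNID = 3, CAT_CNID = 4 };
#define EXT_KEYLEN		12	/* demo value */
#define CAT_KEYLEN		516	/* demo value */
#define TREE_BIGKEYS		2
#define TREE_VARIDXKEYS		4

struct btree_hdr { uint32_t cnid; uint16_t max_key_len; uint32_t attributes; };

static bool btree_header_valid(const struct btree_hdr *h)
{
	if (!(h->attributes & TREE_BIGKEYS))
		return false;
	switch (h->cnid) {
	case EXT_CNID:	/* extent keys: fixed length, never variable */
		return h->max_key_len == EXT_KEYLEN - sizeof(uint16_t) &&
		       !(h->attributes & TREE_VARIDXKEYS);
	case CAT_CNID:	/* catalog keys embed a name, must be variable */
		return h->max_key_len == CAT_KEYLEN - sizeof(uint16_t) &&
		       (h->attributes & TREE_VARIDXKEYS);
	default:
		return false;
	}
}

int main(void)
{
	struct btree_hdr bad = { EXT_CNID, 9, TREE_BIGKEYS | TREE_VARIDXKEYS };

	printf("valid: %d\n", btree_header_valid(&bad));	/* 0: rejected */
	return 0;
}
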
@@ -108,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree)
108 while ((node = tree->node_hash[i])) { 147 while ((node = tree->node_hash[i])) {
109 tree->node_hash[i] = node->next_hash; 148 tree->node_hash[i] = node->next_hash;
110 if (atomic_read(&node->refcnt)) 149 if (atomic_read(&node->refcnt))
111 printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n", 150 printk(KERN_CRIT "hfs: node %d:%d "
112 node->tree->cnid, node->this, atomic_read(&node->refcnt)); 151 "still has %d user(s)!\n",
152 node->tree->cnid, node->this,
153 atomic_read(&node->refcnt));
113 hfs_bnode_free(node); 154 hfs_bnode_free(node);
114 tree->node_hash_cnt--; 155 tree->node_hash_cnt--;
115 } 156 }
@@ -130,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
130 return; 171 return;
131 /* Load the header */ 172 /* Load the header */
132 page = node->page[0]; 173 page = node->page[0];
133 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 174 head = (struct hfs_btree_header_rec *)(kmap(page) +
175 sizeof(struct hfs_bnode_desc));
134 176
135 head->root = cpu_to_be32(tree->root); 177 head->root = cpu_to_be32(tree->root);
136 head->leaf_count = cpu_to_be32(tree->leaf_count); 178 head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -192,17 +234,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
192 234
193 while (!tree->free_nodes) { 235 while (!tree->free_nodes) {
194 struct inode *inode = tree->inode; 236 struct inode *inode = tree->inode;
237 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
195 u32 count; 238 u32 count;
196 int res; 239 int res;
197 240
198 res = hfsplus_file_extend(inode); 241 res = hfsplus_file_extend(inode);
199 if (res) 242 if (res)
200 return ERR_PTR(res); 243 return ERR_PTR(res);
201 HFSPLUS_I(inode).phys_size = inode->i_size = 244 hip->phys_size = inode->i_size =
202 (loff_t)HFSPLUS_I(inode).alloc_blocks << 245 (loff_t)hip->alloc_blocks <<
203 HFSPLUS_SB(tree->sb).alloc_blksz_shift; 246 HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
204 HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << 247 hip->fs_blocks =
205 HFSPLUS_SB(tree->sb).fs_shift; 248 hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
206 inode_set_bytes(inode, inode->i_size); 249 inode_set_bytes(inode, inode->i_size);
207 count = inode->i_size >> tree->node_size_shift; 250 count = inode->i_size >> tree->node_size_shift;
208 tree->free_nodes = count - tree->node_count; 251 tree->free_nodes = count - tree->node_count;
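
The loop above grows the btree file one clump at a time and derives every size with shifts rather than divisions. The same arithmetic in isolation, with assumed demo geometry (4 KiB allocation blocks, 512-byte fs blocks, 4 KiB nodes; none of these values come from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int alloc_blocks = 10;
	unsigned int alloc_blksz_shift = 12;	/* 4096-byte alloc block */
	unsigned int fs_shift = 3;		/* 4096 / 512 = 1 << 3 */
	unsigned int node_size_shift = 12;	/* 4 KiB btree nodes */

	uint64_t phys_size = (uint64_t)alloc_blocks << alloc_blksz_shift;
	unsigned int fs_blocks = alloc_blocks << fs_shift;
	unsigned int node_count = (unsigned int)(phys_size >> node_size_shift);

	printf("%llu bytes, %u fs blocks, %u nodes\n",
	       (unsigned long long)phys_size, fs_blocks, node_count);
	return 0;
}
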
@@ -235,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
235 tree->free_nodes--; 278 tree->free_nodes--;
236 mark_inode_dirty(tree->inode); 279 mark_inode_dirty(tree->inode);
237 hfs_bnode_put(node); 280 hfs_bnode_put(node);
238 return hfs_bnode_create(tree, idx); 281 return hfs_bnode_create(tree,
282 idx);
239 } 283 }
240 } 284 }
241 } 285 }
@@ -250,7 +294,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
250 kunmap(*pagep); 294 kunmap(*pagep);
251 nidx = node->next; 295 nidx = node->next;
252 if (!nidx) { 296 if (!nidx) {
253 printk(KERN_DEBUG "hfs: create new bmap node...\n"); 297 dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
254 next_node = hfs_bmap_new_bmap(node, idx); 298 next_node = hfs_bmap_new_bmap(node, idx);
255 } else 299 } else
256 next_node = hfs_bnode_find(tree, nidx); 300 next_node = hfs_bnode_find(tree, nidx);
@@ -292,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
292 hfs_bnode_put(node); 336 hfs_bnode_put(node);
293 if (!i) { 337 if (!i) {
294 /* panic */; 338 /* panic */;
295 printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); 339 printk(KERN_CRIT "hfs: unable to free bnode %u. "
340 "bmap not found!\n",
341 node->this);
296 return; 342 return;
297 } 343 }
298 node = hfs_bnode_find(tree, i); 344 node = hfs_bnode_find(tree, i);
@@ -300,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
300 return; 346 return;
301 if (node->type != HFS_NODE_MAP) { 347 if (node->type != HFS_NODE_MAP) {
302 /* panic */; 348 /* panic */;
303 printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); 349 printk(KERN_CRIT "hfs: invalid bmap found! "
350 "(%u,%d)\n",
351 node->this, node->type);
304 hfs_bnode_put(node); 352 hfs_bnode_put(node);
305 return; 353 return;
306 } 354 }
@@ -313,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
313 m = 1 << (~nidx & 7); 361 m = 1 << (~nidx & 7);
314 byte = data[off]; 362 byte = data[off];
315 if (!(byte & m)) { 363 if (!(byte & m)) {
316 printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); 364 printk(KERN_CRIT "hfs: trying to free free bnode "
365 "%u(%d)\n",
366 node->this, node->type);
317 kunmap(page); 367 kunmap(page);
318 hfs_bnode_put(node); 368 hfs_bnode_put(node);
319 return; 369 return;
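
A pattern running through this whole file (and the rest of the series) is the switch from HFSPLUS_SB(sb).field and HFSPLUS_I(inode).field to the -> form, which implies the accessor macros now return a pointer to the private info instead of dereferencing it in place. A user-space sketch of the container_of idiom such a helper typically uses; struct layout and names here are assumptions for the demo:

#include <stddef.h>
#include <stdio.h>

struct inode { unsigned long i_ino; };

struct hfsplus_inode_info {
	unsigned long linkid;
	struct inode vfs_inode;		/* VFS inode embedded in fs-private one */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
{
	return container_of(inode, struct hfsplus_inode_info, vfs_inode);
}

int main(void)
{
	struct hfsplus_inode_info info = { .linkid = 7 };

	printf("%lu\n", HFSPLUS_I(&info.vfs_inode)->linkid);	/* prints 7 */
	return 0;
}
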
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..b4ba1b319333 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
67 key->key_len = cpu_to_be16(6 + ustrlen); 67 key->key_len = cpu_to_be16(6 + ustrlen);
68} 68}
69 69
70static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) 70void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
71{ 71{
72 if (inode->i_flags & S_IMMUTABLE) 72 if (inode->i_flags & S_IMMUTABLE)
73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; 73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,25 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
77 perms->rootflags |= HFSPLUS_FLG_APPEND; 77 perms->rootflags |= HFSPLUS_FLG_APPEND;
78 else 78 else
79 perms->rootflags &= ~HFSPLUS_FLG_APPEND; 79 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
80 HFSPLUS_I(inode).rootflags = perms->rootflags; 80
81 HFSPLUS_I(inode).userflags = perms->userflags; 81 perms->userflags = HFSPLUS_I(inode)->userflags;
82 perms->mode = cpu_to_be16(inode->i_mode); 82 perms->mode = cpu_to_be16(inode->i_mode);
83 perms->owner = cpu_to_be32(inode->i_uid); 83 perms->owner = cpu_to_be32(inode->i_uid);
84 perms->group = cpu_to_be32(inode->i_gid); 84 perms->group = cpu_to_be32(inode->i_gid);
85
86 if (S_ISREG(inode->i_mode))
87 perms->dev = cpu_to_be32(inode->i_nlink);
88 else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
89 perms->dev = cpu_to_be32(inode->i_rdev);
90 else
91 perms->dev = 0;
85} 92}
86 93
87static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) 94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
95 u32 cnid, struct inode *inode)
88{ 96{
97 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
98
89 if (S_ISDIR(inode->i_mode)) { 99 if (S_ISDIR(inode->i_mode)) {
90 struct hfsplus_cat_folder *folder; 100 struct hfsplus_cat_folder *folder;
91 101
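
The hfsplus_cat_set_perms() hunk above makes the overloading of the on-disk dev field explicit: it carries the link count for regular files, the device number for block and character specials, and zero otherwise. The same dispatch expressed in user-space terms (cat_dev_field() is a hypothetical helper, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>

static uint32_t cat_dev_field(const struct stat *st)
{
	if (S_ISREG(st->st_mode))
		return (uint32_t)st->st_nlink;		/* link count */
	if (S_ISBLK(st->st_mode) || S_ISCHR(st->st_mode))
		return (uint32_t)st->st_rdev;		/* device number */
	return 0;
}

int main(void)
{
	struct stat st;

	if (stat("/dev/null", &st) == 0)
		printf("dev field: %u\n", cat_dev_field(&st));
	return 0;
}
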
@@ -93,13 +103,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
93 memset(folder, 0, sizeof(*folder)); 103 memset(folder, 0, sizeof(*folder));
94 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 104 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
95 folder->id = cpu_to_be32(inode->i_ino); 105 folder->id = cpu_to_be32(inode->i_ino);
96 HFSPLUS_I(inode).create_date = 106 HFSPLUS_I(inode)->create_date =
97 folder->create_date = 107 folder->create_date =
98 folder->content_mod_date = 108 folder->content_mod_date =
99 folder->attribute_mod_date = 109 folder->attribute_mod_date =
100 folder->access_date = hfsp_now2mt(); 110 folder->access_date = hfsp_now2mt();
101 hfsplus_set_perms(inode, &folder->permissions); 111 hfsplus_cat_set_perms(inode, &folder->permissions);
102 if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) 112 if (inode == sbi->hidden_dir)
103 /* invisible and namelocked */ 113 /* invisible and namelocked */
104 folder->user_info.frFlags = cpu_to_be16(0x5000); 114 folder->user_info.frFlags = cpu_to_be16(0x5000);
105 return sizeof(*folder); 115 return sizeof(*folder);
@@ -111,28 +121,40 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
111 file->type = cpu_to_be16(HFSPLUS_FILE); 121 file->type = cpu_to_be16(HFSPLUS_FILE);
112 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); 122 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
113 file->id = cpu_to_be32(cnid); 123 file->id = cpu_to_be32(cnid);
114 HFSPLUS_I(inode).create_date = 124 HFSPLUS_I(inode)->create_date =
115 file->create_date = 125 file->create_date =
116 file->content_mod_date = 126 file->content_mod_date =
117 file->attribute_mod_date = 127 file->attribute_mod_date =
118 file->access_date = hfsp_now2mt(); 128 file->access_date = hfsp_now2mt();
119 if (cnid == inode->i_ino) { 129 if (cnid == inode->i_ino) {
120 hfsplus_set_perms(inode, &file->permissions); 130 hfsplus_cat_set_perms(inode, &file->permissions);
121 if (S_ISLNK(inode->i_mode)) { 131 if (S_ISLNK(inode->i_mode)) {
122 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); 132 file->user_info.fdType =
123 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); 133 cpu_to_be32(HFSP_SYMLINK_TYPE);
134 file->user_info.fdCreator =
135 cpu_to_be32(HFSP_SYMLINK_CREATOR);
124 } else { 136 } else {
125 file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); 137 file->user_info.fdType =
126 file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); 138 cpu_to_be32(sbi->type);
139 file->user_info.fdCreator =
140 cpu_to_be32(sbi->creator);
127 } 141 }
128 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 142 if (HFSPLUS_FLG_IMMUTABLE &
129 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 143 (file->permissions.rootflags |
144 file->permissions.userflags))
145 file->flags |=
146 cpu_to_be16(HFSPLUS_FILE_LOCKED);
130 } else { 147 } else {
131 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); 148 file->user_info.fdType =
132 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); 149 cpu_to_be32(HFSP_HARDLINK_TYPE);
133 file->user_info.fdFlags = cpu_to_be16(0x100); 150 file->user_info.fdCreator =
134 file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; 151 cpu_to_be32(HFSP_HFSPLUS_CREATOR);
135 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); 152 file->user_info.fdFlags =
153 cpu_to_be16(0x100);
154 file->create_date =
155 HFSPLUS_I(sbi->hidden_dir)->create_date;
156 file->permissions.dev =
157 cpu_to_be32(HFSPLUS_I(inode)->linkid);
136 } 158 }
137 return sizeof(*file); 159 return sizeof(*file);
138 } 160 }
@@ -173,27 +195,30 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
173 return -EIO; 195 return -EIO;
174 } 196 }
175 197
176 hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), 198 hfsplus_cat_build_key_uni(fd->search_key,
177 &tmp.thread.nodeName); 199 be32_to_cpu(tmp.thread.parentID),
200 &tmp.thread.nodeName);
178 return hfs_brec_find(fd); 201 return hfs_brec_find(fd);
179} 202}
180 203
181int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) 204int hfsplus_create_cat(u32 cnid, struct inode *dir,
205 struct qstr *str, struct inode *inode)
182{ 206{
207 struct super_block *sb = dir->i_sb;
183 struct hfs_find_data fd; 208 struct hfs_find_data fd;
184 struct super_block *sb;
185 hfsplus_cat_entry entry; 209 hfsplus_cat_entry entry;
186 int entry_size; 210 int entry_size;
187 int err; 211 int err;
188 212
189 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); 213 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
190 sb = dir->i_sb; 214 str->name, cnid, inode->i_nlink);
191 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 215 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
192 216
193 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); 217 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
194 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? 218 entry_size = hfsplus_fill_cat_thread(sb, &entry,
219 S_ISDIR(inode->i_mode) ?
195 HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, 220 HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
196 dir->i_ino, str); 221 dir->i_ino, str);
197 err = hfs_brec_find(&fd); 222 err = hfs_brec_find(&fd);
198 if (err != -ENOENT) { 223 if (err != -ENOENT) {
199 if (!err) 224 if (!err)
@@ -219,7 +244,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
219 244
220 dir->i_size++; 245 dir->i_size++;
221 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 246 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
222 mark_inode_dirty(dir); 247 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
248
223 hfs_find_exit(&fd); 249 hfs_find_exit(&fd);
224 return 0; 250 return 0;
225 251
@@ -234,16 +260,16 @@ err2:
234 260
235int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) 261int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
236{ 262{
237 struct super_block *sb; 263 struct super_block *sb = dir->i_sb;
238 struct hfs_find_data fd; 264 struct hfs_find_data fd;
239 struct hfsplus_fork_raw fork; 265 struct hfsplus_fork_raw fork;
240 struct list_head *pos; 266 struct list_head *pos;
241 int err, off; 267 int err, off;
242 u16 type; 268 u16 type;
243 269
244 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); 270 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
245 sb = dir->i_sb; 271 str ? str->name : NULL, cnid);
246 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 272 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
247 273
248 if (!str) { 274 if (!str) {
249 int len; 275 int len;
@@ -253,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
253 if (err) 279 if (err)
254 goto out; 280 goto out;
255 281
256 off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName); 282 off = fd.entryoffset +
283 offsetof(struct hfsplus_cat_thread, nodeName);
257 fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); 284 fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
258 hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2); 285 hfs_bnode_read(fd.bnode,
286 &fd.search_key->cat.name.length, off, 2);
259 len = be16_to_cpu(fd.search_key->cat.name.length) * 2; 287 len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
260 hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len); 288 hfs_bnode_read(fd.bnode,
289 &fd.search_key->cat.name.unicode,
290 off + 2, len);
261 fd.search_key->key_len = cpu_to_be16(6 + len); 291 fd.search_key->key_len = cpu_to_be16(6 + len);
262 } else 292 } else
263 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); 293 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
@@ -274,12 +304,13 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
274 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); 304 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
275#endif 305#endif
276 306
277 off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork); 307 off = fd.entryoffset +
308 offsetof(struct hfsplus_cat_file, rsrc_fork);
278 hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); 309 hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
279 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); 310 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
280 } 311 }
281 312
282 list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { 313 list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
283 struct hfsplus_readdir_data *rd = 314 struct hfsplus_readdir_data *rd =
284 list_entry(pos, struct hfsplus_readdir_data, list); 315 list_entry(pos, struct hfsplus_readdir_data, list);
285 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) 316 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -301,7 +332,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
301 332
302 dir->i_size--; 333 dir->i_size--;
303 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 334 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
304 mark_inode_dirty(dir); 335 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
305out: 336out:
306 hfs_find_exit(&fd); 337 hfs_find_exit(&fd);
307 338
@@ -312,16 +343,16 @@ int hfsplus_rename_cat(u32 cnid,
312 struct inode *src_dir, struct qstr *src_name, 343 struct inode *src_dir, struct qstr *src_name,
313 struct inode *dst_dir, struct qstr *dst_name) 344 struct inode *dst_dir, struct qstr *dst_name)
314{ 345{
315 struct super_block *sb; 346 struct super_block *sb = src_dir->i_sb;
316 struct hfs_find_data src_fd, dst_fd; 347 struct hfs_find_data src_fd, dst_fd;
317 hfsplus_cat_entry entry; 348 hfsplus_cat_entry entry;
318 int entry_size, type; 349 int entry_size, type;
319 int err = 0; 350 int err = 0;
320 351
321 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, 352 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
353 cnid, src_dir->i_ino, src_name->name,
322 dst_dir->i_ino, dst_name->name); 354 dst_dir->i_ino, dst_name->name);
323 sb = src_dir->i_sb; 355 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
324 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
325 dst_fd = src_fd; 356 dst_fd = src_fd;
326 357
327 /* find the old dir entry and read the data */ 358 /* find the old dir entry and read the data */
@@ -347,7 +378,6 @@ int hfsplus_rename_cat(u32 cnid,
347 goto out; 378 goto out;
348 dst_dir->i_size++; 379 dst_dir->i_size++;
349 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; 380 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
350 mark_inode_dirty(dst_dir);
351 381
352 /* finally remove the old entry */ 382 /* finally remove the old entry */
353 hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); 383 hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
@@ -359,7 +389,6 @@ int hfsplus_rename_cat(u32 cnid,
359 goto out; 389 goto out;
360 src_dir->i_size--; 390 src_dir->i_size--;
361 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; 391 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
362 mark_inode_dirty(src_dir);
363 392
364 /* remove old thread entry */ 393 /* remove old thread entry */
365 hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); 394 hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
@@ -373,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid,
373 402
374 /* create new thread entry */ 403 /* create new thread entry */
375 hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); 404 hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
376 entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); 405 entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
406 dst_dir->i_ino, dst_name);
377 err = hfs_brec_find(&dst_fd); 407 err = hfs_brec_find(&dst_fd);
378 if (err != -ENOENT) { 408 if (err != -ENOENT) {
379 if (!err) 409 if (!err)
@@ -381,6 +411,9 @@ int hfsplus_rename_cat(u32 cnid,
381 goto out; 411 goto out;
382 } 412 }
383 err = hfs_brec_insert(&dst_fd, &entry, entry_size); 413 err = hfs_brec_insert(&dst_fd, &entry, entry_size);
414
415 hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
416 hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
384out: 417out:
385 hfs_bnode_put(dst_fd.bnode); 418 hfs_bnode_put(dst_fd.bnode);
386 hfs_find_exit(&src_fd); 419 hfs_find_exit(&src_fd);
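
Throughout catalog.c the plain mark_inode_dirty(dir) calls become hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY), i.e. the caller now records which on-disk structure changed before the generic dirtying runs. A toy model of that flag bookkeeping; the flag names mirror the patch, everything else is a stand-in:

#include <stdio.h>

enum {
	I_CAT_DIRTY	= 1 << 0,	/* catalog record needs rewrite */
	I_EXT_DIRTY	= 1 << 1,	/* cached extent record needs rewrite */
	I_ALLOC_DIRTY	= 1 << 2,	/* allocation info needs rewrite */
};

struct demo_inode { unsigned long dirty; };

/* Stand-in for hfsplus_mark_inode_dirty(): remember *what* is dirty,
 * then hand off to the generic VFS dirtying (stubbed as a printf). */
static void mark_inode_dirty_flag(struct demo_inode *inode, unsigned long flag)
{
	inode->dirty |= flag;
	printf("inode dirtied, flags now %#lx\n", inode->dirty);
}

int main(void)
{
	struct demo_inode dir = { 0 };

	mark_inode_dirty_flag(&dir, I_CAT_DIRTY);
	return 0;
}
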
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,9 +37,8 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39 39
40 dentry->d_op = &hfsplus_dentry_operations;
41 dentry->d_fsdata = NULL; 40 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 41 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 42 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
44again: 43again:
45 err = hfs_brec_read(&fd, &entry, sizeof(entry)); 44 err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -66,11 +65,17 @@ again:
66 goto fail; 65 goto fail;
67 } 66 }
68 cnid = be32_to_cpu(entry.file.id); 67 cnid = be32_to_cpu(entry.file.id);
69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && 68 if (entry.file.user_info.fdType ==
70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && 69 cpu_to_be32(HFSP_HARDLINK_TYPE) &&
71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || 70 entry.file.user_info.fdCreator ==
72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && 71 cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
73 HFSPLUS_SB(sb).hidden_dir) { 72 (entry.file.create_date ==
73 HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
74 create_date ||
75 entry.file.create_date ==
76 HFSPLUS_I(sb->s_root->d_inode)->
77 create_date) &&
78 HFSPLUS_SB(sb)->hidden_dir) {
74 struct qstr str; 79 struct qstr str;
75 char name[32]; 80 char name[32];
76 81
@@ -83,10 +88,13 @@ again:
83 linkid = 0; 88 linkid = 0;
84 } else { 89 } else {
85 dentry->d_fsdata = (void *)(unsigned long)cnid; 90 dentry->d_fsdata = (void *)(unsigned long)cnid;
86 linkid = be32_to_cpu(entry.file.permissions.dev); 91 linkid =
92 be32_to_cpu(entry.file.permissions.dev);
87 str.len = sprintf(name, "iNode%d", linkid); 93 str.len = sprintf(name, "iNode%d", linkid);
88 str.name = name; 94 str.name = name;
89 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); 95 hfsplus_cat_build_key(sb, fd.search_key,
96 HFSPLUS_SB(sb)->hidden_dir->i_ino,
97 &str);
90 goto again; 98 goto again;
91 } 99 }
92 } else if (!dentry->d_fsdata) 100 } else if (!dentry->d_fsdata)
@@ -101,7 +109,7 @@ again:
101 if (IS_ERR(inode)) 109 if (IS_ERR(inode))
102 return ERR_CAST(inode); 110 return ERR_CAST(inode);
103 if (S_ISREG(inode->i_mode)) 111 if (S_ISREG(inode->i_mode))
104 HFSPLUS_I(inode).dev = linkid; 112 HFSPLUS_I(inode)->linkid = linkid;
105out: 113out:
106 d_add(dentry, inode); 114 d_add(dentry, inode);
107 return NULL; 115 return NULL;
@@ -124,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
124 if (filp->f_pos >= inode->i_size) 132 if (filp->f_pos >= inode->i_size)
125 return 0; 133 return 0;
126 134
127 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 135 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
128 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); 136 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
129 err = hfs_brec_find(&fd); 137 err = hfs_brec_find(&fd);
130 if (err) 138 if (err)
@@ -138,7 +146,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
138 filp->f_pos++; 146 filp->f_pos++;
139 /* fall through */ 147 /* fall through */
140 case 1: 148 case 1:
141 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 149 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
150 fd.entrylength);
142 if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { 151 if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
143 printk(KERN_ERR "hfs: bad catalog folder thread\n"); 152 printk(KERN_ERR "hfs: bad catalog folder thread\n");
144 err = -EIO; 153 err = -EIO;
@@ -168,20 +177,23 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
168 err = -EIO; 177 err = -EIO;
169 goto out; 178 goto out;
170 } 179 }
171 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 180 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
181 fd.entrylength);
172 type = be16_to_cpu(entry.type); 182 type = be16_to_cpu(entry.type);
173 len = HFSPLUS_MAX_STRLEN; 183 len = HFSPLUS_MAX_STRLEN;
174 err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); 184 err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
175 if (err) 185 if (err)
176 goto out; 186 goto out;
177 if (type == HFSPLUS_FOLDER) { 187 if (type == HFSPLUS_FOLDER) {
178 if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { 188 if (fd.entrylength <
189 sizeof(struct hfsplus_cat_folder)) {
179 printk(KERN_ERR "hfs: small dir entry\n"); 190 printk(KERN_ERR "hfs: small dir entry\n");
180 err = -EIO; 191 err = -EIO;
181 goto out; 192 goto out;
182 } 193 }
183 if (HFSPLUS_SB(sb).hidden_dir && 194 if (HFSPLUS_SB(sb)->hidden_dir &&
184 HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) 195 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
196 be32_to_cpu(entry.folder.id))
185 goto next; 197 goto next;
186 if (filldir(dirent, strbuf, len, filp->f_pos, 198 if (filldir(dirent, strbuf, len, filp->f_pos,
187 be32_to_cpu(entry.folder.id), DT_DIR)) 199 be32_to_cpu(entry.folder.id), DT_DIR))
@@ -200,7 +212,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
200 err = -EIO; 212 err = -EIO;
201 goto out; 213 goto out;
202 } 214 }
203 next: 215next:
204 filp->f_pos++; 216 filp->f_pos++;
205 if (filp->f_pos >= inode->i_size) 217 if (filp->f_pos >= inode->i_size)
206 goto out; 218 goto out;
@@ -217,7 +229,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
217 } 229 }
218 filp->private_data = rd; 230 filp->private_data = rd;
219 rd->file = filp; 231 rd->file = filp;
220 list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); 232 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
221 } 233 }
222 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 234 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
223out: 235out:
@@ -229,38 +241,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
229{ 241{
230 struct hfsplus_readdir_data *rd = file->private_data; 242 struct hfsplus_readdir_data *rd = file->private_data;
231 if (rd) { 243 if (rd) {
244 mutex_lock(&inode->i_mutex);
232 list_del(&rd->list); 245 list_del(&rd->list);
246 mutex_unlock(&inode->i_mutex);
233 kfree(rd); 247 kfree(rd);
234 } 248 }
235 return 0; 249 return 0;
236} 250}
237 251
238static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
239 struct nameidata *nd)
240{
241 struct inode *inode;
242 int res;
243
244 inode = hfsplus_new_inode(dir->i_sb, mode);
245 if (!inode)
246 return -ENOSPC;
247
248 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
249 if (res) {
250 inode->i_nlink = 0;
251 hfsplus_delete_inode(inode);
252 iput(inode);
253 return res;
254 }
255 hfsplus_instantiate(dentry, inode, inode->i_ino);
256 mark_inode_dirty(inode);
257 return 0;
258}
259
260static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, 252static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
261 struct dentry *dst_dentry) 253 struct dentry *dst_dentry)
262{ 254{
263 struct super_block *sb = dst_dir->i_sb; 255 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
264 struct inode *inode = src_dentry->d_inode; 256 struct inode *inode = src_dentry->d_inode;
265 struct inode *src_dir = src_dentry->d_parent->d_inode; 257 struct inode *src_dir = src_dentry->d_parent->d_inode;
266 struct qstr str; 258 struct qstr str;
@@ -270,7 +262,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
270 262
271 if (HFSPLUS_IS_RSRC(inode)) 263 if (HFSPLUS_IS_RSRC(inode))
272 return -EPERM; 264 return -EPERM;
265 if (!S_ISREG(inode->i_mode))
266 return -EPERM;
273 267
268 mutex_lock(&sbi->vh_mutex);
274 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { 269 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
275 for (;;) { 270 for (;;) {
276 get_random_bytes(&id, sizeof(cnid)); 271 get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +274,42 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
279 str.len = sprintf(name, "iNode%d", id); 274 str.len = sprintf(name, "iNode%d", id);
280 res = hfsplus_rename_cat(inode->i_ino, 275 res = hfsplus_rename_cat(inode->i_ino,
281 src_dir, &src_dentry->d_name, 276 src_dir, &src_dentry->d_name,
282 HFSPLUS_SB(sb).hidden_dir, &str); 277 sbi->hidden_dir, &str);
283 if (!res) 278 if (!res)
284 break; 279 break;
285 if (res != -EEXIST) 280 if (res != -EEXIST)
286 return res; 281 goto out;
287 } 282 }
288 HFSPLUS_I(inode).dev = id; 283 HFSPLUS_I(inode)->linkid = id;
289 cnid = HFSPLUS_SB(sb).next_cnid++; 284 cnid = sbi->next_cnid++;
290 src_dentry->d_fsdata = (void *)(unsigned long)cnid; 285 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
291 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); 286 res = hfsplus_create_cat(cnid, src_dir,
287 &src_dentry->d_name, inode);
292 if (res) 288 if (res)
293 /* panic? */ 289 /* panic? */
294 return res; 290 goto out;
295 HFSPLUS_SB(sb).file_count++; 291 sbi->file_count++;
296 } 292 }
297 cnid = HFSPLUS_SB(sb).next_cnid++; 293 cnid = sbi->next_cnid++;
298 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); 294 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
299 if (res) 295 if (res)
300 return res; 296 goto out;
301 297
302 inc_nlink(inode); 298 inc_nlink(inode);
303 hfsplus_instantiate(dst_dentry, inode, cnid); 299 hfsplus_instantiate(dst_dentry, inode, cnid);
304 atomic_inc(&inode->i_count); 300 ihold(inode);
305 inode->i_ctime = CURRENT_TIME_SEC; 301 inode->i_ctime = CURRENT_TIME_SEC;
306 mark_inode_dirty(inode); 302 mark_inode_dirty(inode);
307 HFSPLUS_SB(sb).file_count++; 303 sbi->file_count++;
308 sb->s_dirt = 1; 304 dst_dir->i_sb->s_dirt = 1;
309 305out:
310 return 0; 306 mutex_unlock(&sbi->vh_mutex);
307 return res;
311} 308}
312 309
313static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) 310static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
314{ 311{
315 struct super_block *sb = dir->i_sb; 312 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
316 struct inode *inode = dentry->d_inode; 313 struct inode *inode = dentry->d_inode;
317 struct qstr str; 314 struct qstr str;
318 char name[32]; 315 char name[32];
@@ -322,21 +319,24 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
322 if (HFSPLUS_IS_RSRC(inode)) 319 if (HFSPLUS_IS_RSRC(inode))
323 return -EPERM; 320 return -EPERM;
324 321
322 mutex_lock(&sbi->vh_mutex);
325 cnid = (u32)(unsigned long)dentry->d_fsdata; 323 cnid = (u32)(unsigned long)dentry->d_fsdata;
326 if (inode->i_ino == cnid && 324 if (inode->i_ino == cnid &&
327 atomic_read(&HFSPLUS_I(inode).opencnt)) { 325 atomic_read(&HFSPLUS_I(inode)->opencnt)) {
328 str.name = name; 326 str.name = name;
329 str.len = sprintf(name, "temp%lu", inode->i_ino); 327 str.len = sprintf(name, "temp%lu", inode->i_ino);
330 res = hfsplus_rename_cat(inode->i_ino, 328 res = hfsplus_rename_cat(inode->i_ino,
331 dir, &dentry->d_name, 329 dir, &dentry->d_name,
332 HFSPLUS_SB(sb).hidden_dir, &str); 330 sbi->hidden_dir, &str);
333 if (!res) 331 if (!res) {
334 inode->i_flags |= S_DEAD; 332 inode->i_flags |= S_DEAD;
335 return res; 333 drop_nlink(inode);
334 }
335 goto out;
336 } 336 }
337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); 337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
338 if (res) 338 if (res)
339 return res; 339 goto out;
340 340
341 if (inode->i_nlink > 0) 341 if (inode->i_nlink > 0)
342 drop_nlink(inode); 342 drop_nlink(inode);
@@ -344,10 +344,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
344 clear_nlink(inode); 344 clear_nlink(inode);
345 if (!inode->i_nlink) { 345 if (!inode->i_nlink) {
346 if (inode->i_ino != cnid) { 346 if (inode->i_ino != cnid) {
347 HFSPLUS_SB(sb).file_count--; 347 sbi->file_count--;
348 if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { 348 if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
349 res = hfsplus_delete_cat(inode->i_ino, 349 res = hfsplus_delete_cat(inode->i_ino,
350 HFSPLUS_SB(sb).hidden_dir, 350 sbi->hidden_dir,
351 NULL); 351 NULL);
352 if (!res) 352 if (!res)
353 hfsplus_delete_inode(inode); 353 hfsplus_delete_inode(inode);
@@ -356,107 +356,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
356 } else 356 } else
357 hfsplus_delete_inode(inode); 357 hfsplus_delete_inode(inode);
358 } else 358 } else
359 HFSPLUS_SB(sb).file_count--; 359 sbi->file_count--;
360 inode->i_ctime = CURRENT_TIME_SEC; 360 inode->i_ctime = CURRENT_TIME_SEC;
361 mark_inode_dirty(inode); 361 mark_inode_dirty(inode);
362 362out:
363 mutex_unlock(&sbi->vh_mutex);
363 return res; 364 return res;
364} 365}
365 366
366static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
367{
368 struct inode *inode;
369 int res;
370
371 inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
372 if (!inode)
373 return -ENOSPC;
374
375 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
376 if (res) {
377 inode->i_nlink = 0;
378 hfsplus_delete_inode(inode);
379 iput(inode);
380 return res;
381 }
382 hfsplus_instantiate(dentry, inode, inode->i_ino);
383 mark_inode_dirty(inode);
384 return 0;
385}
386
387static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) 367static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
388{ 368{
389 struct inode *inode; 369 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
370 struct inode *inode = dentry->d_inode;
390 int res; 371 int res;
391 372
392 inode = dentry->d_inode;
393 if (inode->i_size != 2) 373 if (inode->i_size != 2)
394 return -ENOTEMPTY; 374 return -ENOTEMPTY;
375
376 mutex_lock(&sbi->vh_mutex);
395 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); 377 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
396 if (res) 378 if (res)
397 return res; 379 goto out;
398 clear_nlink(inode); 380 clear_nlink(inode);
399 inode->i_ctime = CURRENT_TIME_SEC; 381 inode->i_ctime = CURRENT_TIME_SEC;
400 hfsplus_delete_inode(inode); 382 hfsplus_delete_inode(inode);
401 mark_inode_dirty(inode); 383 mark_inode_dirty(inode);
402 return 0; 384out:
385 mutex_unlock(&sbi->vh_mutex);
386 return res;
403} 387}
404 388
405static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, 389static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
406 const char *symname) 390 const char *symname)
407{ 391{
408 struct super_block *sb; 392 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
409 struct inode *inode; 393 struct inode *inode;
410 int res; 394 int res = -ENOSPC;
411 395
412 sb = dir->i_sb; 396 mutex_lock(&sbi->vh_mutex);
413 inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); 397 inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
414 if (!inode) 398 if (!inode)
415 return -ENOSPC; 399 goto out;
416 400
417 res = page_symlink(inode, symname, strlen(symname) + 1); 401 res = page_symlink(inode, symname, strlen(symname) + 1);
418 if (res) { 402 if (res)
419 inode->i_nlink = 0; 403 goto out_err;
420 hfsplus_delete_inode(inode);
421 iput(inode);
422 return res;
423 }
424 404
425 mark_inode_dirty(inode);
426 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 405 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
406 if (res)
407 goto out_err;
427 408
428 if (!res) { 409 hfsplus_instantiate(dentry, inode, inode->i_ino);
429 hfsplus_instantiate(dentry, inode, inode->i_ino); 410 mark_inode_dirty(inode);
430 mark_inode_dirty(inode); 411 goto out;
431 }
432 412
413out_err:
414 inode->i_nlink = 0;
415 hfsplus_delete_inode(inode);
416 iput(inode);
417out:
418 mutex_unlock(&sbi->vh_mutex);
433 return res; 419 return res;
434} 420}
435 421
436static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, 422static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
437 int mode, dev_t rdev) 423 int mode, dev_t rdev)
438{ 424{
439 struct super_block *sb; 425 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
440 struct inode *inode; 426 struct inode *inode;
441 int res; 427 int res = -ENOSPC;
442 428
443 sb = dir->i_sb; 429 mutex_lock(&sbi->vh_mutex);
444 inode = hfsplus_new_inode(sb, mode); 430 inode = hfsplus_new_inode(dir->i_sb, mode);
445 if (!inode) 431 if (!inode)
446 return -ENOSPC; 432 goto out;
433
434 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
435 init_special_inode(inode, mode, rdev);
447 436
448 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 437 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
449 if (res) { 438 if (res) {
450 inode->i_nlink = 0; 439 inode->i_nlink = 0;
451 hfsplus_delete_inode(inode); 440 hfsplus_delete_inode(inode);
452 iput(inode); 441 iput(inode);
453 return res; 442 goto out;
454 } 443 }
455 init_special_inode(inode, mode, rdev); 444
456 hfsplus_instantiate(dentry, inode, inode->i_ino); 445 hfsplus_instantiate(dentry, inode, inode->i_ino);
457 mark_inode_dirty(inode); 446 mark_inode_dirty(inode);
447out:
448 mutex_unlock(&sbi->vh_mutex);
449 return res;
450}
458 451
459 return 0; 452static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
453 struct nameidata *nd)
454{
455 return hfsplus_mknod(dir, dentry, mode, 0);
456}
457
458static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
459{
460 return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
460} 461}
461 462
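
The create paths above (link, unlink, rmdir, symlink, mknod, and the create/mkdir wrappers) all converge on one shape: take sbi->vh_mutex, do every fallible step under it, and leave through a single out: label so the unlock cannot be skipped. A compilable sketch of that shape under pthreads; new_inode_step()/catalog_step() are placeholders for the real allocation and catalog calls:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t vh_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *new_inode_step(void)	/* stand-in for hfsplus_new_inode() */
{
	static int dummy;
	return &dummy;			/* NULL would mean out of space */
}

static int catalog_step(void)		/* stand-in for hfsplus_create_cat() */
{
	return 0;
}

static int create_like(void)
{
	int res = -ENOSPC;

	pthread_mutex_lock(&vh_mutex);
	if (!new_inode_step())
		goto out;		/* res is still -ENOSPC */
	res = catalog_step();
out:
	pthread_mutex_unlock(&vh_mutex);
	return res;
}

int main(void)
{
	printf("create: %d\n", create_like());
	return 0;
}
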
462static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, 463static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
466 467
467 /* Unlink destination if it already exists */ 468 /* Unlink destination if it already exists */
468 if (new_dentry->d_inode) { 469 if (new_dentry->d_inode) {
469 res = hfsplus_unlink(new_dir, new_dentry); 470 if (S_ISDIR(new_dentry->d_inode->i_mode))
471 res = hfsplus_rmdir(new_dir, new_dentry);
472 else
473 res = hfsplus_unlink(new_dir, new_dentry);
470 if (res) 474 if (res)
471 return res; 475 return res;
472 } 476 }
@@ -492,6 +496,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
492}; 496};
493 497
494const struct file_operations hfsplus_dir_operations = { 498const struct file_operations hfsplus_dir_operations = {
499 .fsync = hfsplus_file_fsync,
495 .read = generic_read_dir, 500 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 501 .readdir = hfsplus_readdir,
497 .unlocked_ioctl = hfsplus_ioctl, 502 .unlocked_ioctl = hfsplus_ioctl,
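
The hfsplus_rename() fix above matters because unlinking a directory target must go through the rmdir path, which enforces -ENOTEMPTY and the directory-specific link accounting. The dispatch, reduced to its essence (callbacks here are hypothetical stand-ins):

#include <stdio.h>
#include <sys/stat.h>

static int remove_target(mode_t mode,
			 int (*do_rmdir)(void), int (*do_unlink)(void))
{
	return S_ISDIR(mode) ? do_rmdir() : do_unlink();
}

static int fake_rmdir(void)  { puts("rmdir path");  return 0; }
static int fake_unlink(void) { puts("unlink path"); return 0; }

int main(void)
{
	remove_target(S_IFDIR, fake_rmdir, fake_unlink);	/* rmdir */
	remove_target(S_IFREG, fake_rmdir, fake_unlink);	/* unlink */
	return 0;
}
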
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,37 +83,60 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
83 return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); 83 return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
84} 84}
85 85
86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) 86static void __hfsplus_ext_write_extent(struct inode *inode,
87 struct hfs_find_data *fd)
87{ 88{
89 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
88 int res; 90 int res;
89 91
90 hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, 92 WARN_ON(!mutex_is_locked(&hip->extents_lock));
91 HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 93
94 hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
95 HFSPLUS_IS_RSRC(inode) ?
96 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
97
92 res = hfs_brec_find(fd); 98 res = hfs_brec_find(fd);
93 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { 99 if (hip->extent_state & HFSPLUS_EXT_NEW) {
94 if (res != -ENOENT) 100 if (res != -ENOENT)
95 return; 101 return;
96 hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); 102 hfs_brec_insert(fd, hip->cached_extents,
97 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 103 sizeof(hfsplus_extent_rec));
104 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
98 } else { 105 } else {
99 if (res) 106 if (res)
100 return; 107 return;
101 hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); 108 hfs_bnode_write(fd->bnode, hip->cached_extents,
102 HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; 109 fd->entryoffset, fd->entrylength);
110 hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
103 } 111 }
112
113 /*
114 * We can't just use hfsplus_mark_inode_dirty here, because we
115 * also get called from hfsplus_write_inode, which should not
 116 * redirty the inode. Instead the callers have to be careful
 117 * to explicitly mark the inode dirty, too.
118 */
119 set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
104} 120}
105 121
106void hfsplus_ext_write_extent(struct inode *inode) 122static void hfsplus_ext_write_extent_locked(struct inode *inode)
107{ 123{
108 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { 124 if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
109 struct hfs_find_data fd; 125 struct hfs_find_data fd;
110 126
111 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 127 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
112 __hfsplus_ext_write_extent(inode, &fd); 128 __hfsplus_ext_write_extent(inode, &fd);
113 hfs_find_exit(&fd); 129 hfs_find_exit(&fd);
114 } 130 }
115} 131}
116 132
133void hfsplus_ext_write_extent(struct inode *inode)
134{
135 mutex_lock(&HFSPLUS_I(inode)->extents_lock);
136 hfsplus_ext_write_extent_locked(inode);
137 mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
138}
139
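
The split into hfsplus_ext_write_extent() and a _locked variant above follows the usual kernel convention: the _locked function assumes the caller holds extents_lock (asserted via the new WARN_ON(!mutex_is_locked(...))), while the plain-named wrapper takes and releases the lock itself. A minimal pthread rendition of the same convention:

#include <pthread.h>

static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;

/* "_locked" suffix: caller already holds extents_lock. */
static void write_extent_locked(void)
{
	/* ... flush the cached extent record to the btree ... */
}

static void write_extent(void)
{
	pthread_mutex_lock(&extents_lock);
	write_extent_locked();
	pthread_mutex_unlock(&extents_lock);
}

int main(void)
{
	write_extent();
	return 0;
}
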
117static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, 140static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
118 struct hfsplus_extent *extent, 141 struct hfsplus_extent *extent,
119 u32 cnid, u32 block, u8 type) 142 u32 cnid, u32 block, u8 type)
@@ -130,39 +153,48 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
130 return -ENOENT; 153 return -ENOENT;
131 if (fd->entrylength != sizeof(hfsplus_extent_rec)) 154 if (fd->entrylength != sizeof(hfsplus_extent_rec))
132 return -EIO; 155 return -EIO;
133 hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec)); 156 hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
157 sizeof(hfsplus_extent_rec));
134 return 0; 158 return 0;
135} 159}
136 160
137static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) 161static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
162 struct inode *inode, u32 block)
138{ 163{
164 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
139 int res; 165 int res;
140 166
141 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) 167 WARN_ON(!mutex_is_locked(&hip->extents_lock));
168
169 if (hip->extent_state & HFSPLUS_EXT_DIRTY)
142 __hfsplus_ext_write_extent(inode, fd); 170 __hfsplus_ext_write_extent(inode, fd);
143 171
144 res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, 172 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
145 block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 173 block, HFSPLUS_IS_RSRC(inode) ?
174 HFSPLUS_TYPE_RSRC :
175 HFSPLUS_TYPE_DATA);
146 if (!res) { 176 if (!res) {
147 HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); 177 hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
148 HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); 178 hip->cached_blocks =
179 hfsplus_ext_block_count(hip->cached_extents);
149 } else { 180 } else {
150 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 181 hip->cached_start = hip->cached_blocks = 0;
151 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 182 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
152 } 183 }
153 return res; 184 return res;
154} 185}
155 186
156static int hfsplus_ext_read_extent(struct inode *inode, u32 block) 187static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
157{ 188{
189 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
158 struct hfs_find_data fd; 190 struct hfs_find_data fd;
159 int res; 191 int res;
160 192
161 if (block >= HFSPLUS_I(inode).cached_start && 193 if (block >= hip->cached_start &&
162 block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) 194 block < hip->cached_start + hip->cached_blocks)
163 return 0; 195 return 0;
164 196
165 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 197 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
166 res = __hfsplus_ext_cache_extent(&fd, inode, block); 198 res = __hfsplus_ext_cache_extent(&fd, inode, block);
167 hfs_find_exit(&fd); 199 hfs_find_exit(&fd);
168 return res; 200 return res;
@@ -172,21 +204,22 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 204int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 205 struct buffer_head *bh_result, int create)
174{ 206{
175 struct super_block *sb; 207 struct super_block *sb = inode->i_sb;
208 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
209 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 210 int res = -EIO;
177 u32 ablock, dblock, mask; 211 u32 ablock, dblock, mask;
212 int was_dirty = 0;
178 int shift; 213 int shift;
179 214
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 215 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 216 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 217 ablock = iblock >> sbi->fs_shift;
185 218
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 219 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 220 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 221 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 222 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 223 res = hfsplus_file_extend(inode);
191 if (res) 224 if (res)
192 return res; 225 return res;
@@ -194,36 +227,46 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 227 } else
195 create = 0; 228 create = 0;
196 229
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 230 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 231 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 232 goto done;
200 } 233 }
201 234
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 235 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 236 return -EIO;
204 237
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 238 mutex_lock(&hip->extents_lock);
239
240 /*
241 * hfsplus_ext_read_extent will write out a cached extent into
242 * the extents btree. In that case we may have to mark the inode
243 * dirty even for a pure read of an extent here.
244 */
245 was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
206 res = hfsplus_ext_read_extent(inode, ablock); 246 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 247 if (res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 248 mutex_unlock(&hip->extents_lock);
209 HFSPLUS_I(inode).cached_start);
210 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock);
212 return -EIO; 249 return -EIO;
213 } 250 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 251 dblock = hfsplus_ext_find_block(hip->cached_extents,
252 ablock - hip->cached_start);
253 mutex_unlock(&hip->extents_lock);
215 254
216done: 255done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 256 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 257 inode->i_ino, (long long)iblock, dblock);
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 258 mask = (1 << sbi->fs_shift) - 1;
259 map_bh(bh_result, sb,
260 (dblock << sbi->fs_shift) + sbi->blockoffset +
261 (iblock & mask));
220 if (create) { 262 if (create) {
221 set_buffer_new(bh_result); 263 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 264 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 265 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 266 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode);
226 } 267 }
268 if (create || was_dirty)
269 mark_inode_dirty(inode);
227 return 0; 270 return 0;
228} 271}
229 272
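
The was_dirty bookkeeping in hfsplus_get_block() above exists because hfsplus_ext_read_extent() may flush the cached extent as a side effect, clearing the in-core EXT_DIRTY state that write-out still depends on, so even a pure read must re-mark the inode dirty afterwards. A toy model of that consumed-state hazard (all names are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static bool ext_dirty = true;	/* a cached extent is pending write-out */

static void read_extent(void)
{
	if (ext_dirty)
		ext_dirty = false;	/* reading flushes the dirty cache */
}

int main(void)
{
	bool was_dirty = ext_dirty;

	read_extent();
	if (was_dirty)
		printf("re-mark inode dirty: flush consumed the flag\n");
	return 0;
}
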
@@ -306,7 +349,8 @@ found:
306 } 349 }
307} 350}
308 351
309int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type) 352int hfsplus_free_fork(struct super_block *sb, u32 cnid,
353 struct hfsplus_fork_raw *fork, int type)
310{ 354{
311 struct hfs_find_data fd; 355 struct hfs_find_data fd;
312 hfsplus_extent_rec ext_entry; 356 hfsplus_extent_rec ext_entry;
@@ -327,7 +371,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 371 if (total_blocks == blocks)
328 return 0; 372 return 0;
329 373
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 374 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 375 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 376 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 377 total_blocks, type);
@@ -348,29 +392,34 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 392int hfsplus_file_extend(struct inode *inode)
349{ 393{
350 struct super_block *sb = inode->i_sb; 394 struct super_block *sb = inode->i_sb;
395 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
396 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 397 u32 start, len, goal;
352 int res; 398 int res;
353 399
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 400 if (sbi->alloc_file->i_size * 8 <
355 // extend alloc file 401 sbi->total_blocks - sbi->free_blocks + 8) {
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 402 /* extend alloc file */
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n",
405 sbi->alloc_file->i_size * 8,
406 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 407 return -ENOSPC;
359 } 408 }
360 409
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 410 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 411 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 412 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 413 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 414 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 415 if (res)
367 goto out; 416 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 417 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 418 }
370 419
371 len = HFSPLUS_I(inode).clump_blocks; 420 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 421 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 422 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 423 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 424 if (start >= goal) {
376 res = -ENOSPC; 425 res = -ENOSPC;
@@ -379,56 +428,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 428 }
380 429
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 430 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 431
383 if (!HFSPLUS_I(inode).first_blocks) { 432 if (hip->alloc_blocks <= hip->first_blocks) {
433 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 434 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 435 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 436 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 437 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 438 res = 0;
389 } else { 439 } else {
390 /* try to append to extents in inode */ 440 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 441 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 442 hip->alloc_blocks,
393 start, len); 443 start, len);
394 if (res == -ENOSPC) 444 if (res == -ENOSPC)
395 goto insert_extent; 445 goto insert_extent;
396 } 446 }
397 if (!res) { 447 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 448 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 449 hip->first_blocks += len;
400 } 450 }
401 } else { 451 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 452 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 453 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 454 start, len);
406 if (!res) { 455 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 456 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 457 hip->extent_state |= HFSPLUS_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 458 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 459 } else if (res == -ENOSPC)
411 goto insert_extent; 460 goto insert_extent;
412 } 461 }
413out: 462out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 463 mutex_unlock(&hip->extents_lock);
415 if (!res) { 464 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 465 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 466 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
418 } 467 }
419 return res; 468 return res;
420 469
421insert_extent: 470insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 471 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 472 hfsplus_ext_write_extent_locked(inode);
424 473
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 474 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 475 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 476 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 477 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 478 hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 479 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 480 hip->cached_blocks = len;
432 481
433 res = 0; 482 res = 0;
434 goto out; 483 goto out;
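The -ENOSPC handled by the insert_extent label comes from hfsplus_add_extent(), defined earlier in extents.c and not shown in this hunk. A host-endian userspace paraphrase of its behaviour (the on-disk record stores eight big-endian runs; this is a sketch, not the kernel function):

	#include <errno.h>
	#include <stdint.h>

	struct run { uint32_t start_block, block_count; };

	/*
	 * Paraphrase of hfsplus_add_extent(): 'offset' is the block offset of
	 * the new allocation relative to the start of this 8-run record.  Grow
	 * the run the allocation is contiguous with, else take the next free
	 * slot; a full record forces the caller to spill into the extents
	 * btree (the insert_extent: path above).
	 */
	static int add_extent(struct run rec[8], uint32_t offset,
			      uint32_t start, uint32_t len)
	{
		int i;

		for (i = 0; i < 8; i++) {
			uint32_t count = rec[i].block_count;

			if (offset == count) {
				if (rec[i].start_block + count == start) {
					rec[i].block_count += len; /* contiguous */
					return 0;
				}
				if (i + 1 >= 8)
					return -ENOSPC;	/* record full */
				rec[i + 1].start_block = start;
				rec[i + 1].block_count = len;
				return 0;
			}
			if (offset < count)
				break;
			offset -= count;
		}
		return -EIO;	/* offset does not end on a run boundary */
	}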
@@ -437,13 +486,16 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 486void hfsplus_file_truncate(struct inode *inode)
438{ 487{
439 struct super_block *sb = inode->i_sb; 488 struct super_block *sb = inode->i_sb;
489 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 490 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 491 u32 alloc_cnt, blk_cnt, start;
442 int res; 492 int res;
443 493
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 494 dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 495 inode->i_ino, (long long)hip->phys_size,
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 496 inode->i_size);
497
498 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 500 struct page *page;
449 void *fsdata; 501 void *fsdata;
@@ -455,52 +507,55 @@ void hfsplus_file_truncate(struct inode *inode)
455 &page, &fsdata); 507 &page, &fsdata);
456 if (res) 508 if (res)
457 return; 509 return;
458 res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 510 res = pagecache_write_end(NULL, mapping, size,
511 0, 0, page, fsdata);
459 if (res < 0) 512 if (res < 0)
460 return; 513 return;
461 mark_inode_dirty(inode); 514 mark_inode_dirty(inode);
462 return; 515 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 516 } else if (inode->i_size == hip->phys_size)
464 return; 517 return;
465 518
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 519 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 520 HFSPLUS_SB(sb)->alloc_blksz_shift;
521 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 522 if (blk_cnt == alloc_cnt)
469 goto out; 523 goto out;
470 524
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 525 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 526 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 527 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 528 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 529 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 530 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 531 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 532 hip->first_blocks = blk_cnt;
479 break; 533 break;
480 } 534 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 535 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 536 if (res)
483 break; 537 break;
484 start = HFSPLUS_I(inode).cached_start; 538 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 539 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 540 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 541 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 542 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 543 hip->extent_state |= HFSPLUS_EXT_DIRTY;
490 break; 544 break;
491 } 545 }
492 alloc_cnt = start; 546 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 547 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 548 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
495 hfs_brec_remove(&fd); 549 hfs_brec_remove(&fd);
496 } 550 }
497 hfs_find_exit(&fd); 551 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 552 mutex_unlock(&hip->extents_lock);
499 553
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 554 hip->alloc_blocks = blk_cnt;
501out: 555out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 556 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 557 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 558 sb->s_blocksize_bits;
505 mark_inode_dirty(inode); 559 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
560 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
506} 561}
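The blk_cnt computation in the truncate path rounds i_size up to whole allocation blocks before the freeing loop walks the extent records backwards. A quick worked example, assuming 4096-byte allocation blocks (alloc_blksz_shift = 12):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long i_size = 10000;	/* bytes */
		unsigned int alloc_blksz = 4096;
		unsigned int alloc_blksz_shift = 12;

		/* same rounding as hfsplus_file_truncate() */
		unsigned long long blk_cnt =
			(i_size + alloc_blksz - 1) >> alloc_blksz_shift;

		printf("%llu bytes -> %llu allocation blocks\n", i_size, blk_cnt);
		/* 10000 -> 3: every block past the third is freed */
		return 0;
	}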
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..d6857523336d 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,13 +23,16 @@
23#define DBG_EXTENT 0x00000020 23#define DBG_EXTENT 0x00000020
24#define DBG_BITMAP 0x00000040 24#define DBG_BITMAP 0x00000040
25 25
26//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) 26#if 0
27//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) 27#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
28//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) 28#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
29#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
30#endif
29#define DBG_MASK (0) 31#define DBG_MASK (0)
30 32
31#define dprint(flg, fmt, args...) \ 33#define dprint(flg, fmt, args...) \
32 if (flg & DBG_MASK) printk(fmt , ## args) 34 if (flg & DBG_MASK) \
35 printk(fmt , ## args)
33 36
34/* Runtime config options */ 37/* Runtime config options */
35#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ 38#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */
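One caveat the reformatting above does not address: dprint() still expands to a bare if, so an else placed after a dprint() call would bind to the macro's if rather than the surrounding one. The conventional do-while(0) hardening, shown only as an editorial sketch and not as part of this patch:

	#define dprint(flg, fmt, args...)		\
	do {						\
		if ((flg) & DBG_MASK)			\
			printk(fmt , ## args);		\
	} while (0)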
@@ -37,7 +40,8 @@
37#define HFSPLUS_TYPE_DATA 0x00 40#define HFSPLUS_TYPE_DATA 0x00
38#define HFSPLUS_TYPE_RSRC 0xFF 41#define HFSPLUS_TYPE_RSRC 0xFF
39 42
40typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *); 43typedef int (*btree_keycmp)(const hfsplus_btree_key *,
44 const hfsplus_btree_key *);
41 45
42#define NODE_HASH_SIZE 256 46#define NODE_HASH_SIZE 256
43 47
@@ -61,8 +65,7 @@ struct hfs_btree {
61 unsigned int max_key_len; 65 unsigned int max_key_len;
62 unsigned int depth; 66 unsigned int depth;
63 67
64 //unsigned int map1_size, map_size; 68 struct mutex tree_lock;
65 struct semaphore tree_lock;
66 69
67 unsigned int pages_per_bnode; 70 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 71 spinlock_t hash_lock;
@@ -107,8 +110,8 @@ struct hfsplus_vh;
107struct hfs_btree; 110struct hfs_btree;
108 111
109struct hfsplus_sb_info { 112struct hfsplus_sb_info {
110 struct buffer_head *s_vhbh;
111 struct hfsplus_vh *s_vhdr; 113 struct hfsplus_vh *s_vhdr;
114 struct hfsplus_vh *s_backup_vhdr;
112 struct hfs_btree *ext_tree; 115 struct hfs_btree *ext_tree;
113 struct hfs_btree *cat_tree; 116 struct hfs_btree *cat_tree;
114 struct hfs_btree *attr_tree; 117 struct hfs_btree *attr_tree;
@@ -118,19 +121,25 @@ struct hfsplus_sb_info {
118 121
119 /* Runtime variables */ 122 /* Runtime variables */
120 u32 blockoffset; 123 u32 blockoffset;
121 u32 sect_count; 124 sector_t part_start;
125 sector_t sect_count;
122 int fs_shift; 126 int fs_shift;
123 127
124 /* Stuff in host order from Vol Header */ 128 /* immutable data from the volume header */
125 u32 alloc_blksz; 129 u32 alloc_blksz;
126 int alloc_blksz_shift; 130 int alloc_blksz_shift;
127 u32 total_blocks; 131 u32 total_blocks;
132 u32 data_clump_blocks, rsrc_clump_blocks;
133
134 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 135 u32 free_blocks;
129 u32 next_alloc; 136 struct mutex alloc_mutex;
137
138 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 139 u32 next_cnid;
131 u32 file_count; 140 u32 file_count;
132 u32 folder_count; 141 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 142 struct mutex vh_mutex;
134 143
135 /* Config options */ 144 /* Config options */
136 u32 creator; 145 u32 creator;
@@ -143,49 +152,92 @@ struct hfsplus_sb_info {
143 int part, session; 152 int part, session;
144 153
145 unsigned long flags; 154 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 155};
149 156
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 157#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 158#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 159#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 160#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 161#define HFSPLUS_SB_CASEFOLD 4
162#define HFSPLUS_SB_NOBARRIER 5
163
164static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
165{
166 return sb->s_fs_info;
167}
155 168
156 169
157struct hfsplus_inode_info { 170struct hfsplus_inode_info {
158 struct mutex extents_lock; 171 atomic_t opencnt;
159 u32 clump_blocks, alloc_blocks; 172
160 sector_t fs_blocks; 173 /*
161 /* Allocation extents from catalog record or volume header */ 174 * Extent allocation information, protected by extents_lock.
162 hfsplus_extent_rec first_extents; 175 */
163 u32 first_blocks; 176 u32 first_blocks;
177 u32 clump_blocks;
178 u32 alloc_blocks;
179 u32 cached_start;
180 u32 cached_blocks;
181 hfsplus_extent_rec first_extents;
164 hfsplus_extent_rec cached_extents; 182 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks; 183 unsigned int extent_state;
166 atomic_t opencnt; 184 struct mutex extents_lock;
167 185
186 /*
187 * Immutable data.
188 */
168 struct inode *rsrc_inode; 189 struct inode *rsrc_inode;
169 unsigned long flags;
170
171 __be32 create_date; 190 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 191
192 /*
193 * Protected by sbi->vh_mutex.
194 */
195 u32 linkid;
196
197 /*
198 * Accessed using atomic bitops.
199 */
200 unsigned long flags;
201
202 /*
203 * Protected by i_mutex.
204 */
205 sector_t fs_blocks;
206 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 207 struct list_head open_dir_list;
179 loff_t phys_size; 208 loff_t phys_size;
209
180 struct inode vfs_inode; 210 struct inode vfs_inode;
181}; 211};
182 212
183#define HFSPLUS_FLG_RSRC 0x0001 213#define HFSPLUS_EXT_DIRTY 0x0001
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 214#define HFSPLUS_EXT_NEW 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004
186 215
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 216#define HFSPLUS_I_RSRC 0 /* represents a resource fork */
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 217#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */
218#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */
219#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */
220
221#define HFSPLUS_IS_RSRC(inode) \
222 test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
223
224static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
225{
226 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
227}
228
229/*
230 * Mark an inode dirty, and also mark the btree in which the
231 * specific type of metadata is stored.
232 * For data or metadata that gets written back into the catalog btree
233 * by hfsplus_write_inode a plain mark_inode_dirty call is enough.
234 */
235static inline void hfsplus_mark_inode_dirty(struct inode *inode,
236 unsigned int flag)
237{
238 set_bit(flag, &HFSPLUS_I(inode)->flags);
239 mark_inode_dirty(inode);
240}
189 241
190struct hfs_find_data { 242struct hfs_find_data {
191 /* filled by caller */ 243 /* filled by caller */
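Note that the HFSPLUS_SB_* constants above change meaning, not just value: the old constants were masks OR-ed into sbi->flags, the new ones are bit numbers for the atomic bitops. A sketch of how call sites change under that convention (the function name is hypothetical; the types come from the header above):

	#include <linux/bitops.h>

	static void flags_example(struct hfsplus_sb_info *sbi)
	{
		/* old: mask arithmetic, non-atomic
		 *	sbi->flags |= HFSPLUS_SB_NODECOMPOSE;	(0x0002, a mask)
		 *	if (sbi->flags & HFSPLUS_SB_HFSX) ...
		 */

		/* new: bit indices with atomic bitops */
		set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);	/* bit 1 */
		if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags))	/* bit 3 */
			clear_bit(HFSPLUS_SB_CASEFOLD, &sbi->flags);
	}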
@@ -303,14 +355,18 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
303int hfs_brec_goto(struct hfs_find_data *, int); 355int hfs_brec_goto(struct hfs_find_data *, int);
304 356
305/* catalog.c */ 357/* catalog.c */
306int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 358int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
307int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 359 const hfsplus_btree_key *);
308void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *); 360int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
361 const hfsplus_btree_key *);
362void hfsplus_cat_build_key(struct super_block *sb,
363 hfsplus_btree_key *, u32, struct qstr *);
309int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); 364int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
310int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); 365int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 366int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 367int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 368 struct inode *, struct qstr *);
369void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 370
315/* dir.c */ 371/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 372extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -320,7 +376,8 @@ extern const struct file_operations hfsplus_dir_operations;
320int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 376int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
321void hfsplus_ext_write_extent(struct inode *); 377void hfsplus_ext_write_extent(struct inode *);
322int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); 378int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
323int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int); 379int hfsplus_free_fork(struct super_block *, u32,
380 struct hfsplus_fork_raw *, int);
324int hfsplus_file_extend(struct inode *); 381int hfsplus_file_extend(struct inode *);
325void hfsplus_file_truncate(struct inode *); 382void hfsplus_file_truncate(struct inode *);
326 383
@@ -335,6 +392,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
335int hfsplus_cat_write_inode(struct inode *); 392int hfsplus_cat_write_inode(struct inode *);
336struct inode *hfsplus_new_inode(struct super_block *, int); 393struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 394void hfsplus_delete_inode(struct inode *);
395int hfsplus_file_fsync(struct file *file, int datasync);
338 396
339/* ioctl.c */ 397/* ioctl.c */
340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 398long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
@@ -346,6 +404,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
346 404
347/* options.c */ 405/* options.c */
348int hfsplus_parse_options(char *, struct hfsplus_sb_info *); 406int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
407int hfsplus_parse_options_remount(char *input, int *force);
349void hfsplus_fill_defaults(struct hfsplus_sb_info *); 408void hfsplus_fill_defaults(struct hfsplus_sb_info *);
350int hfsplus_show_options(struct seq_file *, struct vfsmount *); 409int hfsplus_show_options(struct seq_file *, struct vfsmount *);
351 410
@@ -359,56 +418,26 @@ extern u16 hfsplus_decompose_table[];
359extern u16 hfsplus_compose_table[]; 418extern u16 hfsplus_compose_table[];
360 419
361/* unicode.c */ 420/* unicode.c */
362int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 421int hfsplus_strcasecmp(const struct hfsplus_unistr *,
363int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 422 const struct hfsplus_unistr *);
364int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); 423int hfsplus_strcmp(const struct hfsplus_unistr *,
365int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); 424 const struct hfsplus_unistr *);
366int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); 425int hfsplus_uni2asc(struct super_block *,
367int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); 426 const struct hfsplus_unistr *, char *, int *);
427int hfsplus_asc2uni(struct super_block *,
428 struct hfsplus_unistr *, const char *, int);
429int hfsplus_hash_dentry(const struct dentry *dentry,
430 const struct inode *inode, struct qstr *str);
431int hfsplus_compare_dentry(const struct dentry *parent,
432 const struct inode *pinode,
433 const struct dentry *dentry, const struct inode *inode,
434 unsigned int len, const char *str, const struct qstr *name);
368 435
369/* wrapper.c */ 436/* wrapper.c */
370int hfsplus_read_wrapper(struct super_block *); 437int hfsplus_read_wrapper(struct super_block *);
371
372int hfs_part_find(struct super_block *, sector_t *, sector_t *); 438int hfs_part_find(struct super_block *, sector_t *, sector_t *);
373 439int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
374/* access macros */ 440 void *data, int rw);
375/*
376static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
377{
378 return sb->s_fs_info;
379}
380static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
381{
382 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
383}
384*/
385#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
386#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
387
388#if 1
389#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
390#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
391#else
392#define hfsplus_kmap(p) kmap(p)
393#define hfsplus_kunmap(p) kunmap(p)
394#endif
395
396#define sb_bread512(sb, sec, data) ({ \
397 struct buffer_head *__bh; \
398 sector_t __block; \
399 loff_t __start; \
400 int __offset; \
401 \
402 __start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\
403 __block = __start >> (sb)->s_blocksize_bits; \
404 __offset = __start & ((sb)->s_blocksize - 1); \
405 __bh = sb_bread((sb), __block); \
406 if (likely(__bh != NULL)) \
407 data = (void *)(__bh->b_data + __offset);\
408 else \
409 data = NULL; \
410 __bh; \
411})
412 441
413/* time macros */ 442/* time macros */
414#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) 443#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U)
@@ -419,6 +448,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
419#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 448#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
420#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 449#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
421 450
422#define kdev_t_to_nr(x) (x)
423
424#endif 451#endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..927cdd6d5bf5 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -36,7 +36,8 @@
36#define HFSP_WRAPOFF_EMBEDSIG 0x7C 36#define HFSP_WRAPOFF_EMBEDSIG 0x7C
37#define HFSP_WRAPOFF_EMBEDEXT 0x7E 37#define HFSP_WRAPOFF_EMBEDEXT 0x7E
38 38
39#define HFSP_HIDDENDIR_NAME "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" 39#define HFSP_HIDDENDIR_NAME \
40 "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
40 41
41#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ 42#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */
42#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ 43#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */
@@ -200,6 +201,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 201 struct hfsplus_unistr name;
201} __packed; 202} __packed;
202 203
204#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 205
204/* Structs from hfs.h */ 206/* Structs from hfs.h */
205struct hfsp_point { 207struct hfsp_point {
@@ -323,7 +325,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 325 __be32 start_block;
324} __packed; 326} __packed;
325 327
326#define HFSPLUS_EXT_KEYLEN 12 328#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 329
328/* HFS+ generic BTree key */ 330/* HFS+ generic BTree key */
329typedef union { 331typedef union {
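Replacing the literal 12 with sizeof() is safe because hfsplus_ext_key is __packed, so its in-memory size equals the on-disk key length (2 + 1 + 1 + 4 + 4 bytes, assuming the usual key_len/fork_type/pad/cnid/start_block layout). A compile-time sanity check in the kernel idiom, placed in any function body; a sketch, not patch code:

	static void __init ext_key_size_check(void)
	{
		/* 2 (key_len) + 1 (fork_type) + 1 (pad) + 4 (cnid) + 4 (start_block) */
		BUILD_BUG_ON(sizeof(struct hfsplus_ext_key) != 12);
		BUILD_BUG_ON(HFSPLUS_EXT_KEYLEN != 12);
	}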
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..a8df651747f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -8,6 +8,7 @@
8 * Inode handling routines 8 * Inode handling routines
9 */ 9 */
10 10
11#include <linux/blkdev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
@@ -36,7 +37,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
36 *pagep = NULL; 37 *pagep = NULL;
37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38 hfsplus_get_block, 39 hfsplus_get_block,
39 &HFSPLUS_I(mapping->host).phys_size); 40 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) { 41 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size; 42 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize) 43 if (pos + len > isize)
@@ -62,13 +63,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
62 63
63 switch (inode->i_ino) { 64 switch (inode->i_ino) {
64 case HFSPLUS_EXT_CNID: 65 case HFSPLUS_EXT_CNID:
65 tree = HFSPLUS_SB(sb).ext_tree; 66 tree = HFSPLUS_SB(sb)->ext_tree;
66 break; 67 break;
67 case HFSPLUS_CAT_CNID: 68 case HFSPLUS_CAT_CNID:
68 tree = HFSPLUS_SB(sb).cat_tree; 69 tree = HFSPLUS_SB(sb)->cat_tree;
69 break; 70 break;
70 case HFSPLUS_ATTR_CNID: 71 case HFSPLUS_ATTR_CNID:
71 tree = HFSPLUS_SB(sb).attr_tree; 72 tree = HFSPLUS_SB(sb)->attr_tree;
72 break; 73 break;
73 default: 74 default:
74 BUG(); 75 BUG();
@@ -77,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
77 if (!tree) 78 if (!tree)
78 return 0; 79 return 0;
79 if (tree->node_size >= PAGE_CACHE_SIZE) { 80 if (tree->node_size >= PAGE_CACHE_SIZE) {
80 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); 81 nidx = page->index >>
82 (tree->node_size_shift - PAGE_CACHE_SHIFT);
81 spin_lock(&tree->hash_lock); 83 spin_lock(&tree->hash_lock);
82 node = hfs_bnode_findhash(tree, nidx); 84 node = hfs_bnode_findhash(tree, nidx);
83 if (!node) 85 if (!node)
@@ -90,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
90 } 92 }
91 spin_unlock(&tree->hash_lock); 93 spin_unlock(&tree->hash_lock);
92 } else { 94 } else {
93 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); 95 nidx = page->index <<
96 (PAGE_CACHE_SHIFT - tree->node_size_shift);
94 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
95 spin_lock(&tree->hash_lock); 98 spin_lock(&tree->hash_lock);
96 do { 99 do {
@@ -166,18 +169,19 @@ const struct dentry_operations hfsplus_dentry_operations = {
166 .d_compare = hfsplus_compare_dentry, 169 .d_compare = hfsplus_compare_dentry,
167}; 170};
168 171
169static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, 172static struct dentry *hfsplus_file_lookup(struct inode *dir,
170 struct nameidata *nd) 173 struct dentry *dentry, struct nameidata *nd)
171{ 174{
172 struct hfs_find_data fd; 175 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 176 struct super_block *sb = dir->i_sb;
174 struct inode *inode = NULL; 177 struct inode *inode = NULL;
178 struct hfsplus_inode_info *hip;
175 int err; 179 int err;
176 180
177 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 181 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
178 goto out; 182 goto out;
179 183
180 inode = HFSPLUS_I(dir).rsrc_inode; 184 inode = HFSPLUS_I(dir)->rsrc_inode;
181 if (inode) 185 if (inode)
182 goto out; 186 goto out;
183 187
@@ -185,12 +189,15 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
185 if (!inode) 189 if (!inode)
186 return ERR_PTR(-ENOMEM); 190 return ERR_PTR(-ENOMEM);
187 191
192 hip = HFSPLUS_I(inode);
188 inode->i_ino = dir->i_ino; 193 inode->i_ino = dir->i_ino;
189 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 194 INIT_LIST_HEAD(&hip->open_dir_list);
190 mutex_init(&HFSPLUS_I(inode).extents_lock); 195 mutex_init(&hip->extents_lock);
191 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 196 hip->extent_state = 0;
197 hip->flags = 0;
198 set_bit(HFSPLUS_I_RSRC, &hip->flags);
192 199
193 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 200 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
194 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 201 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
195 if (!err) 202 if (!err)
196 err = hfsplus_cat_read_inode(inode, &fd); 203 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,42 +206,48 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
199 iput(inode); 206 iput(inode);
200 return ERR_PTR(err); 207 return ERR_PTR(err);
201 } 208 }
202 HFSPLUS_I(inode).rsrc_inode = dir; 209 hip->rsrc_inode = dir;
203 HFSPLUS_I(dir).rsrc_inode = inode; 210 HFSPLUS_I(dir)->rsrc_inode = inode;
204 igrab(dir); 211 igrab(dir);
205 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 212
213 /*
214 * __mark_inode_dirty expects inodes to be hashed. Since we don't
215 * want resource fork inodes in the regular inode space, we make them
216 * appear hashed, but do not put them on any lists. hlist_del()
217 * will work fine and require no locking.
218 */
219 hlist_add_fake(&inode->i_hash);
220
206 mark_inode_dirty(inode); 221 mark_inode_dirty(inode);
207out: 222out:
208 d_add(dentry, inode); 223 d_add(dentry, inode);
209 return NULL; 224 return NULL;
210} 225}
211 226
212static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 227static void hfsplus_get_perms(struct inode *inode,
228 struct hfsplus_perm *perms, int dir)
213{ 229{
214 struct super_block *sb = inode->i_sb; 230 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
215 u16 mode; 231 u16 mode;
216 232
217 mode = be16_to_cpu(perms->mode); 233 mode = be16_to_cpu(perms->mode);
218 234
219 inode->i_uid = be32_to_cpu(perms->owner); 235 inode->i_uid = be32_to_cpu(perms->owner);
220 if (!inode->i_uid && !mode) 236 if (!inode->i_uid && !mode)
221 inode->i_uid = HFSPLUS_SB(sb).uid; 237 inode->i_uid = sbi->uid;
222 238
223 inode->i_gid = be32_to_cpu(perms->group); 239 inode->i_gid = be32_to_cpu(perms->group);
224 if (!inode->i_gid && !mode) 240 if (!inode->i_gid && !mode)
225 inode->i_gid = HFSPLUS_SB(sb).gid; 241 inode->i_gid = sbi->gid;
226 242
227 if (dir) { 243 if (dir) {
228 mode = mode ? (mode & S_IALLUGO) : 244 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
229 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
230 mode |= S_IFDIR; 245 mode |= S_IFDIR;
231 } else if (!mode) 246 } else if (!mode)
232 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 247 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
233 ~(HFSPLUS_SB(sb).umask));
234 inode->i_mode = mode; 248 inode->i_mode = mode;
235 249
236 HFSPLUS_I(inode).rootflags = perms->rootflags; 250 HFSPLUS_I(inode)->userflags = perms->userflags;
237 HFSPLUS_I(inode).userflags = perms->userflags;
238 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 251 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
239 inode->i_flags |= S_IMMUTABLE; 252 inode->i_flags |= S_IMMUTABLE;
240 else 253 else
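The hlist_add_fake() trick in the lookup hunk above works because __mark_inode_dirty() only asks whether the inode is hashed, i.e. whether i_hash.pprev is non-NULL. The helper, from <linux/list.h>, reproduced here for reference:

	/* node claims to be hashed but sits on no shared list */
	static inline void hlist_add_fake(struct hlist_node *n)
	{
		n->pprev = &n->next;
	}

A later hlist_del() then unlinks through the self-referential pprev without touching any shared list head, which is why no locking is needed.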
@@ -245,30 +258,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
245 inode->i_flags &= ~S_APPEND; 258 inode->i_flags &= ~S_APPEND;
246} 259}
247 260
248static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
249{
250 if (inode->i_flags & S_IMMUTABLE)
251 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
252 else
253 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
254 if (inode->i_flags & S_APPEND)
255 perms->rootflags |= HFSPLUS_FLG_APPEND;
256 else
257 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
258 perms->userflags = HFSPLUS_I(inode).userflags;
259 perms->mode = cpu_to_be16(inode->i_mode);
260 perms->owner = cpu_to_be32(inode->i_uid);
261 perms->group = cpu_to_be32(inode->i_gid);
262 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
263}
264
265static int hfsplus_file_open(struct inode *inode, struct file *file) 261static int hfsplus_file_open(struct inode *inode, struct file *file)
266{ 262{
267 if (HFSPLUS_IS_RSRC(inode)) 263 if (HFSPLUS_IS_RSRC(inode))
268 inode = HFSPLUS_I(inode).rsrc_inode; 264 inode = HFSPLUS_I(inode)->rsrc_inode;
269 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 265 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
270 return -EOVERFLOW; 266 return -EOVERFLOW;
271 atomic_inc(&HFSPLUS_I(inode).opencnt); 267 atomic_inc(&HFSPLUS_I(inode)->opencnt);
272 return 0; 268 return 0;
273} 269}
274 270
@@ -277,12 +273,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
277 struct super_block *sb = inode->i_sb; 273 struct super_block *sb = inode->i_sb;
278 274
279 if (HFSPLUS_IS_RSRC(inode)) 275 if (HFSPLUS_IS_RSRC(inode))
280 inode = HFSPLUS_I(inode).rsrc_inode; 276 inode = HFSPLUS_I(inode)->rsrc_inode;
281 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 277 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
282 mutex_lock(&inode->i_mutex); 278 mutex_lock(&inode->i_mutex);
283 hfsplus_file_truncate(inode); 279 hfsplus_file_truncate(inode);
284 if (inode->i_flags & S_DEAD) { 280 if (inode->i_flags & S_DEAD) {
285 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 281 hfsplus_delete_cat(inode->i_ino,
282 HFSPLUS_SB(sb)->hidden_dir, NULL);
286 hfsplus_delete_inode(inode); 283 hfsplus_delete_inode(inode);
287 } 284 }
288 mutex_unlock(&inode->i_mutex); 285 mutex_unlock(&inode->i_mutex);
@@ -311,29 +308,41 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
311 return 0; 308 return 0;
312} 309}
313 310
314static int hfsplus_file_fsync(struct file *filp, int datasync) 311int hfsplus_file_fsync(struct file *file, int datasync)
315{ 312{
316 struct inode *inode = filp->f_mapping->host; 313 struct inode *inode = file->f_mapping->host;
317 struct super_block * sb; 314 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
318 int ret, err; 315 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
319 316 int error = 0, error2;
320 /* sync the inode to buffers */ 317
321 ret = write_inode_now(inode, 0); 318 /*
322 319 * Sync inode metadata into the catalog and extent trees.
323 /* sync the superblock to buffers */ 320 */
324 sb = inode->i_sb; 321 sync_inode_metadata(inode, 1);
325 if (sb->s_dirt) { 322
326 if (!(sb->s_flags & MS_RDONLY)) 323 /*
327 hfsplus_sync_fs(sb, 1); 324 * And explicitly write out the btrees.
328 else 325 */
329 sb->s_dirt = 0; 326 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
327 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
328
329 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
330 error2 =
331 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
332 if (!error)
333 error = error2;
330 } 334 }
331 335
332 /* .. finally sync the buffers to disk */ 336 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
333 err = sync_blockdev(sb->s_bdev); 337 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
334 if (!ret) 338 if (!error)
335 ret = err; 339 error = error2;
336 return ret; 340 }
341
342 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
343 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
344
345 return error;
337} 346}
338 347
339static const struct inode_operations hfsplus_file_inode_operations = { 348static const struct inode_operations hfsplus_file_inode_operations = {
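Two idioms in the new fsync are worth calling out: the error/error2 dance preserves the first failure while still flushing every dirty tree, and the final cache flush is skipped under the new nobarrier mount option (see the options.c hunk further down). A condensed sketch of the pattern, with the btree mappings passed in as parameters for illustration:

	static int fsync_sketch(struct hfsplus_inode_info *hip,
				struct hfsplus_sb_info *sbi,
				struct super_block *sb,
				struct address_space *cat_map,
				struct address_space *ext_map)
	{
		int error = 0, error2;

		if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
			error = filemap_write_and_wait(cat_map);
		if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
			error2 = filemap_write_and_wait(ext_map);
			if (!error)
				error = error2;	/* keep the first failure */
		}
		/* "nobarrier" trades power-fail safety for fewer cache flushes */
		if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
			blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
		return error;
	}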
@@ -346,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
346}; 355};
347 356
348static const struct file_operations hfsplus_file_operations = { 357static const struct file_operations hfsplus_file_operations = {
349 .llseek = generic_file_llseek, 358 .llseek = generic_file_llseek,
350 .read = do_sync_read, 359 .read = do_sync_read,
351 .aio_read = generic_file_aio_read, 360 .aio_read = generic_file_aio_read,
352 .write = do_sync_write, 361 .write = do_sync_write,
@@ -361,47 +370,53 @@ static const struct file_operations hfsplus_file_operations = {
361 370
362struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 371struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
363{ 372{
373 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
364 struct inode *inode = new_inode(sb); 374 struct inode *inode = new_inode(sb);
375 struct hfsplus_inode_info *hip;
376
365 if (!inode) 377 if (!inode)
366 return NULL; 378 return NULL;
367 379
368 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 380 inode->i_ino = sbi->next_cnid++;
369 inode->i_mode = mode; 381 inode->i_mode = mode;
370 inode->i_uid = current_fsuid(); 382 inode->i_uid = current_fsuid();
371 inode->i_gid = current_fsgid(); 383 inode->i_gid = current_fsgid();
372 inode->i_nlink = 1; 384 inode->i_nlink = 1;
373 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 385 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
374 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 386
375 mutex_init(&HFSPLUS_I(inode).extents_lock); 387 hip = HFSPLUS_I(inode);
376 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 388 INIT_LIST_HEAD(&hip->open_dir_list);
377 HFSPLUS_I(inode).flags = 0; 389 mutex_init(&hip->extents_lock);
378 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 390 atomic_set(&hip->opencnt, 0);
379 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 391 hip->extent_state = 0;
380 HFSPLUS_I(inode).alloc_blocks = 0; 392 hip->flags = 0;
381 HFSPLUS_I(inode).first_blocks = 0; 393 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
382 HFSPLUS_I(inode).cached_start = 0; 394 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
383 HFSPLUS_I(inode).cached_blocks = 0; 395 hip->alloc_blocks = 0;
384 HFSPLUS_I(inode).phys_size = 0; 396 hip->first_blocks = 0;
385 HFSPLUS_I(inode).fs_blocks = 0; 397 hip->cached_start = 0;
386 HFSPLUS_I(inode).rsrc_inode = NULL; 398 hip->cached_blocks = 0;
399 hip->phys_size = 0;
400 hip->fs_blocks = 0;
401 hip->rsrc_inode = NULL;
387 if (S_ISDIR(inode->i_mode)) { 402 if (S_ISDIR(inode->i_mode)) {
388 inode->i_size = 2; 403 inode->i_size = 2;
389 HFSPLUS_SB(sb).folder_count++; 404 sbi->folder_count++;
390 inode->i_op = &hfsplus_dir_inode_operations; 405 inode->i_op = &hfsplus_dir_inode_operations;
391 inode->i_fop = &hfsplus_dir_operations; 406 inode->i_fop = &hfsplus_dir_operations;
392 } else if (S_ISREG(inode->i_mode)) { 407 } else if (S_ISREG(inode->i_mode)) {
393 HFSPLUS_SB(sb).file_count++; 408 sbi->file_count++;
394 inode->i_op = &hfsplus_file_inode_operations; 409 inode->i_op = &hfsplus_file_inode_operations;
395 inode->i_fop = &hfsplus_file_operations; 410 inode->i_fop = &hfsplus_file_operations;
396 inode->i_mapping->a_ops = &hfsplus_aops; 411 inode->i_mapping->a_ops = &hfsplus_aops;
397 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 412 hip->clump_blocks = sbi->data_clump_blocks;
398 } else if (S_ISLNK(inode->i_mode)) { 413 } else if (S_ISLNK(inode->i_mode)) {
399 HFSPLUS_SB(sb).file_count++; 414 sbi->file_count++;
400 inode->i_op = &page_symlink_inode_operations; 415 inode->i_op = &page_symlink_inode_operations;
401 inode->i_mapping->a_ops = &hfsplus_aops; 416 inode->i_mapping->a_ops = &hfsplus_aops;
402 HFSPLUS_I(inode).clump_blocks = 1; 417 hip->clump_blocks = 1;
403 } else 418 } else
404 HFSPLUS_SB(sb).file_count++; 419 sbi->file_count++;
405 insert_inode_hash(inode); 420 insert_inode_hash(inode);
406 mark_inode_dirty(inode); 421 mark_inode_dirty(inode);
407 sb->s_dirt = 1; 422 sb->s_dirt = 1;
@@ -414,11 +429,11 @@ void hfsplus_delete_inode(struct inode *inode)
414 struct super_block *sb = inode->i_sb; 429 struct super_block *sb = inode->i_sb;
415 430
416 if (S_ISDIR(inode->i_mode)) { 431 if (S_ISDIR(inode->i_mode)) {
417 HFSPLUS_SB(sb).folder_count--; 432 HFSPLUS_SB(sb)->folder_count--;
418 sb->s_dirt = 1; 433 sb->s_dirt = 1;
419 return; 434 return;
420 } 435 }
421 HFSPLUS_SB(sb).file_count--; 436 HFSPLUS_SB(sb)->file_count--;
422 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
423 if (!inode->i_nlink) { 438 if (!inode->i_nlink) {
424 inode->i_size = 0; 439 inode->i_size = 0;
@@ -434,34 +449,40 @@ void hfsplus_delete_inode(struct inode *inode)
434void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 449void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
435{ 450{
436 struct super_block *sb = inode->i_sb; 451 struct super_block *sb = inode->i_sb;
452 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
453 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
437 u32 count; 454 u32 count;
438 int i; 455 int i;
439 456
440 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 457 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
441 sizeof(hfsplus_extent_rec));
442 for (count = 0, i = 0; i < 8; i++) 458 for (count = 0, i = 0; i < 8; i++)
443 count += be32_to_cpu(fork->extents[i].block_count); 459 count += be32_to_cpu(fork->extents[i].block_count);
444 HFSPLUS_I(inode).first_blocks = count; 460 hip->first_blocks = count;
445 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 461 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
446 HFSPLUS_I(inode).cached_start = 0; 462 hip->cached_start = 0;
447 HFSPLUS_I(inode).cached_blocks = 0; 463 hip->cached_blocks = 0;
448 464
449 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 465 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
450 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 466 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
451 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 467 hip->fs_blocks =
452 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 468 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
453 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 469 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
454 if (!HFSPLUS_I(inode).clump_blocks) 470 hip->clump_blocks =
455 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 471 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
456 HFSPLUS_SB(sb).data_clump_blocks; 472 if (!hip->clump_blocks) {
473 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
474 sbi->rsrc_clump_blocks :
475 sbi->data_clump_blocks;
476 }
457} 477}
458 478
459void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 479void hfsplus_inode_write_fork(struct inode *inode,
480 struct hfsplus_fork_raw *fork)
460{ 481{
461 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 482 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
462 sizeof(hfsplus_extent_rec)); 483 sizeof(hfsplus_extent_rec));
463 fork->total_size = cpu_to_be64(inode->i_size); 484 fork->total_size = cpu_to_be64(inode->i_size);
464 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 485 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
465} 486}
466 487
467int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 488int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
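The clump conversion in hfsplus_inode_read_fork() turns the fork's per-file clump size (bytes) into allocation blocks, falling back to the volume-wide data or resource default when the catalog records zero. A worked example with a 65536-byte clump on a volume with 4096-byte allocation blocks:

	#include <stdio.h>

	int main(void)
	{
		unsigned int clump_size = 65536;	/* from the fork, in bytes */
		unsigned int alloc_blksz_shift = 12;	/* 4096-byte blocks */
		unsigned int clump_blocks = clump_size >> alloc_blksz_shift;

		if (!clump_blocks)
			clump_blocks = 4;	/* stand-in for the sbi defaults */
		printf("extend in chunks of %u blocks\n", clump_blocks);  /* 16 */
		return 0;
	}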
@@ -472,7 +493,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
472 493
473 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 494 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
474 495
475 HFSPLUS_I(inode).dev = 0; 496 HFSPLUS_I(inode)->linkid = 0;
476 if (type == HFSPLUS_FOLDER) { 497 if (type == HFSPLUS_FOLDER) {
477 struct hfsplus_cat_folder *folder = &entry.folder; 498 struct hfsplus_cat_folder *folder = &entry.folder;
478 499
@@ -486,8 +507,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
486 inode->i_atime = hfsp_mt2ut(folder->access_date); 507 inode->i_atime = hfsp_mt2ut(folder->access_date);
487 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 508 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
488 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 509 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
489 HFSPLUS_I(inode).create_date = folder->create_date; 510 HFSPLUS_I(inode)->create_date = folder->create_date;
490 HFSPLUS_I(inode).fs_blocks = 0; 511 HFSPLUS_I(inode)->fs_blocks = 0;
491 inode->i_op = &hfsplus_dir_inode_operations; 512 inode->i_op = &hfsplus_dir_inode_operations;
492 inode->i_fop = &hfsplus_dir_operations; 513 inode->i_fop = &hfsplus_dir_operations;
493 } else if (type == HFSPLUS_FILE) { 514 } else if (type == HFSPLUS_FILE) {
@@ -498,13 +519,14 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
498 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 519 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
499 sizeof(struct hfsplus_cat_file)); 520 sizeof(struct hfsplus_cat_file));
500 521
501 hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ? 522 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
502 &file->data_fork : &file->rsrc_fork); 523 &file->rsrc_fork : &file->data_fork);
503 hfsplus_get_perms(inode, &file->permissions, 0); 524 hfsplus_get_perms(inode, &file->permissions, 0);
504 inode->i_nlink = 1; 525 inode->i_nlink = 1;
505 if (S_ISREG(inode->i_mode)) { 526 if (S_ISREG(inode->i_mode)) {
506 if (file->permissions.dev) 527 if (file->permissions.dev)
507 inode->i_nlink = be32_to_cpu(file->permissions.dev); 528 inode->i_nlink =
529 be32_to_cpu(file->permissions.dev);
508 inode->i_op = &hfsplus_file_inode_operations; 530 inode->i_op = &hfsplus_file_inode_operations;
509 inode->i_fop = &hfsplus_file_operations; 531 inode->i_fop = &hfsplus_file_operations;
510 inode->i_mapping->a_ops = &hfsplus_aops; 532 inode->i_mapping->a_ops = &hfsplus_aops;
@@ -518,7 +540,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
518 inode->i_atime = hfsp_mt2ut(file->access_date); 540 inode->i_atime = hfsp_mt2ut(file->access_date);
519 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 541 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
520 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 542 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
521 HFSPLUS_I(inode).create_date = file->create_date; 543 HFSPLUS_I(inode)->create_date = file->create_date;
522 } else { 544 } else {
523 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 545 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
524 res = -EIO; 546 res = -EIO;
@@ -533,12 +555,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
533 hfsplus_cat_entry entry; 555 hfsplus_cat_entry entry;
534 556
535 if (HFSPLUS_IS_RSRC(inode)) 557 if (HFSPLUS_IS_RSRC(inode))
536 main_inode = HFSPLUS_I(inode).rsrc_inode; 558 main_inode = HFSPLUS_I(inode)->rsrc_inode;
537 559
538 if (!main_inode->i_nlink) 560 if (!main_inode->i_nlink)
539 return 0; 561 return 0;
540 562
541 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 563 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
542 /* panic? */ 564 /* panic? */
543 return -EIO; 565 return -EIO;
544 566
@@ -554,7 +576,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
554 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
555 sizeof(struct hfsplus_cat_folder)); 577 sizeof(struct hfsplus_cat_folder));
556 /* simple node checks? */ 578 /* simple node checks? */
557 hfsplus_set_perms(inode, &folder->permissions); 579 hfsplus_cat_set_perms(inode, &folder->permissions);
558 folder->access_date = hfsp_ut2mt(inode->i_atime); 580 folder->access_date = hfsp_ut2mt(inode->i_atime);
559 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 581 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
560 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 582 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,12 +598,10 @@ int hfsplus_cat_write_inode(struct inode *inode)
576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 598 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
577 sizeof(struct hfsplus_cat_file)); 599 sizeof(struct hfsplus_cat_file));
578 hfsplus_inode_write_fork(inode, &file->data_fork); 600 hfsplus_inode_write_fork(inode, &file->data_fork);
579 if (S_ISREG(inode->i_mode)) 601 hfsplus_cat_set_perms(inode, &file->permissions);
580 HFSPLUS_I(inode).dev = inode->i_nlink; 602 if (HFSPLUS_FLG_IMMUTABLE &
581 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 603 (file->permissions.rootflags |
582 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev); 604 file->permissions.userflags))
583 hfsplus_set_perms(inode, &file->permissions);
584 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
585 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 605 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
586 else 606 else
587 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); 607 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
@@ -591,6 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
591 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 611 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
592 sizeof(struct hfsplus_cat_file)); 612 sizeof(struct hfsplus_cat_file));
593 } 613 }
614
615 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
594out: 616out:
595 hfs_find_exit(&fd); 617 hfs_find_exit(&fd);
596 return 0; 618 return 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..508ce662ce12 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
31 if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
95 mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
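The two helpers split out above implement the ext2-compatible flags ABI, so a plain userspace caller can exercise them; only the IMMUTABLE, APPEND and NODUMP bits are honoured per the code above. A sketch, assuming the usual aliasing of HFSPLUS_IOC_EXT2_GETFLAGS/SETFLAGS to the generic FS_IOC_GETFLAGS/SETFLAGS numbers:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		int fd, flags;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
			perror("getflags");
			return 1;
		}
		flags |= FS_NODUMP_FL;		/* supported per the code above */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
			perror("setflags");	/* EOPNOTSUPP for foreign bits */
		return 0;
	}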
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -132,9 +147,11 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
132 res = -ERANGE; 147 res = -ERANGE;
133 } else 148 } else
134 res = -EOPNOTSUPP; 149 res = -EOPNOTSUPP;
135 if (!res) 150 if (!res) {
136 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 151 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
137 sizeof(struct hfsplus_cat_file)); 152 sizeof(struct hfsplus_cat_file));
153 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
154 }
138out: 155out:
139 hfs_find_exit(&fd); 156 hfs_find_exit(&fd);
140 return res; 157 return res;
@@ -153,7 +170,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 170 return -EOPNOTSUPP;
154 171
155 if (size) { 172 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 173 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 174 if (res)
158 return res; 175 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 176 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +194,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 194 } else
178 res = size ? -ERANGE : 4; 195 res = size ? -ERANGE : 4;
179 } else 196 } else
180 res = -ENODATA; 197 res = -EOPNOTSUPP;
181out: 198out:
182 if (size) 199 if (size)
183 hfs_find_exit(&fd); 200 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..bb62a5882147 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -23,6 +23,7 @@ enum {
23 opt_umask, opt_uid, opt_gid, 23 opt_umask, opt_uid, opt_gid,
24 opt_part, opt_session, opt_nls, 24 opt_part, opt_session, opt_nls,
25 opt_nodecompose, opt_decompose, 25 opt_nodecompose, opt_decompose,
26 opt_barrier, opt_nobarrier,
26 opt_force, opt_err 27 opt_force, opt_err
27}; 28};
28 29
@@ -37,6 +38,8 @@ static const match_table_t tokens = {
37 { opt_nls, "nls=%s" }, 38 { opt_nls, "nls=%s" },
38 { opt_decompose, "decompose" }, 39 { opt_decompose, "decompose" },
39 { opt_nodecompose, "nodecompose" }, 40 { opt_nodecompose, "nodecompose" },
41 { opt_barrier, "barrier" },
42 { opt_nobarrier, "nobarrier" },
40 { opt_force, "force" }, 43 { opt_force, "force" },
41 { opt_err, NULL } 44 { opt_err, NULL }
42}; 45};
@@ -65,6 +68,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
65 return 0; 68 return 0;
66} 69}
67 70
71int hfsplus_parse_options_remount(char *input, int *force)
72{
73 char *p;
74 substring_t args[MAX_OPT_ARGS];
75 int token;
76
77 if (!input)
 78 return 1;
79
80 while ((p = strsep(&input, ",")) != NULL) {
81 if (!*p)
82 continue;
83
84 token = match_token(p, tokens, args);
85 switch (token) {
86 case opt_force:
87 *force = 1;
88 break;
89 default:
90 break;
91 }
92 }
93
94 return 1;
95}
96
68/* Parse options from mount. Returns 0 on failure */ 97/* Parse options from mount. Returns 0 on failure */
69/* input is the options passed to mount() as a string */ 98/* input is the options passed to mount() as a string */
70int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) 99int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
@@ -136,20 +165,28 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
136 if (p) 165 if (p)
137 sbi->nls = load_nls(p); 166 sbi->nls = load_nls(p);
138 if (!sbi->nls) { 167 if (!sbi->nls) {
139 printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p); 168 printk(KERN_ERR "hfs: unable to load "
169 "nls mapping \"%s\"\n",
170 p);
140 kfree(p); 171 kfree(p);
141 return 0; 172 return 0;
142 } 173 }
143 kfree(p); 174 kfree(p);
144 break; 175 break;
145 case opt_decompose: 176 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 177 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 178 break;
148 case opt_nodecompose: 179 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 180 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
181 break;
182 case opt_barrier:
183 clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
184 break;
185 case opt_nobarrier:
186 set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
150 break; 187 break;
151 case opt_force: 188 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 189 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 190 break;
154 default: 191 default:
155 return 0; 192 return 0;
@@ -171,20 +208,23 @@ done:
171 208
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 209int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 210{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 211 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 212
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 213 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 214 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
178 if (sbi->type != HFSPLUS_DEF_CR_TYPE) 215 if (sbi->type != HFSPLUS_DEF_CR_TYPE)
179 seq_printf(seq, ",type=%.4s", (char *)&sbi->type); 216 seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
180 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid); 217 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
218 sbi->uid, sbi->gid);
181 if (sbi->part >= 0) 219 if (sbi->part >= 0)
182 seq_printf(seq, ",part=%u", sbi->part); 220 seq_printf(seq, ",part=%u", sbi->part);
183 if (sbi->session >= 0) 221 if (sbi->session >= 0)
184 seq_printf(seq, ",session=%u", sbi->session); 222 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 223 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 224 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 225 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 226 seq_printf(seq, ",nodecompose");
227 if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
228 seq_printf(seq, ",nobarrier");
189 return 0; 229 return 0;
190} 230}
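
Aside: hfsplus_parse_options_remount() deliberately recognizes only "force" and skips everything else, since a remount must not reinterpret mount-time options such as nls=. A userspace sketch of the same strsep() scan (match_token() is kernel-only, so strcmp() stands in for it here):

/* remount_opts.c - illustrative sketch of the strsep() option scan */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

static int parse_remount(char *input, int *force)
{
	char *p;

	if (!input)
		return 1;	/* no options at all is fine */

	while ((p = strsep(&input, ",")) != NULL) {
		if (!*p)
			continue;	/* skip empty ",," segments */
		if (!strcmp(p, "force"))
			*force = 1;
		/* every other option is ignored on remount */
	}
	return 1;
}

int main(void)
{
	char opts[] = "nodecompose,force,nls=utf8";
	int force = 0;

	parse_remount(opts, &force);
	printf("force=%d\n", force);	/* force=1 */
	return 0;
}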
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -2,7 +2,8 @@
2 * linux/fs/hfsplus/part_tbl.c 2 * linux/fs/hfsplus/part_tbl.c
3 * 3 *
4 * Copyright (C) 1996-1997 Paul H. Hargrove 4 * Copyright (C) 1996-1997 Paul H. Hargrove
5 * This file may be distributed under the terms of the GNU General Public License. 5 * This file may be distributed under the terms of
6 * the GNU General Public License.
6 * 7 *
7 * Original code to handle the new style Mac partition table based on 8 * Original code to handle the new style Mac partition table based on
8 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). 9 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
@@ -13,6 +14,7 @@
13 * 14 *
14 */ 15 */
15 16
17#include <linux/slab.h>
16#include "hfsplus_fs.h" 18#include "hfsplus_fs.h"
17 19
18/* offsets to various blocks */ 20/* offsets to various blocks */
@@ -58,76 +60,94 @@ struct new_pmap {
58 */ 60 */
59struct old_pmap { 61struct old_pmap {
60 __be16 pdSig; /* Signature bytes */ 62 __be16 pdSig; /* Signature bytes */
61 struct old_pmap_entry { 63 struct old_pmap_entry {
62 __be32 pdStart; 64 __be32 pdStart;
63 __be32 pdSize; 65 __be32 pdSize;
64 __be32 pdFSID; 66 __be32 pdFSID;
65 } pdEntry[42]; 67 } pdEntry[42];
66} __packed; 68} __packed;
67 69
70static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
71 sector_t *part_start, sector_t *part_size)
72{
73 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
74 int i;
75
76 for (i = 0; i < 42; i++) {
77 struct old_pmap_entry *p = &pm->pdEntry[i];
78
79 if (p->pdStart && p->pdSize &&
80 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
81 (sbi->part < 0 || sbi->part == i)) {
82 *part_start += be32_to_cpu(p->pdStart);
83 *part_size = be32_to_cpu(p->pdSize);
84 return 0;
85 }
86 }
87
88 return -ENOENT;
89}
90
91static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
92 sector_t *part_start, sector_t *part_size)
93{
94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
95 int size = be32_to_cpu(pm->pmMapBlkCnt);
96 int res;
97 int i = 0;
98
99 do {
100 if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
101 (sbi->part < 0 || sbi->part == i)) {
102 *part_start += be32_to_cpu(pm->pmPyPartStart);
103 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
104 return 0;
105 }
106
107 if (++i >= size)
108 return -ENOENT;
109
110 res = hfsplus_submit_bio(sb->s_bdev,
111 *part_start + HFS_PMAP_BLK + i,
112 pm, READ);
113 if (res)
114 return res;
115 } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
116
117 return -ENOENT;
118}
119
68/* 120/*
 69 * hfs_part_find() 121 * Parse the partition map looking for the start and length of an
70 * 122 * HFS/HFS+ partition.
71 * Parse the partition map looking for the
72 * start and length of the 'part'th HFS partition.
73 */ 123 */
74int hfs_part_find(struct super_block *sb, 124int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 125 sector_t *part_start, sector_t *part_size)
76{ 126{
77 struct buffer_head *bh; 127 void *data;
78 __be16 *data; 128 int res;
79 int i, size, res;
80 129
81 res = -ENOENT; 130 data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
82 bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); 131 if (!data)
83 if (!bh) 132 return -ENOMEM;
84 return -EIO;
85 133
86 switch (be16_to_cpu(*data)) { 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ);
136 if (res)
137 goto out;
138
139 switch (be16_to_cpu(*((__be16 *)data))) {
87 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
88 { 141 res = hfs_parse_old_pmap(sb, data, part_start, part_size);
89 struct old_pmap *pm;
90 struct old_pmap_entry *p;
91
92 pm = (struct old_pmap *)bh->b_data;
93 p = pm->pdEntry;
94 size = 42;
95 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
99 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize);
101 res = 0;
102 }
103 }
104 break; 142 break;
105 }
106 case HFS_NEW_PMAP_MAGIC: 143 case HFS_NEW_PMAP_MAGIC:
107 { 144 res = hfs_parse_new_pmap(sb, data, part_start, part_size);
108 struct new_pmap *pm; 145 break;
109 146 default:
110 pm = (struct new_pmap *)bh->b_data; 147 res = -ENOENT;
111 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0;
118 break;
119 }
120 brelse(bh);
121 bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
122 if (!bh)
123 return -EIO;
124 if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
125 break;
126 }
127 break; 148 break;
128 }
129 } 149 }
130 brelse(bh); 150out:
131 151 kfree(data);
132 return res; 152 return res;
133} 153}
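
Aside: both helpers scan for the same two things, a non-empty entry whose type marks it as HFS, honouring an explicit part= index if one was given. A self-contained userspace sketch of the old-style scan (the struct layout and the "TFS1" FSID are taken from the code above; be32toh() replaces the kernel's be32_to_cpu()):

/* pmap_scan.c - illustrative sketch of the old-style partition map scan */
#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct old_pmap_entry {
	uint32_t pdStart;	/* all three fields are big-endian on disk */
	uint32_t pdSize;
	uint32_t pdFSID;
};

/* returns the entry index, or -1 if no matching HFS partition exists */
static int find_hfs_part(const struct old_pmap_entry *e, int want,
			 uint64_t *start, uint64_t *size)
{
	int i;

	for (i = 0; i < 42; i++) {
		if (e[i].pdStart && e[i].pdSize &&
		    be32toh(e[i].pdFSID) == 0x54465331 /* "TFS1" */ &&
		    (want < 0 || want == i)) {
			*start = be32toh(e[i].pdStart);
			*size = be32toh(e[i].pdSize);
			return i;
		}
	}
	return -1;
}

int main(void)
{
	struct old_pmap_entry map[42] = { { 0, 0, 0 } };
	uint64_t start, size;

	map[3].pdStart = htobe32(64);
	map[3].pdSize = htobe32(2048);
	map[3].pdFSID = htobe32(0x54465331);

	if (find_hfs_part(map, -1, &start, &size) >= 0)
		printf("HFS partition: start %llu, %llu blocks\n",
		       (unsigned long long)start, (unsigned long long)size);
	return 0;
}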
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/blkdev.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 16#include <linux/vfs.h>
17#include <linux/nls.h> 17#include <linux/nls.h>
18 18
@@ -21,40 +21,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 21
22#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
23 23
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 24static int hfsplus_system_read_inode(struct inode *inode)
25{ 25{
26 struct hfs_find_data fd; 26 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30 27
31 inode = iget_locked(sb, ino); 28 switch (inode->i_ino) {
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 29 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 30 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 31 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +46,102 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 46 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 47 break;
77 default: 48 default:
78 goto bad_inode; 49 return -EIO;
50 }
51
52 return 0;
53}
54
55struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
56{
57 struct hfs_find_data fd;
58 struct inode *inode;
59 int err;
60
61 inode = iget_locked(sb, ino);
62 if (!inode)
63 return ERR_PTR(-ENOMEM);
64 if (!(inode->i_state & I_NEW))
65 return inode;
66
67 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
68 mutex_init(&HFSPLUS_I(inode)->extents_lock);
69 HFSPLUS_I(inode)->flags = 0;
70 HFSPLUS_I(inode)->extent_state = 0;
71 HFSPLUS_I(inode)->rsrc_inode = NULL;
72 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
73
74 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
75 inode->i_ino == HFSPLUS_ROOT_CNID) {
76 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
77 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
78 if (!err)
79 err = hfsplus_cat_read_inode(inode, &fd);
80 hfs_find_exit(&fd);
81 } else {
82 err = hfsplus_system_read_inode(inode);
83 }
84
85 if (err) {
86 iget_failed(inode);
87 return ERR_PTR(err);
79 } 88 }
80 89
81done:
82 unlock_new_inode(inode); 90 unlock_new_inode(inode);
83 return inode; 91 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 92}
89 93
90static int hfsplus_write_inode(struct inode *inode, 94static int hfsplus_system_write_inode(struct inode *inode)
91 struct writeback_control *wbc)
92{ 95{
93 struct hfsplus_vh *vhdr; 96 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
94 int ret = 0; 97 struct hfsplus_vh *vhdr = sbi->s_vhdr;
98 struct hfsplus_fork_raw *fork;
99 struct hfs_btree *tree = NULL;
95 100
96 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
97 hfsplus_ext_write_extent(inode);
98 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
99 return hfsplus_cat_write_inode(inode);
100 }
101 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
102 switch (inode->i_ino) { 101 switch (inode->i_ino) {
103 case HFSPLUS_ROOT_CNID:
104 ret = hfsplus_cat_write_inode(inode);
105 break;
106 case HFSPLUS_EXT_CNID: 102 case HFSPLUS_EXT_CNID:
107 if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { 103 fork = &vhdr->ext_file;
108 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 104 tree = sbi->ext_tree;
109 inode->i_sb->s_dirt = 1;
110 }
111 hfsplus_inode_write_fork(inode, &vhdr->ext_file);
112 hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
113 break; 105 break;
114 case HFSPLUS_CAT_CNID: 106 case HFSPLUS_CAT_CNID:
115 if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { 107 fork = &vhdr->cat_file;
116 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 108 tree = sbi->cat_tree;
117 inode->i_sb->s_dirt = 1;
118 }
119 hfsplus_inode_write_fork(inode, &vhdr->cat_file);
120 hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
121 break; 109 break;
122 case HFSPLUS_ALLOC_CNID: 110 case HFSPLUS_ALLOC_CNID:
123 if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { 111 fork = &vhdr->alloc_file;
124 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
125 inode->i_sb->s_dirt = 1;
126 }
127 hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
128 break; 112 break;
129 case HFSPLUS_START_CNID: 113 case HFSPLUS_START_CNID:
130 if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { 114 fork = &vhdr->start_file;
131 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
132 inode->i_sb->s_dirt = 1;
133 }
134 hfsplus_inode_write_fork(inode, &vhdr->start_file);
135 break; 115 break;
136 case HFSPLUS_ATTR_CNID: 116 case HFSPLUS_ATTR_CNID:
137 if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { 117 fork = &vhdr->attr_file;
 138 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 118 tree = sbi->attr_tree;
 break;
139 inode->i_sb->s_dirt = 1; 119 default:
140 } 120 return -EIO;
141 hfsplus_inode_write_fork(inode, &vhdr->attr_file); 121 }
142 hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); 122
143 break; 123 if (fork->total_size != cpu_to_be64(inode->i_size)) {
124 set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
125 inode->i_sb->s_dirt = 1;
144 } 126 }
145 return ret; 127 hfsplus_inode_write_fork(inode, fork);
128 if (tree)
129 hfs_btree_write(tree);
130 return 0;
131}
132
133static int hfsplus_write_inode(struct inode *inode,
134 struct writeback_control *wbc)
135{
136 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
137
138 hfsplus_ext_write_extent(inode);
139
140 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
141 inode->i_ino == HFSPLUS_ROOT_CNID)
142 return hfsplus_cat_write_inode(inode);
143 else
144 return hfsplus_system_write_inode(inode);
146} 145}
147 146
148static void hfsplus_evict_inode(struct inode *inode) 147static void hfsplus_evict_inode(struct inode *inode)
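
Aside: the restructured hfsplus_iget() is the canonical iget_locked() protocol: hit the inode cache first, and only touch the disk when I_NEW shows the inode is freshly allocated. The skeleton, reduced to the protocol itself (foo_read_inode() is a hypothetical per-filesystem reader, not a real API):

/* kernel-style sketch; foo_read_inode() is hypothetical */
struct inode *foo_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;
	int err;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialized */

	err = foo_read_inode(inode);	/* fill the inode from disk */
	if (err) {
		iget_failed(inode);	/* unhashes and releases the inode */
		return ERR_PTR(err);
	}
	unlock_new_inode(inode);	/* clears I_NEW, wakes any waiters */
	return inode;
}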
@@ -151,52 +150,74 @@ static void hfsplus_evict_inode(struct inode *inode)
151 truncate_inode_pages(&inode->i_data, 0); 150 truncate_inode_pages(&inode->i_data, 0);
152 end_writeback(inode); 151 end_writeback(inode);
153 if (HFSPLUS_IS_RSRC(inode)) { 152 if (HFSPLUS_IS_RSRC(inode)) {
154 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 153 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
155 iput(HFSPLUS_I(inode).rsrc_inode); 154 iput(HFSPLUS_I(inode)->rsrc_inode);
156 } 155 }
157} 156}
158 157
159int hfsplus_sync_fs(struct super_block *sb, int wait) 158int hfsplus_sync_fs(struct super_block *sb, int wait)
160{ 159{
161 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 160 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
161 struct hfsplus_vh *vhdr = sbi->s_vhdr;
162 int write_backup = 0;
163 int error, error2;
164
165 if (!wait)
166 return 0;
162 167
163 dprint(DBG_SUPER, "hfsplus_write_super\n"); 168 dprint(DBG_SUPER, "hfsplus_write_super\n");
164 169
165 lock_super(sb);
166 sb->s_dirt = 0; 170 sb->s_dirt = 0;
167 171
168 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); 172 /*
169 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); 173 * Explicitly write out the special metadata inodes.
170 vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); 174 *
171 vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); 175 * While these special inodes are marked as hashed and written
 172 vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count); 176 * out periodically by the flusher threads we redirty them
 173 177 * during writeout of normal inodes, and thus the livelock
174 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 178 * prevents us from getting the latest state to disk.
175 if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { 179 */
176 if (HFSPLUS_SB(sb).sect_count) { 180 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
177 struct buffer_head *bh; 181 error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
178 u32 block, offset; 182 if (!error)
179 183 error = error2;
180 block = HFSPLUS_SB(sb).blockoffset; 184 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
181 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); 185 if (!error)
182 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); 186 error = error2;
183 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, 187
184 HFSPLUS_SB(sb).sect_count, block, offset); 188 mutex_lock(&sbi->vh_mutex);
185 bh = sb_bread(sb, block); 189 mutex_lock(&sbi->alloc_mutex);
186 if (bh) { 190 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
187 vhdr = (struct hfsplus_vh *)(bh->b_data + offset); 191 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
188 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { 192 vhdr->folder_count = cpu_to_be32(sbi->folder_count);
189 memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); 193 vhdr->file_count = cpu_to_be32(sbi->file_count);
190 mark_buffer_dirty(bh); 194
191 brelse(bh); 195 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
192 } else 196 memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
193 printk(KERN_WARNING "hfs: backup not found!\n"); 197 write_backup = 1;
194 }
195 }
196 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
197 } 198 }
198 unlock_super(sb); 199
199 return 0; 200 error2 = hfsplus_submit_bio(sb->s_bdev,
201 sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
202 sbi->s_vhdr, WRITE_SYNC);
203 if (!error)
204 error = error2;
205 if (!write_backup)
206 goto out;
207
208 error2 = hfsplus_submit_bio(sb->s_bdev,
209 sbi->part_start + sbi->sect_count - 2,
210 sbi->s_backup_vhdr, WRITE_SYNC);
211 if (!error)
 212 error = error2;
213out:
214 mutex_unlock(&sbi->alloc_mutex);
215 mutex_unlock(&sbi->vh_mutex);
216
217 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
218 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
219
220 return error;
200} 221}
201 222
202static void hfsplus_write_super(struct super_block *sb) 223static void hfsplus_write_super(struct super_block *sb)
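
Aside: the rewritten sync path leans on a small idiom worth spelling out: every step is executed unconditionally, so a failed B-tree flush does not keep the volume header from reaching disk, yet the first error encountered is the one returned. (This is also why the backup-header write above must assign error = error2 and not the reverse.) The idiom in isolation, with hypothetical step functions:

/* firsterr.c - illustrative only: run all steps, report the first failure */
#include <stdio.h>

static int step_one(void)   { return 0; }
static int step_two(void)   { return -5; }	/* simulated -EIO */
static int step_three(void) { return -22; }	/* simulated -EINVAL */

static int sync_all(void)
{
	int error, error2;

	/* every step runs; only the first failure is reported */
	error = step_one();
	error2 = step_two();
	if (!error)
		error = error2;
	error2 = step_three();
	if (!error)
		error = error2;
	return error;
}

int main(void)
{
	printf("sync_all() = %d\n", sync_all());	/* prints -5 */
	return 0;
}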
@@ -209,48 +230,47 @@ static void hfsplus_write_super(struct super_block *sb)
209 230
210static void hfsplus_put_super(struct super_block *sb) 231static void hfsplus_put_super(struct super_block *sb)
211{ 232{
233 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
234
212 dprint(DBG_SUPER, "hfsplus_put_super\n"); 235 dprint(DBG_SUPER, "hfsplus_put_super\n");
236
213 if (!sb->s_fs_info) 237 if (!sb->s_fs_info)
214 return; 238 return;
215 239
216 lock_kernel(); 240 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
217 241 struct hfsplus_vh *vhdr = sbi->s_vhdr;
218 if (sb->s_dirt)
219 hfsplus_write_super(sb);
220 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
221 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
222 242
223 vhdr->modify_date = hfsp_now2mt(); 243 vhdr->modify_date = hfsp_now2mt();
224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); 244 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); 245 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
226 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 246
227 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 247 hfsplus_sync_fs(sb, 1);
228 } 248 }
229 249
230 hfs_btree_close(HFSPLUS_SB(sb).cat_tree); 250 hfs_btree_close(sbi->cat_tree);
231 hfs_btree_close(HFSPLUS_SB(sb).ext_tree); 251 hfs_btree_close(sbi->ext_tree);
232 iput(HFSPLUS_SB(sb).alloc_file); 252 iput(sbi->alloc_file);
233 iput(HFSPLUS_SB(sb).hidden_dir); 253 iput(sbi->hidden_dir);
234 brelse(HFSPLUS_SB(sb).s_vhbh); 254 kfree(sbi->s_vhdr);
235 unload_nls(HFSPLUS_SB(sb).nls); 255 kfree(sbi->s_backup_vhdr);
256 unload_nls(sbi->nls);
236 kfree(sb->s_fs_info); 257 kfree(sb->s_fs_info);
237 sb->s_fs_info = NULL; 258 sb->s_fs_info = NULL;
238
239 unlock_kernel();
240} 259}
241 260
242static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 261static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
243{ 262{
244 struct super_block *sb = dentry->d_sb; 263 struct super_block *sb = dentry->d_sb;
264 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
245 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 265 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
246 266
247 buf->f_type = HFSPLUS_SUPER_MAGIC; 267 buf->f_type = HFSPLUS_SUPER_MAGIC;
248 buf->f_bsize = sb->s_blocksize; 268 buf->f_bsize = sb->s_blocksize;
249 buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; 269 buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
250 buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; 270 buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
251 buf->f_bavail = buf->f_bfree; 271 buf->f_bavail = buf->f_bfree;
252 buf->f_files = 0xFFFFFFFF; 272 buf->f_files = 0xFFFFFFFF;
253 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 273 buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
254 buf->f_fsid.val[0] = (u32)id; 274 buf->f_fsid.val[0] = (u32)id;
255 buf->f_fsid.val[1] = (u32)(id >> 32); 275 buf->f_fsid.val[1] = (u32)(id >> 32);
256 buf->f_namelen = HFSPLUS_MAX_STRLEN; 276 buf->f_namelen = HFSPLUS_MAX_STRLEN;
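
Aside: the statfs conversion only swaps the accessor style, but the units are worth noting: total_blocks and free_blocks count allocation blocks, and fs_shift (= alloc_blksz_shift - s_blocksize_bits) rescales them to the f_bsize-sized blocks statfs reports. A quick userspace check of what arrives on the other side of the syscall:

/* fsinfo.c - print capacity the way hfsplus_statfs() reports it */
#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;

	if (argc != 2 || statvfs(argv[1], &st) < 0) {
		perror("statvfs");
		return 1;
	}
	printf("block size %lu, %llu blocks, %llu free (%.1f GiB total)\n",
	       st.f_bsize,
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree,
	       (double)st.f_blocks * st.f_frsize / (1 << 30));
	return 0;
}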
@@ -263,27 +283,32 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
263 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 283 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
264 return 0; 284 return 0;
265 if (!(*flags & MS_RDONLY)) { 285 if (!(*flags & MS_RDONLY)) {
266 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 286 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
267 struct hfsplus_sb_info sbi; 287 int force = 0;
268 288
269 memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); 289 if (!hfsplus_parse_options_remount(data, &force))
270 sbi.nls = HFSPLUS_SB(sb).nls;
271 if (!hfsplus_parse_options(data, &sbi))
272 return -EINVAL; 290 return -EINVAL;
273 291
274 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 292 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
275 printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " 293 printk(KERN_WARNING "hfs: filesystem was "
276 "running fsck.hfsplus is recommended. leaving read-only.\n"); 294 "not cleanly unmounted, "
295 "running fsck.hfsplus is recommended. "
296 "leaving read-only.\n");
277 sb->s_flags |= MS_RDONLY; 297 sb->s_flags |= MS_RDONLY;
278 *flags |= MS_RDONLY; 298 *flags |= MS_RDONLY;
279 } else if (sbi.flags & HFSPLUS_SB_FORCE) { 299 } else if (force) {
280 /* nothing */ 300 /* nothing */
281 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 301 } else if (vhdr->attributes &
282 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 302 cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
303 printk(KERN_WARNING "hfs: filesystem is marked locked, "
304 "leaving read-only.\n");
283 sb->s_flags |= MS_RDONLY; 305 sb->s_flags |= MS_RDONLY;
284 *flags |= MS_RDONLY; 306 *flags |= MS_RDONLY;
285 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { 307 } else if (vhdr->attributes &
286 printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n"); 308 cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
309 printk(KERN_WARNING "hfs: filesystem is "
310 "marked journaled, "
311 "leaving read-only.\n");
287 sb->s_flags |= MS_RDONLY; 312 sb->s_flags |= MS_RDONLY;
288 *flags |= MS_RDONLY; 313 *flags |= MS_RDONLY;
289 } 314 }
@@ -313,19 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
313 struct inode *root, *inode; 338 struct inode *root, *inode;
314 struct qstr str; 339 struct qstr str;
315 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
316 int err = -EINVAL; 341 int err;
317 342
343 err = -EINVAL;
318 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
319 if (!sbi) 345 if (!sbi)
320 return -ENOMEM; 346 goto out;
321 347
322 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
323 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 349 mutex_init(&sbi->alloc_mutex);
350 mutex_init(&sbi->vh_mutex);
324 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
325 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
326 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
327 err = -EINVAL; 356 goto out_unload_nls;
328 goto cleanup;
329 } 357 }
330 358
331 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -333,140 +361,160 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
333 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
334 if (!sbi->nls) { 362 if (!sbi->nls) {
335 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
336 err = -EINVAL; 364 goto out_unload_nls;
337 goto cleanup;
338 } 365 }
339 366
340 /* Grab the volume header */ 367 /* Grab the volume header */
341 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
342 if (!silent) 369 if (!silent)
343 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
344 err = -EINVAL; 371 goto out_unload_nls;
345 goto cleanup;
346 } 372 }
347 vhdr = HFSPLUS_SB(sb).s_vhdr; 373 vhdr = sbi->s_vhdr;
348 374
349 /* Copy parts of the volume header into the superblock */ 375 /* Copy parts of the volume header into the superblock */
350 sb->s_magic = HFSPLUS_VOLHEAD_SIG; 376 sb->s_magic = HFSPLUS_VOLHEAD_SIG;
351 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
352 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
353 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
354 goto cleanup; 380 goto out_free_vhdr;
355 } 381 }
356 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
357 HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
358 HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); 384 sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
359 HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); 385 sbi->file_count = be32_to_cpu(vhdr->file_count);
360 HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); 386 sbi->folder_count = be32_to_cpu(vhdr->folder_count);
361 HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); 387 sbi->data_clump_blocks =
362 HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 388 be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
363 if (!HFSPLUS_SB(sb).data_clump_blocks) 389 if (!sbi->data_clump_blocks)
364 HFSPLUS_SB(sb).data_clump_blocks = 1; 390 sbi->data_clump_blocks = 1;
365 HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 391 sbi->rsrc_clump_blocks =
366 if (!HFSPLUS_SB(sb).rsrc_clump_blocks) 392 be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
367 HFSPLUS_SB(sb).rsrc_clump_blocks = 1; 393 if (!sbi->rsrc_clump_blocks)
394 sbi->rsrc_clump_blocks = 1;
368 395
369 /* Set up operations so we can load metadata */ 396 /* Set up operations so we can load metadata */
370 sb->s_op = &hfsplus_sops; 397 sb->s_op = &hfsplus_sops;
371 sb->s_maxbytes = MAX_LFS_FILESIZE; 398 sb->s_maxbytes = MAX_LFS_FILESIZE;
372 399
373 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 400 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
374 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " 401 printk(KERN_WARNING "hfs: Filesystem was "
375 "running fsck.hfsplus is recommended. mounting read-only.\n"); 402 "not cleanly unmounted, "
403 "running fsck.hfsplus is recommended. "
404 "mounting read-only.\n");
376 sb->s_flags |= MS_RDONLY; 405 sb->s_flags |= MS_RDONLY;
377 } else if (sbi->flags & HFSPLUS_SB_FORCE) { 406 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
378 /* nothing */ 407 /* nothing */
379 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 408 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
380 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 409 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
381 sb->s_flags |= MS_RDONLY; 410 sb->s_flags |= MS_RDONLY;
382 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { 411 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
383 printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " 412 !(sb->s_flags & MS_RDONLY)) {
384 "use the force option at your own risk, mounting read-only.\n"); 413 printk(KERN_WARNING "hfs: write access to "
414 "a journaled filesystem is not supported, "
415 "use the force option at your own risk, "
416 "mounting read-only.\n");
385 sb->s_flags |= MS_RDONLY; 417 sb->s_flags |= MS_RDONLY;
386 } 418 }
387 sbi->flags &= ~HFSPLUS_SB_FORCE;
388 419
389 /* Load metadata objects (B*Trees) */ 420 /* Load metadata objects (B*Trees) */
390 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
391 if (!HFSPLUS_SB(sb).ext_tree) { 422 if (!sbi->ext_tree) {
392 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
393 goto cleanup; 424 goto out_free_vhdr;
394 } 425 }
395 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
396 if (!HFSPLUS_SB(sb).cat_tree) { 427 if (!sbi->cat_tree) {
397 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
398 goto cleanup; 429 goto out_close_ext_tree;
399 } 430 }
400 431
401 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
402 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
403 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
404 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
405 goto cleanup; 436 goto out_close_cat_tree;
406 } 437 }
407 HFSPLUS_SB(sb).alloc_file = inode; 438 sbi->alloc_file = inode;
408 439
409 /* Load the root directory */ 440 /* Load the root directory */
410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); 441 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
411 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
412 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
413 err = PTR_ERR(root); 444 err = PTR_ERR(root);
414 goto cleanup; 445 goto out_put_alloc_file;
415 } 446 }
416 sb->s_root = d_alloc_root(root);
417 if (!sb->s_root) {
418 iput(root);
419 err = -ENOMEM;
420 goto cleanup;
421 }
422 sb->s_root->d_op = &hfsplus_dentry_operations;
423 447
424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
425 str.name = HFSP_HIDDENDIR_NAME; 449 str.name = HFSP_HIDDENDIR_NAME;
426 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 450 hfs_find_init(sbi->cat_tree, &fd);
427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); 451 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
429 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
430 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
431 goto cleanup; 455 goto out_put_root;
432 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
433 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
434 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
435 goto cleanup; 459 goto out_put_root;
436 } 460 }
437 HFSPLUS_SB(sb).hidden_dir = inode; 461 sbi->hidden_dir = inode;
438 } else 462 } else
439 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
440 464
441 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
442 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
443 476
444 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
445 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
446 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
447 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
448 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
449 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
452 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 485 HFSPLUS_I_CAT_DIRTY);
453 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 486 }
454
455 if (!HFSPLUS_SB(sb).hidden_dir) {
456 printk(KERN_DEBUG "hfs: create hidden dir...\n");
457 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
458 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
459 &str, HFSPLUS_SB(sb).hidden_dir);
460 mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
461 } 487 }
462out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
463 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
464 sbi->nls = nls; 497 sbi->nls = nls;
465 return 0; 498 return 0;
466 499
467cleanup: 500out_put_hidden_dir:
468 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
 503 iput(root);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
469 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
470 return err; 518 return err;
471} 519}
472 520
@@ -484,25 +532,31 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
484 return i ? &i->vfs_inode : NULL; 532 return i ? &i->vfs_inode : NULL;
485} 533}
486 534
535static void hfsplus_i_callback(struct rcu_head *head)
536{
537 struct inode *inode = container_of(head, struct inode, i_rcu);
538
539 INIT_LIST_HEAD(&inode->i_dentry);
540 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
541}
542
487static void hfsplus_destroy_inode(struct inode *inode) 543static void hfsplus_destroy_inode(struct inode *inode)
488{ 544{
489 kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); 545 call_rcu(&inode->i_rcu, hfsplus_i_callback);
490} 546}
491 547
492#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 548#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
493 549
494static int hfsplus_get_sb(struct file_system_type *fs_type, 550static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
495 int flags, const char *dev_name, void *data, 551 int flags, const char *dev_name, void *data)
496 struct vfsmount *mnt)
497{ 552{
498 return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, 553 return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
499 mnt);
500} 554}
501 555
502static struct file_system_type hfsplus_fs_type = { 556static struct file_system_type hfsplus_fs_type = {
503 .owner = THIS_MODULE, 557 .owner = THIS_MODULE,
504 .name = "hfsplus", 558 .name = "hfsplus",
505 .get_sb = hfsplus_get_sb, 559 .mount = hfsplus_mount,
506 .kill_sb = kill_block_super, 560 .kill_sb = kill_block_super,
507 .fs_flags = FS_REQUIRES_DEV, 561 .fs_flags = FS_REQUIRES_DEV,
508}; 562};
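
Aside: the destroy_inode change is the adaptation every filesystem with a private inode cache needed for RCU-walk path lookup: lockless lookups may still dereference the inode, so the slab free is deferred past the grace period with call_rcu() (the INIT_LIST_HEAD() is needed because i_rcu shares a union with i_dentry in this kernel). The generic shape of the conversion, with hypothetical foo_* names:

/* kernel-style sketch; the foo_* names are placeholders */
static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlays i_dentry */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* freeing is delayed until all RCU readers are done */
	call_rcu(&inode->i_rcu, foo_i_callback);
}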
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..a3f0bfcc881e 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -17,14 +17,14 @@
17/* Returns folded char, or 0 if ignorable */ 17/* Returns folded char, or 0 if ignorable */
18static inline u16 case_fold(u16 c) 18static inline u16 case_fold(u16 c)
19{ 19{
20 u16 tmp; 20 u16 tmp;
21 21
22 tmp = hfsplus_case_fold_table[c >> 8]; 22 tmp = hfsplus_case_fold_table[c >> 8];
23 if (tmp) 23 if (tmp)
24 tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; 24 tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
25 else 25 else
26 tmp = c; 26 tmp = c;
27 return tmp; 27 return tmp;
28} 28}
29 29
30/* Compare unicode strings, return values like normal strcmp */ 30/* Compare unicode strings, return values like normal strcmp */
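
Aside: case_fold() is a two-level table lookup: the high byte of the character selects a second-level block, and a zero block pointer means the whole 256-character range folds to itself; a zero result from the second level marks an ignorable character (the toy table below has none). The same structure on a toy table, runnable in userspace:

/* fold.c - two-level case-folding lookup on a toy table */
#include <stdint.h>
#include <stdio.h>

/* index row at 0, one second-level block at 256: 'A'..'Z' fold to
 * 'a'..'z', everything else in that block maps to itself */
static uint16_t fold_table[512];

static void build_toy_table(void)
{
	int i;

	fold_table[0x00] = 256;		/* only high byte 0x00 has a block */
	for (i = 0; i < 256; i++)
		fold_table[256 + i] = (i >= 'A' && i <= 'Z') ? i + 0x20 : i;
}

static uint16_t case_fold(uint16_t c)
{
	uint16_t tmp = fold_table[c >> 8];

	return tmp ? fold_table[tmp + (c & 0xff)] : c;
}

int main(void)
{
	build_toy_table();
	printf("%c %c\n", case_fold('G'), case_fold('g'));	/* g g */
	return 0;
}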
@@ -118,10 +118,12 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
118 return NULL; 118 return NULL;
119} 119}
120 120
121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) 121int hfsplus_uni2asc(struct super_block *sb,
122 const struct hfsplus_unistr *ustr,
123 char *astr, int *len_p)
122{ 124{
123 const hfsplus_unichr *ip; 125 const hfsplus_unichr *ip;
124 struct nls_table *nls = HFSPLUS_SB(sb).nls; 126 struct nls_table *nls = HFSPLUS_SB(sb)->nls;
125 u8 *op; 127 u8 *op;
126 u16 cc, c0, c1; 128 u16 cc, c0, c1;
127 u16 *ce1, *ce2; 129 u16 *ce1, *ce2;
@@ -132,7 +134,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
132 ustrlen = be16_to_cpu(ustr->length); 134 ustrlen = be16_to_cpu(ustr->length);
133 len = *len_p; 135 len = *len_p;
134 ce1 = NULL; 136 ce1 = NULL;
135 compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 137 compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
136 138
137 while (ustrlen > 0) { 139 while (ustrlen > 0) {
138 c0 = be16_to_cpu(*ip++); 140 c0 = be16_to_cpu(*ip++);
@@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
171 goto same; 173 goto same;
172 c1 = be16_to_cpu(*ip); 174 c1 = be16_to_cpu(*ip);
173 if (likely(compose)) 175 if (likely(compose))
174 ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1); 176 ce1 = hfsplus_compose_lookup(
177 hfsplus_compose_table, c1);
175 if (ce1) 178 if (ce1)
176 break; 179 break;
177 switch (c0) { 180 switch (c0) {
@@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
199 if (ce2) { 202 if (ce2) {
200 i = 1; 203 i = 1;
201 while (i < ustrlen) { 204 while (i < ustrlen) {
202 ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i])); 205 ce1 = hfsplus_compose_lookup(ce2,
206 be16_to_cpu(ip[i]));
203 if (!ce1) 207 if (!ce1)
204 break; 208 break;
205 i++; 209 i++;
@@ -211,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
211 goto done; 215 goto done;
212 } 216 }
213 } 217 }
214 same: 218same:
215 switch (c0) { 219 switch (c0) {
216 case 0: 220 case 0:
217 cc = 0x2400; 221 cc = 0x2400;
@@ -222,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
222 default: 226 default:
223 cc = c0; 227 cc = c0;
224 } 228 }
225 done: 229done:
226 res = nls->uni2char(cc, op, len); 230 res = nls->uni2char(cc, op, len);
227 if (res < 0) { 231 if (res < 0) {
228 if (res == -ENAMETOOLONG) 232 if (res == -ENAMETOOLONG)
@@ -246,7 +250,7 @@ out:
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len, 250static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc) 251 wchar_t *uc)
248{ 252{
249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); 253 int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
250 if (size <= 0) { 254 if (size <= 0) {
251 *uc = '?'; 255 *uc = '?';
252 size = 1; 256 size = 1;
@@ -293,7 +297,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
293 u16 *dstr, outlen = 0; 297 u16 *dstr, outlen = 0;
294 wchar_t c; 298 wchar_t c;
295 299
296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 300 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 301 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
298 size = asc2unichar(sb, astr, len, &c); 302 size = asc2unichar(sb, astr, len, &c);
299 303
@@ -320,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
320 * Composed unicode characters are decomposed and case-folding is performed 324 * Composed unicode characters are decomposed and case-folding is performed
321 * if the appropriate bits are (un)set on the superblock. 325 * if the appropriate bits are (un)set on the superblock.
322 */ 326 */
323int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) 327int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
328 struct qstr *str)
324{ 329{
325 struct super_block *sb = dentry->d_sb; 330 struct super_block *sb = dentry->d_sb;
326 const char *astr; 331 const char *astr;
@@ -330,8 +335,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
330 wchar_t c; 335 wchar_t c;
331 u16 c2; 336 u16 c2;
332 337
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 338 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 339 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
335 hash = init_name_hash(); 340 hash = init_name_hash();
336 astr = str->name; 341 astr = str->name;
337 len = str->len; 342 len = str->len;
@@ -363,9 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
363 * Composed unicode characters are decomposed and case-folding is performed 368 * Composed unicode characters are decomposed and case-folding is performed
364 * if the appropriate bits are (un)set on the superblock. 369 * if the appropriate bits are (un)set on the superblock.
365 */ 370 */
366int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) 371int hfsplus_compare_dentry(const struct dentry *parent,
372 const struct inode *pinode,
373 const struct dentry *dentry, const struct inode *inode,
374 unsigned int len, const char *str, const struct qstr *name)
367{ 375{
368 struct super_block *sb = dentry->d_sb; 376 struct super_block *sb = parent->d_sb;
369 int casefold, decompose, size; 377 int casefold, decompose, size;
370 int dsize1, dsize2, len1, len2; 378 int dsize1, dsize2, len1, len2;
371 const u16 *dstr1, *dstr2; 379 const u16 *dstr1, *dstr2;
@@ -373,12 +381,12 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
373 u16 c1, c2; 381 u16 c1, c2;
374 wchar_t c; 382 wchar_t c;
375 383
376 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 384 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
377 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 385 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
378 astr1 = s1->name; 386 astr1 = str;
379 len1 = s1->len; 387 len1 = len;
380 astr2 = s2->name; 388 astr2 = name->name;
381 len2 = s2->len; 389 len2 = name->len;
382 dsize1 = dsize2 = 0; 390 dsize1 = dsize2 = 0;
383 dstr1 = dstr2 = NULL; 391 dstr1 = dstr2 = NULL;
384 392
@@ -388,7 +396,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
388 astr1 += size; 396 astr1 += size;
389 len1 -= size; 397 len1 -= size;
390 398
391 if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) { 399 if (decompose)
400 dstr1 = decompose_unichar(c, &dsize1);
401 if (!decompose || !dstr1) {
392 c1 = c; 402 c1 = c;
393 dstr1 = &c1; 403 dstr1 = &c1;
394 dsize1 = 1; 404 dsize1 = 1;
@@ -400,7 +410,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
400 astr2 += size; 410 astr2 += size;
401 len2 -= size; 411 len2 -= size;
402 412
403 if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) { 413 if (decompose)
414 dstr2 = decompose_unichar(c, &dsize2);
415 if (!decompose || !dstr2) {
404 c2 = c; 416 c2 = c;
405 dstr2 = &c2; 417 dstr2 = &c2;
406 dsize2 = 1; 418 dsize2 = 1;
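
Aside: hfsplus_compare_dentry() never materializes the decomposed strings; each side keeps a small cursor (dstr/dsize) that is refilled either from a decomposition sequence or from the next raw character, and comparison proceeds one code unit at a time. A toy userspace version of that refill-and-compare loop, with bytes standing in for unicode characters and a fake one-entry decomposition table:

/* streamcmp.c - toy version of the decompose-as-you-go comparison */
#include <stdio.h>

/* fake decomposition: 'a' expands to "xy"; everything else is atomic */
static const char *decompose(char c, int *dsize)
{
	if (c == 'a') {
		*dsize = 2;
		return "xy";
	}
	return NULL;
}

static int stream_cmp(const char *s1, const char *s2)
{
	const char *d1 = NULL, *d2 = NULL;
	int n1 = 0, n2 = 0;
	char c1, c2;

	while (1) {
		if (!n1) {			/* refill cursor 1 */
			if (!*s1)
				break;
			d1 = decompose(*s1, &n1);
			if (!d1) {
				c1 = *s1;	/* atomic: 1-char buffer */
				d1 = &c1;
				n1 = 1;
			}
			s1++;
		}
		if (!n2) {			/* refill cursor 2 */
			if (!*s2)
				break;
			d2 = decompose(*s2, &n2);
			if (!d2) {
				c2 = *s2;
				d2 = &c2;
				n2 = 1;
			}
			s2++;
		}
		if (*d1 < *d2)
			return -1;
		if (*d1 > *d2)
			return 1;
		d1++; n1--;
		d2++; n2--;
	}
	/* whichever side still has data pending is the larger string */
	if (n1 || *s1)
		return 1;
	if (n2 || *s2)
		return -1;
	return 0;
}

int main(void)
{
	/* "a" and "xy" compare equal once 'a' is decomposed */
	printf("%d %d\n", stream_cmp("a", "xy"), stream_cmp("a", "xz"));
	return 0;
}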
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,6 +24,40 @@ struct hfsplus_wd {
24 u16 embed_count; 24 u16 embed_count;
25}; 25};
26 26
27static void hfsplus_end_io_sync(struct bio *bio, int err)
28{
29 if (err)
30 clear_bit(BIO_UPTODATE, &bio->bi_flags);
31 complete(bio->bi_private);
32}
33
34int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
35 void *data, int rw)
36{
37 DECLARE_COMPLETION_ONSTACK(wait);
38 struct bio *bio;
39
40 bio = bio_alloc(GFP_NOIO, 1);
41 bio->bi_sector = sector;
42 bio->bi_bdev = bdev;
43 bio->bi_end_io = hfsplus_end_io_sync;
44 bio->bi_private = &wait;
45
46 /*
47 * We always submit one sector at a time, so bio_add_page must not fail.
48 */
49 if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE,
50 offset_in_page(data)) != HFSPLUS_SECTOR_SIZE)
51 BUG();
52
53 submit_bio(rw, bio);
54 wait_for_completion(&wait);
55
56 if (!bio_flagged(bio, BIO_UPTODATE))
57 return -EIO;
58 return 0;
59}
60
27static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) 61static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
28{ 62{
29 u32 extent; 63 u32 extent;
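
Aside: hfsplus_submit_bio() above gives the driver synchronous 512-byte I/O that is independent of the block size later set on the superblock, which is what makes the reread loop in hfsplus_read_wrapper() possible. Its userspace analogue when inspecting a volume image is just a positioned read:

/* readsec.c - userspace analogue of a one-sector synchronous read */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define SECTOR_SIZE 512

static int read_sector(int fd, uint64_t sector, void *buf)
{
	ssize_t n = pread(fd, buf, SECTOR_SIZE, (off_t)(sector * SECTOR_SIZE));

	return n == SECTOR_SIZE ? 0 : -1;	/* kernel version: -EIO */
}

int main(int argc, char **argv)
{
	unsigned char buf[SECTOR_SIZE];
	int fd;

	if (argc != 3 || (fd = open(argv[1], O_RDONLY)) < 0) {
		fprintf(stderr, "usage: readsec <image> <sector>\n");
		return 1;
	}
	if (read_sector(fd, strtoull(argv[2], NULL, 0), buf))
		return 1;
	/* sector 2 of an HFS+ volume starts with the "H+" signature */
	printf("%02x %02x\n", buf[0], buf[1]);
	return 0;
}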
@@ -40,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
40 !(attrib & HFSP_WRAP_ATTRIB_SPARED)) 74 !(attrib & HFSP_WRAP_ATTRIB_SPARED))
41 return 0; 75 return 0;
42 76
43 wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); 77 wd->ablk_size =
78 be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
44 if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) 79 if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
45 return 0; 80 return 0;
46 if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) 81 if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
47 return 0; 82 return 0;
48 wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); 83 wd->ablk_start =
84 be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
49 85
50 extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); 86 extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
51 wd->embed_start = (extent >> 16) & 0xFFFF; 87 wd->embed_start = (extent >> 16) & 0xFFFF;
@@ -65,10 +101,11 @@ static int hfsplus_get_last_session(struct super_block *sb,
65 *start = 0; 101 *start = 0;
66 *size = sb->s_bdev->bd_inode->i_size >> 9; 102 *size = sb->s_bdev->bd_inode->i_size >> 9;
67 103
68 if (HFSPLUS_SB(sb).session >= 0) { 104 if (HFSPLUS_SB(sb)->session >= 0) {
69 te.cdte_track = HFSPLUS_SB(sb).session; 105 te.cdte_track = HFSPLUS_SB(sb)->session;
70 te.cdte_format = CDROM_LBA; 106 te.cdte_format = CDROM_LBA;
71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); 107 res = ioctl_by_bdev(sb->s_bdev,
108 CDROMREADTOCENTRY, (unsigned long)&te);
72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { 109 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
73 *start = (sector_t)te.cdte_addr.lba << 2; 110 *start = (sector_t)te.cdte_addr.lba << 2;
74 return 0; 111 return 0;
@@ -77,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
77 return -EINVAL; 114 return -EINVAL;
78 } 115 }
79 ms_info.addr_format = CDROM_LBA; 116 ms_info.addr_format = CDROM_LBA;
80 res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); 117 res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
118 (unsigned long)&ms_info);
81 if (!res && ms_info.xa_flag) 119 if (!res && ms_info.xa_flag)
82 *start = (sector_t)ms_info.addr.lba << 2; 120 *start = (sector_t)ms_info.addr.lba << 2;
83 return 0; 121 return 0;
@@ -87,97 +125,113 @@ static int hfsplus_get_last_session(struct super_block *sb,
87/* Takes in super block, returns true if good data read */ 125/* Takes in super block, returns true if good data read */
88int hfsplus_read_wrapper(struct super_block *sb) 126int hfsplus_read_wrapper(struct super_block *sb)
89{ 127{
90 struct buffer_head *bh; 128 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
91 struct hfsplus_vh *vhdr;
92 struct hfsplus_wd wd; 129 struct hfsplus_wd wd;
93 sector_t part_start, part_size; 130 sector_t part_start, part_size;
94 u32 blocksize; 131 u32 blocksize;
132 int error = 0;
95 133
134 error = -EINVAL;
96 blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); 135 blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
97 if (!blocksize) 136 if (!blocksize)
98 return -EINVAL; 137 goto out;
99 138
100 if (hfsplus_get_last_session(sb, &part_start, &part_size)) 139 if (hfsplus_get_last_session(sb, &part_start, &part_size))
101 return -EINVAL; 140 goto out;
102 if ((u64)part_start + part_size > 0x100000000ULL) { 141 if ((u64)part_start + part_size > 0x100000000ULL) {
103 pr_err("hfs: volumes larger than 2TB are not supported yet\n"); 142 pr_err("hfs: volumes larger than 2TB are not supported yet\n");
104 return -EINVAL; 143 goto out;
105 } 144 }
106 while (1) {
107 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
108 if (!bh)
109 return -EIO;
110
111 if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) {
112 if (!hfsplus_read_mdb(vhdr, &wd))
113 goto error;
114 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
115 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
116 part_size = wd.embed_count * wd.ablk_size;
117 brelse(bh);
118 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
119 if (!bh)
120 return -EIO;
121 }
122 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
123 break;
124 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
125 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
126 break;
127 }
128 brelse(bh);
129 145
130 /* check for a partition block 146 error = -ENOMEM;
147 sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
148 if (!sbi->s_vhdr)
149 goto out;
150 sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
151 if (!sbi->s_backup_vhdr)
152 goto out_free_vhdr;
153
154reread:
155 error = hfsplus_submit_bio(sb->s_bdev,
156 part_start + HFSPLUS_VOLHEAD_SECTOR,
157 sbi->s_vhdr, READ);
158 if (error)
159 goto out_free_backup_vhdr;
160
161 error = -EINVAL;
162 switch (sbi->s_vhdr->signature) {
163 case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
164 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
165 /*FALLTHRU*/
166 case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size;
174 goto reread;
175 default:
176 /*
177 * Check for a partition block.
178 *
131 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
132 */ 180 */
133 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
134 return -EINVAL;
182 goto out_free_backup_vhdr;
183 goto reread;
135 } 184 }
136 185
137 blocksize = be32_to_cpu(vhdr->blocksize);
138 brelse(bh);
186 error = hfsplus_submit_bio(sb->s_bdev,
187 part_start + part_size - 2,
188 sbi->s_backup_vhdr, READ);
189 if (error)
190 goto out_free_backup_vhdr;
191
192 error = -EINVAL;
193 if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
194 printk(KERN_WARNING
195 "hfs: invalid secondary volume header\n");
196 goto out_free_backup_vhdr;
197 }
198
199 blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
139 200
140 /* block size must be at least as large as a sector
141 * and a multiple of 2
142 */
143 if (blocksize < HFSPLUS_SECTOR_SIZE ||
144 ((blocksize - 1) & blocksize))
145 return -EINVAL;
146 HFSPLUS_SB(sb).alloc_blksz = blocksize;
147 HFSPLUS_SB(sb).alloc_blksz_shift = 0;
201 /*
202 * Block size must be at least as large as a sector and a multiple of 2.
203 */
204 if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
205 goto out_free_backup_vhdr;
206 sbi->alloc_blksz = blocksize;
207 sbi->alloc_blksz_shift = 0;
148 while ((blocksize >>= 1) != 0) 208 while ((blocksize >>= 1) != 0)
149 HFSPLUS_SB(sb).alloc_blksz_shift++;
209 sbi->alloc_blksz_shift++;
150 blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
210 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
151 211
152 /* align block size to block offset */
212 /*
213 * Align block size to block offset.
214 */
153 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) 215 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
154 blocksize >>= 1; 216 blocksize >>= 1;
155 217
156 if (sb_set_blocksize(sb, blocksize) != blocksize) { 218 if (sb_set_blocksize(sb, blocksize) != blocksize) {
157 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
158 return -EINVAL;
219 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
220 blocksize);
221 goto out_free_backup_vhdr;
159 } 222 }
160 223
161 HFSPLUS_SB(sb).blockoffset = part_start >>
162 (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
163 HFSPLUS_SB(sb).sect_count = part_size;
164 HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
165 sb->s_blocksize_bits;
166
167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
168 if (!bh)
169 return -EIO;
170
171 /* should still be the same... */
172 if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
173 cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
174 cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
175 goto error;
176 HFSPLUS_SB(sb).s_vhbh = bh;
177 HFSPLUS_SB(sb).s_vhdr = vhdr;
178
224 sbi->blockoffset =
225 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
226 sbi->part_start = part_start;
227 sbi->sect_count = part_size;
228 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
179 return 0; 229 return 0;
180 error: 230
181 brelse(bh);
182 return -EINVAL;
231out_free_backup_vhdr:
232 kfree(sbi->s_backup_vhdr);
233out_free_vhdr:
234 kfree(sbi->s_vhdr);
235out:
236 return error;
183} 237}
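Two idioms in the rewritten block-size checks above are worth spelling out: ((blocksize - 1) & blocksize) is zero exactly when blocksize is a power of two, and the shift loop derives log2 of the block size. A minimal standalone sketch (the 4096 value is only an example, not taken from any volume header):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;		/* assumed example value */
	unsigned int shift = 0, tmp = blocksize;

	assert(blocksize >= 512);			/* sector-size floor */
	assert(((blocksize - 1) & blocksize) == 0);	/* power of two */

	while ((tmp >>= 1) != 0)			/* derives log2() */
		shift++;
	printf("alloc_blksz_shift = %u\n", shift);	/* prints 12 */
	return 0;
}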
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6bbd75c5589b..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
28 * #define ATTR_KILL_SUID 2048 28 * #define ATTR_KILL_SUID 2048
29 * #define ATTR_KILL_SGID 4096 29 * #define ATTR_KILL_SGID 4096
30 * 30 *
31 * and this is because they were added in 2.5 development in this patch: 31 * and this is because they were added in 2.5 development.
32 *
33 * http://linux.bkbits.net:8080/linux-2.5/
34 * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
35 * |src/.|src/include|src/include/linux|related/include/linux/fs.h
36 *
37 * Actually, they are not needed by most ->setattr() methods - they are set by 32 * Actually, they are not needed by most ->setattr() methods - they are set by
38 * callers of notify_change() to notify that the setuid/setgid bits must be 33 * callers of notify_change() to notify that the setuid/setgid bits must be
39 * dropped. 34 * dropped.
@@ -96,7 +91,6 @@ extern int rename_file(char *from, char *to);
96extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 91extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
97 long long *bfree_out, long long *bavail_out, 92 long long *bfree_out, long long *bavail_out,
98 long long *files_out, long long *ffree_out, 93 long long *files_out, long long *ffree_out,
99 void *fsid_out, int fsid_size, long *namelen_out, 94 void *fsid_out, int fsid_size, long *namelen_out);
100 long *spare_out);
101 95
102#endif 96#endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
32 32
33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) 33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
34 34
35static int hostfs_d_delete(struct dentry *dentry) 35static int hostfs_d_delete(const struct dentry *dentry)
36{ 36{
37 return 1; 37 return 1;
38} 38}
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
92 92
93static char *__dentry_name(struct dentry *dentry, char *name) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 char *p = __dentry_path(dentry, name, PATH_MAX); 95 char *p = dentry_path_raw(dentry, name, PATH_MAX);
96 char *root; 96 char *root;
97 size_t len; 97 size_t len;
98 98
99 spin_unlock(&dcache_lock);
100
101 root = dentry->d_sb->s_fs_info; 99 root = dentry->d_sb->s_fs_info;
102 len = strlen(root); 100 len = strlen(root);
103 if (IS_ERR(p)) { 101 if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
123 if (!name) 121 if (!name)
124 return NULL; 122 return NULL;
125 123
126 spin_lock(&dcache_lock);
127 return __dentry_name(dentry, name); /* will unlock */ 124 return __dentry_name(dentry, name); /* will unlock */
128} 125}
129 126
130static char *inode_name(struct inode *ino) 127static char *inode_name(struct inode *ino)
131{ 128{
132 struct dentry *dentry; 129 struct dentry *dentry;
133 char *name = __getname();
134 if (!name)
135 return NULL;
130 char *name;
136 131
137 spin_lock(&dcache_lock);
138 if (list_empty(&ino->i_dentry)) {
139 spin_unlock(&dcache_lock);
140 __putname(name);
132 dentry = d_find_alias(ino);
133 if (!dentry)
141 return NULL; 134 return NULL;
142 }
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
144 return __dentry_name(dentry, name); /* will unlock */
135
136 name = dentry_name(dentry);
137
138 dput(dentry);
139
140 return name;
145} 141}
146 142
147static char *follow_link(char *link) 143static char *follow_link(char *link)
@@ -217,7 +213,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
217 err = do_statfs(dentry->d_sb->s_fs_info, 213 err = do_statfs(dentry->d_sb->s_fs_info,
218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, 214 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 215 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
220 &sf->f_namelen, sf->f_spare); 216 &sf->f_namelen);
221 if (err) 217 if (err)
222 return err; 218 return err;
223 sf->f_blocks = f_blocks; 219 sf->f_blocks = f_blocks;
@@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
251 } 247 }
252} 248}
253 249
254static void hostfs_destroy_inode(struct inode *inode) 250static void hostfs_i_callback(struct rcu_head *head)
255{ 251{
252 struct inode *inode = container_of(head, struct inode, i_rcu);
253 INIT_LIST_HEAD(&inode->i_dentry);
256 kfree(HOSTFS_I(inode)); 254 kfree(HOSTFS_I(inode));
257} 255}
258 256
257static void hostfs_destroy_inode(struct inode *inode)
258{
259 call_rcu(&inode->i_rcu, hostfs_i_callback);
260}
261
259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 262static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
260{ 263{
261 const char *root_path = vfs->mnt_sb->s_fs_info; 264 const char *root_path = vfs->mnt_sb->s_fs_info;
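The hostfs_i_callback() added above recovers the inode from its embedded rcu_head via container_of(), then frees it once the RCU grace period has passed. A standalone sketch of just the pointer arithmetic; struct demo_inode and the direct callback invocation are illustrative, not kernel API:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head { struct rcu_head *next; };	/* shape only */

struct demo_inode {
	int ino;
	struct rcu_head i_rcu;			/* embedded, like inode->i_rcu */
};

static void i_callback(struct rcu_head *head)
{
	/* recover the enclosing structure from the embedded member */
	struct demo_inode *inode = container_of(head, struct demo_inode, i_rcu);

	printf("freeing inode %d\n", inode->ino);
	free(inode);
}

int main(void)
{
	struct demo_inode *inode = malloc(sizeof(*inode));

	if (!inode)
		return 1;
	inode->ino = 42;
	/* the kernel would pass &inode->i_rcu to call_rcu(); the callback
	 * is invoked directly here to show the pointer arithmetic */
	i_callback(&inode->i_rcu);
	return 0;
}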
@@ -609,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
609 goto out_put; 612 goto out_put;
610 613
611 d_add(dentry, inode); 614 d_add(dentry, inode);
612 dentry->d_op = &hostfs_dentry_ops;
613 return NULL; 615 return NULL;
614 616
615 out_put: 617 out_put:
@@ -746,11 +748,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
746 return err; 748 return err;
747} 749}
748 750
749int hostfs_permission(struct inode *ino, int desired) 751int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
750{ 752{
751 char *name; 753 char *name;
752 int r = 0, w = 0, x = 0, err; 754 int r = 0, w = 0, x = 0, err;
753 755
756 if (flags & IPERM_FLAG_RCU)
757 return -ECHILD;
758
754 if (desired & MAY_READ) r = 1; 759 if (desired & MAY_READ) r = 1;
755 if (desired & MAY_WRITE) w = 1; 760 if (desired & MAY_WRITE) w = 1;
756 if (desired & MAY_EXEC) x = 1; 761 if (desired & MAY_EXEC) x = 1;
@@ -765,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired)
765 err = access_file(name, r, w, x); 770 err = access_file(name, r, w, x);
766 __putname(name); 771 __putname(name);
767 if (!err) 772 if (!err)
768 err = generic_permission(ino, desired, NULL); 773 err = generic_permission(ino, desired, flags, NULL);
769 return err; 774 return err;
770} 775}
771 776
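hostfs_permission() splits the requested access mask into read/write/execute bits before asking the host. A standalone sketch of that decoding; the MAY_* values below mirror include/linux/fs.h of this era but are restated here as assumptions:

#include <stdio.h>

#define MAY_EXEC  0x01	/* assumed to match include/linux/fs.h */
#define MAY_WRITE 0x02
#define MAY_READ  0x04

static void decode(int desired)
{
	int r = !!(desired & MAY_READ);
	int w = !!(desired & MAY_WRITE);
	int x = !!(desired & MAY_EXEC);

	printf("r=%d w=%d x=%d\n", r, w, x);
}

int main(void)
{
	decode(MAY_READ | MAY_EXEC);	/* prints r=1 w=0 x=1 */
	return 0;
}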
@@ -916,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
916 sb->s_blocksize_bits = 10; 921 sb->s_blocksize_bits = 10;
917 sb->s_magic = HOSTFS_SUPER_MAGIC; 922 sb->s_magic = HOSTFS_SUPER_MAGIC;
918 sb->s_op = &hostfs_sbops; 923 sb->s_op = &hostfs_sbops;
924 sb->s_d_op = &hostfs_dentry_ops;
919 sb->s_maxbytes = MAX_LFS_FILESIZE; 925 sb->s_maxbytes = MAX_LFS_FILESIZE;
920 926
921 /* NULL is printed as <NULL> by sprintf: avoid that. */ 927 /* NULL is printed as <NULL> by sprintf: avoid that. */
@@ -962,11 +968,11 @@ out:
962 return err; 968 return err;
963} 969}
964 970
965static int hostfs_read_sb(struct file_system_type *type, 971static struct dentry *hostfs_read_sb(struct file_system_type *type,
966 int flags, const char *dev_name, 972 int flags, const char *dev_name,
967 void *data, struct vfsmount *mnt) 973 void *data)
968{ 974{
969 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); 975 return mount_nodev(type, flags, data, hostfs_fill_sb_common);
970} 976}
971 977
972static void hostfs_kill_sb(struct super_block *s) 978static void hostfs_kill_sb(struct super_block *s)
@@ -978,7 +984,7 @@ static void hostfs_kill_sb(struct super_block *s)
978static struct file_system_type hostfs_type = { 984static struct file_system_type hostfs_type = {
979 .owner = THIS_MODULE, 985 .owner = THIS_MODULE,
980 .name = "hostfs", 986 .name = "hostfs",
981 .get_sb = hostfs_read_sb, 987 .mount = hostfs_read_sb,
982 .kill_sb = hostfs_kill_sb, 988 .kill_sb = hostfs_kill_sb,
983 .fs_flags = 0, 989 .fs_flags = 0,
984}; 990};
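The .get_sb to .mount conversion seen here recurs below for hpfs, hppfs, and hugetlbfs: the handler now returns the root dentry instead of filling in a vfsmount out-parameter. A kernel-style sketch of the new shape (examplefs and examplefs_fill_super are placeholders; this fragment is not standalone code):

static struct dentry *examplefs_mount(struct file_system_type *type,
	int flags, const char *dev_name, void *data)
{
	/* return the root dentry; no vfsmount out-parameter any more */
	return mount_nodev(type, flags, data, examplefs_fill_super);
}

static struct file_system_type examplefs_type = {
	.owner   = THIS_MODULE,
	.name    = "examplefs",
	.mount   = examplefs_mount,
	.kill_sb = kill_anon_super,
};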
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
94 94
95 dir = opendir(path); 95 dir = opendir(path);
96 *err_out = errno; 96 *err_out = errno;
97 if (dir == NULL) 97
98 return NULL;
99 return dir; 98 return dir;
100} 99}
101 100
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
205 if (attrs->ia_valid & HOSTFS_ATTR_MODE) { 204 if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
206 if (fd >= 0) { 205 if (fd >= 0) {
207 if (fchmod(fd, attrs->ia_mode) != 0) 206 if (fchmod(fd, attrs->ia_mode) != 0)
208 return (-errno); 207 return -errno;
209 } else if (chmod(file, attrs->ia_mode) != 0) { 208 } else if (chmod(file, attrs->ia_mode) != 0) {
210 return -errno; 209 return -errno;
211 } 210 }
@@ -364,8 +363,7 @@ int rename_file(char *from, char *to)
364int do_statfs(char *root, long *bsize_out, long long *blocks_out, 363int do_statfs(char *root, long *bsize_out, long long *blocks_out,
365 long long *bfree_out, long long *bavail_out, 364 long long *bfree_out, long long *bavail_out,
366 long long *files_out, long long *ffree_out, 365 long long *files_out, long long *ffree_out,
367 void *fsid_out, int fsid_size, long *namelen_out, 366 void *fsid_out, int fsid_size, long *namelen_out)
368 long *spare_out)
369{ 367{
370 struct statfs64 buf; 368 struct statfs64 buf;
371 int err; 369 int err;
@@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
384 sizeof(buf.f_fsid) > fsid_size ? fsid_size : 382 sizeof(buf.f_fsid) > fsid_size ? fsid_size :
385 sizeof(buf.f_fsid)); 383 sizeof(buf.f_fsid));
386 *namelen_out = buf.f_namelen; 384 *namelen_out = buf.f_namelen;
387 spare_out[0] = buf.f_spare[0]; 385
388 spare_out[1] = buf.f_spare[1];
389 spare_out[2] = buf.f_spare[2];
390 spare_out[3] = buf.f_spare[3];
391 spare_out[4] = buf.f_spare[4];
392 return 0; 386 return 0;
393} 387}
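do_statfs() is a thin wrapper over the host's statfs64(), and the f_spare plumbing could go because no caller consumed it. A standalone demo of the underlying call on a glibc system (the "/" path is arbitrary):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/statfs.h>

int main(void)
{
	struct statfs64 buf;

	if (statfs64("/", &buf) != 0) {
		perror("statfs64");
		return 1;
	}
	printf("bsize=%ld blocks=%lld bfree=%lld namelen=%ld\n",
	       (long)buf.f_bsize, (long long)buf.f_blocks,
	       (long long)buf.f_bfree, (long)buf.f_namelen);
	return 0;
}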
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix
4 help 5 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e3..793cb9d943d2 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
14#ifdef DEBUG_LOCKS 14#ifdef DEBUG_LOCKS
15 printk("lock creation\n"); 15 printk("lock creation\n");
16#endif 16#endif
17 down(&hpfs_sb(s)->hpfs_creation_de); 17 mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
18} 18}
19 19
20void hpfs_unlock_creation(struct super_block *s) 20void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
22#ifdef DEBUG_LOCKS 22#ifdef DEBUG_LOCKS
23 printk("unlock creation\n"); 23 printk("unlock creation\n");
24#endif 24#endif
25 up(&hpfs_sb(s)->hpfs_creation_de); 25 mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
26} 26}
27 27
28/* Map a sector into a buffer and return pointers to it and to the buffer. */ 28/* Map a sector into a buffer and return pointers to it and to the buffer. */
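The down()/up() to mutex_lock()/mutex_unlock() change here pairs with the struct semaphore to struct mutex conversion in hpfs_fn.h below. A userspace analogue with pthreads; the names echo the hpfs helpers but the code is purely illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t creation_lock = PTHREAD_MUTEX_INITIALIZER;

static void lock_creation(void)
{
	pthread_mutex_lock(&creation_lock);
}

static void unlock_creation(void)
{
	pthread_mutex_unlock(&creation_lock);
}

int main(void)
{
	lock_creation();
	printf("dirent creation serialized\n");
	unlock_creation();
	return 0;
}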
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5f..05d4816e4e77 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
16 struct qstr *qstr)
16{ 17{
17 unsigned long hash; 18 unsigned long hash;
18 int i; 19 int i;
@@ -34,29 +35,30 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
34 return 0; 35 return 0;
35} 36}
36 37
37static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
38static int hpfs_compare_dentry(const struct dentry *parent,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name)
38{ 42{
39 unsigned al=a->len;
40 unsigned bl=b->len;
41 hpfs_adjust_length(a->name, &al);
43 unsigned al = len;
44 unsigned bl = name->len;
45
46 hpfs_adjust_length(str, &al);
42 /*hpfs_adjust_length(b->name, &bl);*/ 47 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first.
45 */
48
49 /*
50 * 'str' is the name of an already existing dentry, so the name
51 * must be valid. 'name' must be validated first.
52 */
46 53
47 if (hpfs_chk_name(b->name, &bl))
54 if (hpfs_chk_name(name->name, &bl))
48 return 1; 55 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
56 if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
50 return 1; 57 return 1;
51 return 0; 58 return 0;
52} 59}
53 60
54static const struct dentry_operations hpfs_dentry_operations = { 61const struct dentry_operations hpfs_dentry_operations = {
55 .d_hash = hpfs_hash_dentry, 62 .d_hash = hpfs_hash_dentry,
56 .d_compare = hpfs_compare_dentry, 63 .d_compare = hpfs_compare_dentry,
57}; 64};
58
59void hpfs_set_dentry_operations(struct dentry *dentry)
60{
61 dentry->d_op = &hpfs_dentry_operations;
62}
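A case-insensitive filesystem needs ->d_hash and ->d_compare to agree: hash the folded form of the name, then compare with the same folding. A toy standalone version; partial_name_hash() below is a simplified stand-in for the kernel helper of the same name:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for the kernel's partial_name_hash() */
static unsigned long partial_name_hash(unsigned long c, unsigned long prev)
{
	return (prev + (c << 4) + (c >> 4)) * 11;
}

static unsigned long fold_hash(const char *name)
{
	unsigned long hash = 0;
	size_t i, len = strlen(name);

	for (i = 0; i < len; i++)
		hash = partial_name_hash(tolower((unsigned char)name[i]), hash);
	return hash;
}

int main(void)
{
	/* case variants must land in the same hash chain */
	printf("%d\n", fold_hash("README") == fold_hash("readme"));	/* 1 */
	return 0;
}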
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceba..d32f63a569f7 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 298
299 end: 299 end:
300 end_add: 300 end_add:
301 hpfs_set_dentry_operations(dentry);
302 unlock_kernel(); 301 unlock_kernel();
303 d_add(dentry, result); 302 d_add(dentry, result);
304 return NULL; 303 return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b59eac0232a0..1c43dbea55e8 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
87 unsigned *sb_bmp_dir; /* main bitmap directory */ 87 unsigned *sb_bmp_dir; /* main bitmap directory */
88 unsigned sb_c_bitmap; /* current bitmap */ 88 unsigned sb_c_bitmap; /* current bitmap */
89 unsigned sb_max_fwd_alloc; /* max forward allocation */ 89 unsigned sb_max_fwd_alloc; /* max forward allocation */
90 struct semaphore hpfs_creation_de; /* when creating dirents, nobody else 90 struct mutex hpfs_creation_de; /* when creating dirents, nobody else
91 can alloc blocks */ 91 can alloc blocks */
92 /*unsigned sb_mounting : 1;*/ 92 /*unsigned sb_mounting : 1;*/
93 int sb_timeshift; 93 int sb_timeshift;
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
233 233
234/* dentry.c */ 234/* dentry.c */
235 235
236void hpfs_set_dentry_operations(struct dentry *); 236extern const struct dentry_operations hpfs_dentry_operations;
237 237
238/* dir.c */ 238/* dir.c */
239 239
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
281 attr->ia_size != i_size_read(inode)) { 281 attr->ia_size != i_size_read(inode)) {
282 error = vmtruncate(inode, attr->ia_size); 282 error = vmtruncate(inode, attr->ia_size);
283 if (error) 283 if (error)
284 return error; 284 goto out_unlock;
285 } 285 }
286 286
287 setattr_copy(inode, attr); 287 setattr_copy(inode, attr);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f65..f4ad9e31ddc4 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
419 unlock_kernel(); 419 unlock_kernel();
420 return -ENOSPC; 420 return -ENOSPC;
421 } 421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) || 422 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
423 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
424 get_write_access(inode)) { 424 get_write_access(inode)) {
425 d_rehash(dentry); 425 d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 2607010be2fe..b30426b1fc97 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
177 return &ei->vfs_inode; 177 return &ei->vfs_inode;
178} 178}
179 179
180static void hpfs_destroy_inode(struct inode *inode) 180static void hpfs_i_callback(struct rcu_head *head)
181{ 181{
182 struct inode *inode = container_of(head, struct inode, i_rcu);
183 INIT_LIST_HEAD(&inode->i_dentry);
182 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); 184 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
183} 185}
184 186
187static void hpfs_destroy_inode(struct inode *inode)
188{
189 call_rcu(&inode->i_rcu, hpfs_i_callback);
190}
191
185static void init_once(void *foo) 192static void init_once(void *foo)
186{ 193{
187 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; 194 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
@@ -477,17 +484,21 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
477 484
478 int o; 485 int o;
479 486
487 lock_kernel();
488
480 save_mount_options(s, options); 489 save_mount_options(s, options);
481 490
482 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
483 if (!sbi) 492 if (!sbi) {
493 unlock_kernel();
484 return -ENOMEM; 494 return -ENOMEM;
495 }
485 s->s_fs_info = sbi; 496 s->s_fs_info = sbi;
486 497
487 sbi->sb_bmp_dir = NULL; 498 sbi->sb_bmp_dir = NULL;
488 sbi->sb_cp_table = NULL; 499 sbi->sb_cp_table = NULL;
489 500
490 init_MUTEX(&sbi->hpfs_creation_de); 501 mutex_init(&sbi->hpfs_creation_de);
491 502
492 uid = current_uid(); 503 uid = current_uid();
493 gid = current_gid(); 504 gid = current_gid();
@@ -539,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
539 /* Fill superblock stuff */ 550 /* Fill superblock stuff */
540 s->s_magic = HPFS_SUPER_MAGIC; 551 s->s_magic = HPFS_SUPER_MAGIC;
541 s->s_op = &hpfs_sops; 552 s->s_op = &hpfs_sops;
553 s->s_d_op = &hpfs_dentry_operations;
542 554
543 sbi->sb_root = superblock->root; 555 sbi->sb_root = superblock->root;
544 sbi->sb_fs_size = superblock->n_sectors; 556 sbi->sb_fs_size = superblock->n_sectors;
@@ -640,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
640 iput(root); 652 iput(root);
641 goto bail0; 653 goto bail0;
642 } 654 }
643 hpfs_set_dentry_operations(s->s_root);
644 655
645 /* 656 /*
646 * find the root directory's . pointer & finish filling in the inode 657 * find the root directory's . pointer & finish filling in the inode
@@ -666,6 +677,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
666 root->i_blocks = 5; 677 root->i_blocks = 5;
667 hpfs_brelse4(&qbh); 678 hpfs_brelse4(&qbh);
668 } 679 }
680 unlock_kernel();
669 return 0; 681 return 0;
670 682
671bail4: brelse(bh2); 683bail4: brelse(bh2);
@@ -677,20 +689,20 @@ bail0:
677 kfree(sbi->sb_cp_table); 689 kfree(sbi->sb_cp_table);
678 s->s_fs_info = NULL; 690 s->s_fs_info = NULL;
679 kfree(sbi); 691 kfree(sbi);
692 unlock_kernel();
680 return -EINVAL; 693 return -EINVAL;
681} 694}
682 695
683static int hpfs_get_sb(struct file_system_type *fs_type, 696static struct dentry *hpfs_mount(struct file_system_type *fs_type,
684 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 697 int flags, const char *dev_name, void *data)
685{ 698{
686 return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, 699 return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
687 mnt);
688} 700}
689 701
690static struct file_system_type hpfs_fs_type = { 702static struct file_system_type hpfs_fs_type = {
691 .owner = THIS_MODULE, 703 .owner = THIS_MODULE,
692 .name = "hpfs", 704 .name = "hpfs",
693 .get_sb = hpfs_get_sb, 705 .mount = hpfs_mount,
694 .kill_sb = kill_block_super, 706 .kill_sb = kill_block_super,
695 .fs_flags = FS_REQUIRES_DEV, 707 .fs_flags = FS_REQUIRES_DEV,
696}; 708};
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7b027720d820..87ed48e0343d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -598,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
598 .readdir = hppfs_readdir, 598 .readdir = hppfs_readdir,
599 .open = hppfs_dir_open, 599 .open = hppfs_dir_open,
600 .fsync = hppfs_fsync, 600 .fsync = hppfs_fsync,
601 .llseek = default_llseek,
601}; 602};
602 603
603static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) 604static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -631,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
631 mntput(ino->i_sb->s_fs_info); 632 mntput(ino->i_sb->s_fs_info);
632} 633}
633 634
634static void hppfs_destroy_inode(struct inode *inode) 635static void hppfs_i_callback(struct rcu_head *head)
635{ 636{
637 struct inode *inode = container_of(head, struct inode, i_rcu);
638 INIT_LIST_HEAD(&inode->i_dentry);
636 kfree(HPPFS_I(inode)); 639 kfree(HPPFS_I(inode));
637} 640}
638 641
642static void hppfs_destroy_inode(struct inode *inode)
643{
644 call_rcu(&inode->i_rcu, hppfs_i_callback);
645}
646
639static const struct super_operations hppfs_sbops = { 647static const struct super_operations hppfs_sbops = {
640 .alloc_inode = hppfs_alloc_inode, 648 .alloc_inode = hppfs_alloc_inode,
641 .destroy_inode = hppfs_destroy_inode, 649 .destroy_inode = hppfs_destroy_inode,
@@ -747,17 +755,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
747 return(err); 755 return(err);
748} 756}
749 757
750static int hppfs_read_super(struct file_system_type *type, 758static struct dentry *hppfs_read_super(struct file_system_type *type,
751 int flags, const char *dev_name, 759 int flags, const char *dev_name,
752 void *data, struct vfsmount *mnt) 760 void *data)
753{ 761{
754 return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); 762 return mount_nodev(type, flags, data, hppfs_fill_super);
755} 763}
756 764
757static struct file_system_type hppfs_type = { 765static struct file_system_type hppfs_type = {
758 .owner = THIS_MODULE, 766 .owner = THIS_MODULE,
759 .name = "hppfs", 767 .name = "hppfs",
760 .get_sb = hppfs_read_super, 768 .mount = hppfs_read_super,
761 .kill_sb = kill_anon_super, 769 .kill_sb = kill_anon_super,
762 .fs_flags = 0, 770 .fs_flags = 0,
763}; 771};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..9885082b470f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/magic.h> 33#include <linux/magic.h>
34#include <linux/migrate.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
455 inode = new_inode(sb); 456 inode = new_inode(sb);
456 if (inode) { 457 if (inode) {
457 struct hugetlbfs_inode_info *info; 458 struct hugetlbfs_inode_info *info;
459 inode->i_ino = get_next_ino();
458 inode->i_mode = mode; 460 inode->i_mode = mode;
459 inode->i_uid = uid; 461 inode->i_uid = uid;
460 inode->i_gid = gid; 462 inode->i_gid = gid;
@@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
573 return 0; 575 return 0;
574} 576}
575 577
578static int hugetlbfs_migrate_page(struct address_space *mapping,
579 struct page *newpage, struct page *page)
580{
581 int rc;
582
583 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
584 if (rc)
585 return rc;
586 migrate_page_copy(newpage, page);
587
588 return 0;
589}
590
576static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 591static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
577{ 592{
578 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 593 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -648,17 +663,25 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
648 return &p->vfs_inode; 663 return &p->vfs_inode;
649} 664}
650 665
666static void hugetlbfs_i_callback(struct rcu_head *head)
667{
668 struct inode *inode = container_of(head, struct inode, i_rcu);
669 INIT_LIST_HEAD(&inode->i_dentry);
670 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
671}
672
651static void hugetlbfs_destroy_inode(struct inode *inode) 673static void hugetlbfs_destroy_inode(struct inode *inode)
652{ 674{
653 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 675 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
654 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 676 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
655 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 677 call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
656} 678}
657 679
658static const struct address_space_operations hugetlbfs_aops = { 680static const struct address_space_operations hugetlbfs_aops = {
659 .write_begin = hugetlbfs_write_begin, 681 .write_begin = hugetlbfs_write_begin,
660 .write_end = hugetlbfs_write_end, 682 .write_end = hugetlbfs_write_end,
661 .set_page_dirty = hugetlbfs_set_page_dirty, 683 .set_page_dirty = hugetlbfs_set_page_dirty,
684 .migratepage = hugetlbfs_migrate_page,
662}; 685};
663 686
664 687
@@ -674,6 +697,7 @@ const struct file_operations hugetlbfs_file_operations = {
674 .mmap = hugetlbfs_file_mmap, 697 .mmap = hugetlbfs_file_mmap,
675 .fsync = noop_fsync, 698 .fsync = noop_fsync,
676 .get_unmapped_area = hugetlb_get_unmapped_area, 699 .get_unmapped_area = hugetlb_get_unmapped_area,
700 .llseek = default_llseek,
677}; 701};
678 702
679static const struct inode_operations hugetlbfs_dir_inode_operations = { 703static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -879,15 +903,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
879 } 903 }
880} 904}
881 905
882static int hugetlbfs_get_sb(struct file_system_type *fs_type, 906static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
883 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 907 int flags, const char *dev_name, void *data)
884{ 908{
885 return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); 909 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
886} 910}
887 911
888static struct file_system_type hugetlbfs_fs_type = { 912static struct file_system_type hugetlbfs_fs_type = {
889 .name = "hugetlbfs", 913 .name = "hugetlbfs",
890 .get_sb = hugetlbfs_get_sb, 914 .mount = hugetlbfs_mount,
891 .kill_sb = kill_litter_super, 915 .kill_sb = kill_litter_super,
892}; 916};
893 917
@@ -915,8 +939,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
915 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 939 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
916 *user = current_user(); 940 *user = current_user();
917 if (user_shm_lock(size, *user)) { 941 if (user_shm_lock(size, *user)) {
918 WARN_ONCE(1, 942 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
919 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
920 } else { 943 } else {
921 *user = NULL; 944 *user = NULL;
922 return ERR_PTR(-EPERM); 945 return ERR_PTR(-EPERM);
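printk_once() used above emits its message a single time no matter how often the path runs; the mechanism is just a function-local static flag. A standalone rendition (print_once is a made-up name):

#include <stdio.h>

#define print_once(...) do {				\
	static int warned;				\
	if (!warned) {					\
		warned = 1;				\
		printf(__VA_ARGS__);			\
	}						\
} while (0)

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		print_once("deprecated interface used\n");	/* prints once */
	return 0;
}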
diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..da85e56378f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,11 +24,11 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h>
27 28
28/* 29/*
29 * This is needed for the following functions: 30 * This is needed for the following functions:
30 * - inode_has_buffers 31 * - inode_has_buffers
31 * - invalidate_inode_buffers
32 * - invalidate_bdev 32 * - invalidate_bdev
33 * 33 *
34 * FIXME: remove all knowledge of the buffer layer from this file 34 * FIXME: remove all knowledge of the buffer layer from this file
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
72 * allowing for low-overhead inode sync() operations. 72 * allowing for low-overhead inode sync() operations.
73 */ 73 */
74 74
75LIST_HEAD(inode_in_use); 75static LIST_HEAD(inode_lru);
76LIST_HEAD(inode_unused);
77static struct hlist_head *inode_hashtable __read_mostly; 76static struct hlist_head *inode_hashtable __read_mostly;
78 77
79/* 78/*
@@ -103,8 +102,43 @@ static DECLARE_RWSEM(iprune_sem);
103 */ 102 */
104struct inodes_stat_t inodes_stat; 103struct inodes_stat_t inodes_stat;
105 104
105static DEFINE_PER_CPU(unsigned int, nr_inodes);
106
106static struct kmem_cache *inode_cachep __read_mostly; 107static struct kmem_cache *inode_cachep __read_mostly;
107 108
109static int get_nr_inodes(void)
110{
111 int i;
112 int sum = 0;
113 for_each_possible_cpu(i)
114 sum += per_cpu(nr_inodes, i);
115 return sum < 0 ? 0 : sum;
116}
117
118static inline int get_nr_inodes_unused(void)
119{
120 return inodes_stat.nr_unused;
121}
122
123int get_nr_dirty_inodes(void)
124{
125 /* not actually dirty inodes, but a wild approximation */
126 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
127 return nr_dirty > 0 ? nr_dirty : 0;
128}
129
130/*
131 * Handle nr_inode sysctl
132 */
133#ifdef CONFIG_SYSCTL
134int proc_nr_inodes(ctl_table *table, int write,
135 void __user *buffer, size_t *lenp, loff_t *ppos)
136{
137 inodes_stat.nr_inodes = get_nr_inodes();
138 return proc_dointvec(table, write, buffer, lenp, ppos);
139}
140#endif
141
108static void wake_up_inode(struct inode *inode) 142static void wake_up_inode(struct inode *inode)
109{ 143{
110 /* 144 /*
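The new nr_inodes accounting trades exactness for scalability: every CPU bumps a private counter, and readers sum all of them, clamping at zero because the unsynchronized snapshot can be transiently negative. A standalone approximation with an array standing in for per-cpu storage:

#include <stdio.h>

#define NR_CPUS 4
static int nr_inodes[NR_CPUS];		/* one slot per CPU */

static int get_nr_inodes(void)
{
	int i, sum = 0;

	for (i = 0; i < NR_CPUS; i++)
		sum += nr_inodes[i];
	return sum < 0 ? 0 : sum;	/* snapshot may be transiently negative */
}

int main(void)
{
	nr_inodes[0] += 3;	/* allocations counted on CPU 0 */
	nr_inodes[2] -= 1;	/* a free on CPU 2 can drive its slot negative */
	printf("nr_inodes = %d\n", get_nr_inodes());	/* prints 2 */
	return 0;
}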
@@ -192,6 +226,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
192 inode->i_fsnotify_mask = 0; 226 inode->i_fsnotify_mask = 0;
193#endif 227#endif
194 228
229 this_cpu_inc(nr_inodes);
230
195 return 0; 231 return 0;
196out: 232out:
197 return -ENOMEM; 233 return -ENOMEM;
@@ -221,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
221 return inode; 257 return inode;
222} 258}
223 259
260void free_inode_nonrcu(struct inode *inode)
261{
262 kmem_cache_free(inode_cachep, inode);
263}
264EXPORT_SYMBOL(free_inode_nonrcu);
265
224void __destroy_inode(struct inode *inode) 266void __destroy_inode(struct inode *inode)
225{ 267{
226 BUG_ON(inode_has_buffers(inode)); 268 BUG_ON(inode_has_buffers(inode));
@@ -232,16 +274,25 @@ void __destroy_inode(struct inode *inode)
232 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 274 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
233 posix_acl_release(inode->i_default_acl); 275 posix_acl_release(inode->i_default_acl);
234#endif 276#endif
277 this_cpu_dec(nr_inodes);
235} 278}
236EXPORT_SYMBOL(__destroy_inode); 279EXPORT_SYMBOL(__destroy_inode);
237 280
238void destroy_inode(struct inode *inode) 281static void i_callback(struct rcu_head *head)
239{ 282{
283 struct inode *inode = container_of(head, struct inode, i_rcu);
284 INIT_LIST_HEAD(&inode->i_dentry);
285 kmem_cache_free(inode_cachep, inode);
286}
287
288static void destroy_inode(struct inode *inode)
289{
290 BUG_ON(!list_empty(&inode->i_lru));
240 __destroy_inode(inode); 291 __destroy_inode(inode);
241 if (inode->i_sb->s_op->destroy_inode) 292 if (inode->i_sb->s_op->destroy_inode)
242 inode->i_sb->s_op->destroy_inode(inode); 293 inode->i_sb->s_op->destroy_inode(inode);
243 else 294 else
244 kmem_cache_free(inode_cachep, (inode)); 295 call_rcu(&inode->i_rcu, i_callback);
245} 296}
246 297
247/* 298/*
@@ -255,6 +306,8 @@ void inode_init_once(struct inode *inode)
255 INIT_HLIST_NODE(&inode->i_hash); 306 INIT_HLIST_NODE(&inode->i_hash);
256 INIT_LIST_HEAD(&inode->i_dentry); 307 INIT_LIST_HEAD(&inode->i_dentry);
257 INIT_LIST_HEAD(&inode->i_devices); 308 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru);
258 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
259 spin_lock_init(&inode->i_data.tree_lock); 312 spin_lock_init(&inode->i_data.tree_lock);
260 spin_lock_init(&inode->i_data.i_mmap_lock); 313 spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -281,13 +334,108 @@ static void init_once(void *foo)
281 */ 334 */
282void __iget(struct inode *inode) 335void __iget(struct inode *inode)
283{ 336{
284 if (atomic_inc_return(&inode->i_count) != 1) 337 atomic_inc(&inode->i_count);
285 return; 338}
339
340/*
341 * get additional reference to inode; caller must already hold one.
342 */
343void ihold(struct inode *inode)
344{
345 WARN_ON(atomic_inc_return(&inode->i_count) < 2);
346}
347EXPORT_SYMBOL(ihold);
348
349static void inode_lru_list_add(struct inode *inode)
350{
351 if (list_empty(&inode->i_lru)) {
352 list_add(&inode->i_lru, &inode_lru);
353 inodes_stat.nr_unused++;
354 }
355}
356
357static void inode_lru_list_del(struct inode *inode)
358{
359 if (!list_empty(&inode->i_lru)) {
360 list_del_init(&inode->i_lru);
361 inodes_stat.nr_unused--;
362 }
363}
364
365static inline void __inode_sb_list_add(struct inode *inode)
366{
367 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
368}
369
370/**
371 * inode_sb_list_add - add inode to the superblock list of inodes
372 * @inode: inode to add
373 */
374void inode_sb_list_add(struct inode *inode)
375{
376 spin_lock(&inode_lock);
377 __inode_sb_list_add(inode);
378 spin_unlock(&inode_lock);
379}
380EXPORT_SYMBOL_GPL(inode_sb_list_add);
381
382static inline void __inode_sb_list_del(struct inode *inode)
383{
384 list_del_init(&inode->i_sb_list);
385}
386
387static unsigned long hash(struct super_block *sb, unsigned long hashval)
388{
389 unsigned long tmp;
390
391 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
392 L1_CACHE_BYTES;
393 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
394 return tmp & I_HASHMASK;
395}
396
397/**
398 * __insert_inode_hash - hash an inode
399 * @inode: unhashed inode
400 * @hashval: unsigned long value used to locate this object in the
401 * inode_hashtable.
402 *
403 * Add an inode to the inode hash for this superblock.
404 */
405void __insert_inode_hash(struct inode *inode, unsigned long hashval)
406{
407 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
408
409 spin_lock(&inode_lock);
410 hlist_add_head(&inode->i_hash, b);
411 spin_unlock(&inode_lock);
412}
413EXPORT_SYMBOL(__insert_inode_hash);
414
415/**
416 * __remove_inode_hash - remove an inode from the hash
417 * @inode: inode to unhash
418 *
419 * Remove an inode from the superblock.
420 */
421static void __remove_inode_hash(struct inode *inode)
422{
423 hlist_del_init(&inode->i_hash);
424}
286 425
287 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 426/**
288 list_move(&inode->i_list, &inode_in_use); 427 * remove_inode_hash - remove an inode from the hash
289 inodes_stat.nr_unused--; 428 * @inode: inode to unhash
429 *
430 * Remove an inode from the superblock.
431 */
432void remove_inode_hash(struct inode *inode)
433{
434 spin_lock(&inode_lock);
435 hlist_del_init(&inode->i_hash);
436 spin_unlock(&inode_lock);
290} 437}
438EXPORT_SYMBOL(remove_inode_hash);
291 439
292void end_writeback(struct inode *inode) 440void end_writeback(struct inode *inode)
293{ 441{
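The relocated hash() helper mixes the superblock pointer into the inode number and folds the high bits back down before masking to a table index. A standalone rendition; GOLDEN_RATIO_PRIME is the 32-bit kernel constant of this era, while I_HASHBITS is arbitrary here because the kernel sizes the table at boot:

#include <stdio.h>

#define GOLDEN_RATIO_PRIME 0x9e370001UL	/* 32-bit kernel value */
#define I_HASHBITS 14			/* arbitrary for this demo */
#define I_HASHMASK ((1UL << I_HASHBITS) - 1)
#define L1_CACHE_BYTES 64

static unsigned long hash(unsigned long sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
	return tmp & I_HASHMASK;
}

int main(void)
{
	/* the superblock address acts as a per-filesystem salt */
	printf("bucket = %lu\n", hash(0xdeadbeefUL, 1234));
	return 0;
}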
@@ -297,6 +445,7 @@ void end_writeback(struct inode *inode)
297 BUG_ON(!(inode->i_state & I_FREEING)); 445 BUG_ON(!(inode->i_state & I_FREEING));
298 BUG_ON(inode->i_state & I_CLEAR); 446 BUG_ON(inode->i_state & I_CLEAR);
299 inode_sync_wait(inode); 447 inode_sync_wait(inode);
448 /* don't need i_lock here, no concurrent mods to i_state */
300 inode->i_state = I_FREEING | I_CLEAR; 449 inode->i_state = I_FREEING | I_CLEAR;
301} 450}
302EXPORT_SYMBOL(end_writeback); 451EXPORT_SYMBOL(end_writeback);
@@ -327,101 +476,113 @@ static void evict(struct inode *inode)
327 */ 476 */
328static void dispose_list(struct list_head *head) 477static void dispose_list(struct list_head *head)
329{ 478{
330 int nr_disposed = 0;
331
332 while (!list_empty(head)) { 479 while (!list_empty(head)) {
333 struct inode *inode; 480 struct inode *inode;
334 481
335 inode = list_first_entry(head, struct inode, i_list); 482 inode = list_first_entry(head, struct inode, i_lru);
336 list_del(&inode->i_list); 483 list_del_init(&inode->i_lru);
337 484
338 evict(inode); 485 evict(inode);
339 486
340 spin_lock(&inode_lock); 487 spin_lock(&inode_lock);
341 hlist_del_init(&inode->i_hash); 488 __remove_inode_hash(inode);
342 list_del_init(&inode->i_sb_list); 489 __inode_sb_list_del(inode);
343 spin_unlock(&inode_lock); 490 spin_unlock(&inode_lock);
344 491
345 wake_up_inode(inode); 492 wake_up_inode(inode);
346 destroy_inode(inode); 493 destroy_inode(inode);
347 nr_disposed++;
348 } 494 }
349 spin_lock(&inode_lock);
350 inodes_stat.nr_inodes -= nr_disposed;
351 spin_unlock(&inode_lock);
352} 495}
353 496
354/* 497/**
355 * Invalidate all inodes for a device. 498 * evict_inodes - evict all evictable inodes for a superblock
499 * @sb: superblock to operate on
500 *
501 * Make sure that no inodes with zero refcount are retained. This is
502 * called by superblock shutdown after having MS_ACTIVE flag removed,
503 * so any inode reaching zero refcount during or after that call will
504 * be immediately evicted.
356 */ 505 */
357static int invalidate_list(struct list_head *head, struct list_head *dispose) 506void evict_inodes(struct super_block *sb)
358{ 507{
359 struct list_head *next; 508 struct inode *inode, *next;
360 int busy = 0, count = 0; 509 LIST_HEAD(dispose);
361
362 next = head->next;
363 for (;;) {
364 struct list_head *tmp = next;
365 struct inode *inode;
366 510
367 /* 511 down_write(&iprune_sem);
368 * We can reschedule here without worrying about the list's
369 * consistency because the per-sb list of inodes must not
370 * change during umount anymore, and because iprune_sem keeps
371 * shrink_icache_memory() away.
372 */
373 cond_resched_lock(&inode_lock);
374 512
375 next = next->next; 513 spin_lock(&inode_lock);
376 if (tmp == head) 514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
377 break; 515 if (atomic_read(&inode->i_count))
378 inode = list_entry(tmp, struct inode, i_sb_list);
379 if (inode->i_state & I_NEW)
380 continue; 516 continue;
381 invalidate_inode_buffers(inode); 517
382 if (!atomic_read(&inode->i_count)) { 518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
383 list_move(&inode->i_list, dispose); 519 WARN_ON(1);
384 WARN_ON(inode->i_state & I_NEW);
385 inode->i_state |= I_FREEING;
386 count++;
387 continue; 520 continue;
388 } 521 }
389 busy = 1; 522
523 inode->i_state |= I_FREEING;
524
525 /*
526 * Move the inode off the IO lists and LRU once I_FREEING is
527 * set so that it won't get moved back on there if it is dirty.
528 */
529 list_move(&inode->i_lru, &dispose);
530 list_del_init(&inode->i_wb_list);
531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
532 inodes_stat.nr_unused--;
390 } 533 }
391 /* only unused inodes may be cached with i_count zero */ 534 spin_unlock(&inode_lock);
392 inodes_stat.nr_unused -= count; 535
393 return busy; 536 dispose_list(&dispose);
537 up_write(&iprune_sem);
394} 538}
395 539
396/** 540/**
397 * invalidate_inodes - discard the inodes on a device 541 * invalidate_inodes - attempt to free all inodes on a superblock
398 * @sb: superblock 542 * @sb: superblock to operate on
399 * 543 *
400 * Discard all of the inodes for a given superblock. If the discard 544 * Attempts to free all inodes for a given superblock. If there were any
401 * fails because there are busy inodes then a non zero value is returned. 545 * busy inodes return a non-zero value, else zero.
402 * If the discard is successful all the inodes have been discarded.
403 */ 546 */
404int invalidate_inodes(struct super_block *sb) 547int invalidate_inodes(struct super_block *sb)
405{ 548{
406 int busy; 549 int busy = 0;
407 LIST_HEAD(throw_away); 550 struct inode *inode, *next;
551 LIST_HEAD(dispose);
408 552
409 down_write(&iprune_sem); 553 down_write(&iprune_sem);
554
410 spin_lock(&inode_lock); 555 spin_lock(&inode_lock);
411 fsnotify_unmount_inodes(&sb->s_inodes); 556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
412 busy = invalidate_list(&sb->s_inodes, &throw_away); 557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
558 continue;
559 if (atomic_read(&inode->i_count)) {
560 busy = 1;
561 continue;
562 }
563
564 inode->i_state |= I_FREEING;
565
566 /*
567 * Move the inode off the IO lists and LRU once I_FREEING is
568 * set so that it won't get moved back on there if it is dirty.
569 */
570 list_move(&inode->i_lru, &dispose);
571 list_del_init(&inode->i_wb_list);
572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
573 inodes_stat.nr_unused--;
574 }
413 spin_unlock(&inode_lock); 575 spin_unlock(&inode_lock);
414 576
415 dispose_list(&throw_away); 577 dispose_list(&dispose);
416 up_write(&iprune_sem); 578 up_write(&iprune_sem);
417 579
418 return busy; 580 return busy;
419} 581}
420EXPORT_SYMBOL(invalidate_inodes);
421 582
422static int can_unuse(struct inode *inode) 583static int can_unuse(struct inode *inode)
423{ 584{
424 if (inode->i_state) 585 if (inode->i_state & ~I_REFERENCED)
425 return 0; 586 return 0;
426 if (inode_has_buffers(inode)) 587 if (inode_has_buffers(inode))
427 return 0; 588 return 0;
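evict_inodes() and invalidate_inodes() share one pattern: while holding inode_lock, unlink victims onto a private dispose list, then do the expensive teardown after dropping the lock. A standalone model using a plain singly linked list in place of list_head:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int busy;
	struct node *next;
};

int main(void)
{
	struct node *head = NULL, *dispose = NULL, **pp, *n;
	int i;

	for (i = 0; i < 4; i++) {
		n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->busy = (i == 2);	/* one pinned entry must survive */
		n->next = head;
		head = n;
	}

	/* "locked" phase: unlink victims onto a private dispose list */
	for (pp = &head; (n = *pp) != NULL; ) {
		if (!n->busy) {
			*pp = n->next;
			n->next = dispose;
			dispose = n;
		} else {
			pp = &n->next;
		}
	}

	/* "unlocked" phase: expensive teardown without the lock held */
	while ((n = dispose) != NULL) {
		dispose = n->next;
		free(n);
		printf("evicted one entry\n");	/* runs three times */
	}
	return 0;
}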
@@ -433,22 +594,24 @@ static int can_unuse(struct inode *inode)
433} 594}
434 595
435/* 596/*
436 * Scan `goal' inodes on the unused list for freeable ones. They are moved to 597 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
437 * a temporary list and then are freed outside inode_lock by dispose_list(). 598 * temporary list and then are freed outside inode_lock by dispose_list().
438 * 599 *
439 * Any inodes which are pinned purely because of attached pagecache have their 600 * Any inodes which are pinned purely because of attached pagecache have their
440 * pagecache removed. We expect the final iput() on that inode to add it to 601 * pagecache removed. If the inode has metadata buffers attached to
441 * the front of the inode_unused list. So look for it there and if the 602 * mapping->private_list then try to remove them.
442 * inode is still freeable, proceed. The right inode is found 99.9% of the
443 * time in testing on a 4-way.
444 * 603 *
445 * If the inode has metadata buffers attached to mapping->private_list then 604 * If the inode has the I_REFERENCED flag set, then it means that it has been
446 * try to remove them. 605 * used recently - the flag is set in iput_final(). When we encounter such an
606 * inode, clear the flag and move it to the back of the LRU so it gets another
607 * pass through the LRU before it gets reclaimed. This is necessary
608 * because we are doing lazy LRU updates to minimise lock contention so the
609 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
610 * with this flag set because they are the inodes that are out of order.
447 */ 611 */
448static void prune_icache(int nr_to_scan) 612static void prune_icache(int nr_to_scan)
449{ 613{
450 LIST_HEAD(freeable); 614 LIST_HEAD(freeable);
451 int nr_pruned = 0;
452 int nr_scanned; 615 int nr_scanned;
453 unsigned long reap = 0; 616 unsigned long reap = 0;
454 617
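The comment block above describes a second-chance LRU: reclaim scans from the cold end, and an inode found with I_REFERENCED set has the flag cleared and survives the pass instead of being freed. A compact standalone model of the policy (list movement elided):

#include <stdio.h>

struct entry {
	int id;
	int referenced;	/* models I_REFERENCED */
};

int main(void)
{
	struct entry lru[4] = { {1, 0}, {2, 1}, {3, 0}, {4, 0} };
	int i;

	/* scan from the cold end of the list */
	for (i = 3; i >= 0; i--) {
		if (lru[i].referenced) {
			lru[i].referenced = 0;	/* clear and skip this pass */
			printf("entry %d gets a second chance\n", lru[i].id);
			continue;
		}
		printf("entry %d reclaimed\n", lru[i].id);
	}
	return 0;
}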
@@ -457,13 +620,26 @@ static void prune_icache(int nr_to_scan)
457 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 620 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
458 struct inode *inode; 621 struct inode *inode;
459 622
460 if (list_empty(&inode_unused)) 623 if (list_empty(&inode_lru))
461 break; 624 break;
462 625
463 inode = list_entry(inode_unused.prev, struct inode, i_list); 626 inode = list_entry(inode_lru.prev, struct inode, i_lru);
627
628 /*
629 * Referenced or dirty inodes are still in use. Give them
630 * another pass through the LRU as we cannot reclaim them now.
631 */
632 if (atomic_read(&inode->i_count) ||
633 (inode->i_state & ~I_REFERENCED)) {
634 list_del_init(&inode->i_lru);
635 inodes_stat.nr_unused--;
636 continue;
637 }
464 638
465 if (inode->i_state || atomic_read(&inode->i_count)) { 639 /* recently referenced inodes get one more pass */
466 list_move(&inode->i_list, &inode_unused); 640 if (inode->i_state & I_REFERENCED) {
641 list_move(&inode->i_lru, &inode_lru);
642 inode->i_state &= ~I_REFERENCED;
467 continue; 643 continue;
468 } 644 }
469 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 645 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -475,18 +651,23 @@ static void prune_icache(int nr_to_scan)
475 iput(inode); 651 iput(inode);
476 spin_lock(&inode_lock); 652 spin_lock(&inode_lock);
477 653
478 if (inode != list_entry(inode_unused.next, 654 if (inode != list_entry(inode_lru.next,
479 struct inode, i_list)) 655 struct inode, i_lru))
480 continue; /* wrong inode or list_empty */ 656 continue; /* wrong inode or list_empty */
481 if (!can_unuse(inode)) 657 if (!can_unuse(inode))
482 continue; 658 continue;
483 } 659 }
484 list_move(&inode->i_list, &freeable);
485 WARN_ON(inode->i_state & I_NEW); 660 WARN_ON(inode->i_state & I_NEW);
486 inode->i_state |= I_FREEING; 661 inode->i_state |= I_FREEING;
487 nr_pruned++; 662
663 /*
664 * Move the inode off the IO lists and LRU once I_FREEING is
665 * set so that it won't get moved back on there if it is dirty.
666 */
667 list_move(&inode->i_lru, &freeable);
668 list_del_init(&inode->i_wb_list);
669 inodes_stat.nr_unused--;
488 } 670 }
489 inodes_stat.nr_unused -= nr_pruned;
490 if (current_is_kswapd()) 671 if (current_is_kswapd())
491 __count_vm_events(KSWAPD_INODESTEAL, reap); 672 __count_vm_events(KSWAPD_INODESTEAL, reap);
492 else 673 else
@@ -518,7 +699,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
518 return -1; 699 return -1;
519 prune_icache(nr); 700 prune_icache(nr);
520 } 701 }
521 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 702 return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
522} 703}
523 704
524static struct shrinker icache_shrinker = { 705static struct shrinker icache_shrinker = {
@@ -529,9 +710,6 @@ static struct shrinker icache_shrinker = {
529static void __wait_on_freeing_inode(struct inode *inode); 710static void __wait_on_freeing_inode(struct inode *inode);
530/* 711/*
531 * Called with the inode lock held. 712 * Called with the inode lock held.
532 * NOTE: we are not increasing the inode-refcount, you must call __iget()
533 * by hand after calling find_inode now! This simplifies iunique and won't
534 * add any additional branch in the common code.
535 */ 713 */
536static struct inode *find_inode(struct super_block *sb, 714static struct inode *find_inode(struct super_block *sb,
537 struct hlist_head *head, 715 struct hlist_head *head,
@@ -551,9 +729,10 @@ repeat:
551 __wait_on_freeing_inode(inode); 729 __wait_on_freeing_inode(inode);
552 goto repeat; 730 goto repeat;
553 } 731 }
554 break; 732 __iget(inode);
733 return inode;
555 } 734 }
556 return node ? inode : NULL; 735 return NULL;
557} 736}
558 737
559/* 738/*
@@ -576,53 +755,49 @@ repeat:
576 __wait_on_freeing_inode(inode); 755 __wait_on_freeing_inode(inode);
577 goto repeat; 756 goto repeat;
578 } 757 }
579 break; 758 __iget(inode);
759 return inode;
580 } 760 }
581 return node ? inode : NULL; 761 return NULL;
582}
583
584static unsigned long hash(struct super_block *sb, unsigned long hashval)
585{
586 unsigned long tmp;
587
588 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
589 L1_CACHE_BYTES;
590 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
591 return tmp & I_HASHMASK;
592}
593
594static inline void
595__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
596 struct inode *inode)
597{
598 inodes_stat.nr_inodes++;
599 list_add(&inode->i_list, &inode_in_use);
600 list_add(&inode->i_sb_list, &sb->s_inodes);
601 if (head)
602 hlist_add_head(&inode->i_hash, head);
603} 762}
604 763
605/** 764/*
606 * inode_add_to_lists - add a new inode to relevant lists 765 * Each cpu owns a range of LAST_INO_BATCH numbers.
607 * @sb: superblock inode belongs to 766 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
608 * @inode: inode to mark in use 767 * to renew the exhausted range.
609 * 768 *
610 * When an inode is allocated it needs to be accounted for, added to the in use 769 * This does not significantly increase overflow rate because every CPU can
611 * list, the owning superblock and the inode hash. This needs to be done under 770 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
612 * the inode_lock, so export a function to do this rather than the inode lock 771 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
613 * itself. We calculate the hash list to add to here so it is all internal 772 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
614 * which requires the caller to have already set up the inode number in the 773 * overflow rate by 2x, which does not seem too significant.
615 * inode to add. 774 *
775 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
776 * error if st_ino won't fit in target struct field. Use 32bit counter
777 * here to attempt to avoid that.
616 */ 778 */
617void inode_add_to_lists(struct super_block *sb, struct inode *inode) 779#define LAST_INO_BATCH 1024
780static DEFINE_PER_CPU(unsigned int, last_ino);
781
782unsigned int get_next_ino(void)
618{ 783{
619 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); 784 unsigned int *p = &get_cpu_var(last_ino);
785 unsigned int res = *p;
620 786
621 spin_lock(&inode_lock); 787#ifdef CONFIG_SMP
622 __inode_add_to_lists(sb, head, inode); 788 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
623 spin_unlock(&inode_lock); 789 static atomic_t shared_last_ino;
790 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
791
792 res = next - LAST_INO_BATCH;
793 }
794#endif
795
796 *p = ++res;
797 put_cpu_var(last_ino);
798 return res;
624} 799}
625EXPORT_SYMBOL_GPL(inode_add_to_lists); 800EXPORT_SYMBOL(get_next_ino);
626 801
627/** 802/**
628 * new_inode - obtain an inode 803 * new_inode - obtain an inode
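get_next_ino() amortizes the shared atomic: a CPU touches shared_last_ino only once per LAST_INO_BATCH allocations, then hands out numbers from its private block. A single-threaded standalone model (the cpu argument and NR_CPUS are simulation props, not kernel API):

#include <stdio.h>

#define LAST_INO_BATCH 1024
#define NR_CPUS 2

static unsigned int shared_last_ino;		/* atomic_t in the kernel */
static unsigned int last_ino[NR_CPUS];		/* per-cpu in the kernel */

static unsigned int get_next_ino(int cpu)
{
	unsigned int res = last_ino[cpu];

	if ((res & (LAST_INO_BATCH - 1)) == 0) {
		/* private block exhausted: take a fresh one */
		shared_last_ino += LAST_INO_BATCH;
		res = shared_last_ino - LAST_INO_BATCH;
	}
	last_ino[cpu] = ++res;
	return res;
}

int main(void)
{
	unsigned int a = get_next_ino(0);
	unsigned int b = get_next_ino(0);
	unsigned int c = get_next_ino(1);

	/* CPU 0 hands out 1 and 2 from its block; CPU 1 starts at 1025 */
	printf("%u %u %u\n", a, b, c);
	return 0;
}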
@@ -638,12 +813,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
638 */ 813 */
639struct inode *new_inode(struct super_block *sb) 814struct inode *new_inode(struct super_block *sb)
640{ 815{
641 /*
642 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
643 * error if st_ino won't fit in target struct field. Use 32bit counter
644 * here to attempt to avoid that.
645 */
646 static unsigned int last_ino;
647 struct inode *inode; 816 struct inode *inode;
648 817
649 spin_lock_prefetch(&inode_lock); 818 spin_lock_prefetch(&inode_lock);
@@ -651,8 +820,7 @@ struct inode *new_inode(struct super_block *sb)
651 inode = alloc_inode(sb); 820 inode = alloc_inode(sb);
652 if (inode) { 821 if (inode) {
653 spin_lock(&inode_lock); 822 spin_lock(&inode_lock);
654 __inode_add_to_lists(sb, NULL, inode); 823 __inode_sb_list_add(inode);
655 inode->i_ino = ++last_ino;
656 inode->i_state = 0; 824 inode->i_state = 0;
657 spin_unlock(&inode_lock); 825 spin_unlock(&inode_lock);
658 } 826 }
@@ -663,7 +831,7 @@ EXPORT_SYMBOL(new_inode);
663void unlock_new_inode(struct inode *inode) 831void unlock_new_inode(struct inode *inode)
664{ 832{
665#ifdef CONFIG_DEBUG_LOCK_ALLOC 833#ifdef CONFIG_DEBUG_LOCK_ALLOC
666 if (inode->i_mode & S_IFDIR) { 834 if (S_ISDIR(inode->i_mode)) {
667 struct file_system_type *type = inode->i_sb->s_type; 835 struct file_system_type *type = inode->i_sb->s_type;
668 836
669 /* Set new key only if filesystem hasn't already changed it */ 837 /* Set new key only if filesystem hasn't already changed it */
@@ -720,7 +888,8 @@ static struct inode *get_new_inode(struct super_block *sb,
720 if (set(inode, data)) 888 if (set(inode, data))
721 goto set_failed; 889 goto set_failed;
722 890
723 __inode_add_to_lists(sb, head, inode); 891 hlist_add_head(&inode->i_hash, head);
892 __inode_sb_list_add(inode);
724 inode->i_state = I_NEW; 893 inode->i_state = I_NEW;
725 spin_unlock(&inode_lock); 894 spin_unlock(&inode_lock);
726 895
@@ -735,7 +904,6 @@ static struct inode *get_new_inode(struct super_block *sb,
735 * us. Use the old inode instead of the one we just 904 * us. Use the old inode instead of the one we just
736 * allocated. 905 * allocated.
737 */ 906 */
738 __iget(old);
739 spin_unlock(&inode_lock); 907 spin_unlock(&inode_lock);
740 destroy_inode(inode); 908 destroy_inode(inode);
741 inode = old; 909 inode = old;
@@ -767,7 +935,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
767 old = find_inode_fast(sb, head, ino); 935 old = find_inode_fast(sb, head, ino);
768 if (!old) { 936 if (!old) {
769 inode->i_ino = ino; 937 inode->i_ino = ino;
770 __inode_add_to_lists(sb, head, inode); 938 hlist_add_head(&inode->i_hash, head);
939 __inode_sb_list_add(inode);
771 inode->i_state = I_NEW; 940 inode->i_state = I_NEW;
772 spin_unlock(&inode_lock); 941 spin_unlock(&inode_lock);
773 942
@@ -782,7 +951,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
782 * us. Use the old inode instead of the one we just 951 * us. Use the old inode instead of the one we just
783 * allocated. 952 * allocated.
784 */ 953 */
785 __iget(old);
786 spin_unlock(&inode_lock); 954 spin_unlock(&inode_lock);
787 destroy_inode(inode); 955 destroy_inode(inode);
788 inode = old; 956 inode = old;
@@ -791,6 +959,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
791 return inode; 959 return inode;
792} 960}
793 961
962/*
963 * search the inode cache for a matching inode number.
964 * If we find one, then the inode number we are trying to
965 * allocate is not unique and so we should not use it.
966 *
967 * Returns 1 if the inode number is unique, 0 if it is not.
968 */
969static int test_inode_iunique(struct super_block *sb, unsigned long ino)
970{
971 struct hlist_head *b = inode_hashtable + hash(sb, ino);
972 struct hlist_node *node;
973 struct inode *inode;
974
975 hlist_for_each_entry(inode, node, b, i_hash) {
976 if (inode->i_ino == ino && inode->i_sb == sb)
977 return 0;
978 }
979
980 return 1;
981}
982
794/** 983/**
795 * iunique - get a unique inode number 984 * iunique - get a unique inode number
796 * @sb: superblock 985 * @sb: superblock
@@ -812,19 +1001,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
812 * error if st_ino won't fit in target struct field. Use 32bit counter 1001 * error if st_ino won't fit in target struct field. Use 32bit counter
813 * here to attempt to avoid that. 1002 * here to attempt to avoid that.
814 */ 1003 */
1004 static DEFINE_SPINLOCK(iunique_lock);
815 static unsigned int counter; 1005 static unsigned int counter;
816 struct inode *inode;
817 struct hlist_head *head;
818 ino_t res; 1006 ino_t res;
819 1007
820 spin_lock(&inode_lock); 1008 spin_lock(&inode_lock);
1009 spin_lock(&iunique_lock);
821 do { 1010 do {
822 if (counter <= max_reserved) 1011 if (counter <= max_reserved)
823 counter = max_reserved + 1; 1012 counter = max_reserved + 1;
824 res = counter++; 1013 res = counter++;
825 head = inode_hashtable + hash(sb, res); 1014 } while (!test_inode_iunique(sb, res));
826 inode = find_inode_fast(sb, head, res); 1015 spin_unlock(&iunique_lock);
827 } while (inode != NULL);
828 spin_unlock(&inode_lock); 1016 spin_unlock(&inode_lock);
829 1017
830 return res; 1018 return res;
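iunique() probes the inode cache until it finds an unclaimed number, with the static counter now guarded by its own iunique_lock instead of a hash walk under inode_lock alone. A standalone miniature with a bitmap standing in for the hash table:

#include <stdio.h>

#define MAX_INO 64
static unsigned char in_use[MAX_INO];	/* stands in for the inode hash */

static unsigned int toy_iunique(unsigned int max_reserved)
{
	static unsigned int counter;
	unsigned int res;

	do {
		if (counter <= max_reserved)
			counter = max_reserved + 1;
		res = counter++;
	} while (in_use[res % MAX_INO]);
	return res;
}

int main(void)
{
	in_use[11] = 1;				/* pretend ino 11 is cached */
	printf("%u\n", toy_iunique(10));	/* skips 11, prints 12 */
	return 0;
}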
@@ -876,7 +1064,6 @@ static struct inode *ifind(struct super_block *sb,
876 spin_lock(&inode_lock); 1064 spin_lock(&inode_lock);
877 inode = find_inode(sb, head, test, data); 1065 inode = find_inode(sb, head, test, data);
878 if (inode) { 1066 if (inode) {
879 __iget(inode);
880 spin_unlock(&inode_lock); 1067 spin_unlock(&inode_lock);
881 if (likely(wait)) 1068 if (likely(wait))
882 wait_on_inode(inode); 1069 wait_on_inode(inode);
@@ -909,7 +1096,6 @@ static struct inode *ifind_fast(struct super_block *sb,
909 spin_lock(&inode_lock); 1096 spin_lock(&inode_lock);
910 inode = find_inode_fast(sb, head, ino); 1097 inode = find_inode_fast(sb, head, ino);
911 if (inode) { 1098 if (inode) {
912 __iget(inode);
913 spin_unlock(&inode_lock); 1099 spin_unlock(&inode_lock);
914 wait_on_inode(inode); 1100 wait_on_inode(inode);
915 return inode; 1101 return inode;
@@ -1095,7 +1281,7 @@ int insert_inode_locked(struct inode *inode)
1095 __iget(old); 1281 __iget(old);
1096 spin_unlock(&inode_lock); 1282 spin_unlock(&inode_lock);
1097 wait_on_inode(old); 1283 wait_on_inode(old);
1098 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1284 if (unlikely(!inode_unhashed(old))) {
1099 iput(old); 1285 iput(old);
1100 return -EBUSY; 1286 return -EBUSY;
1101 } 1287 }
@@ -1134,7 +1320,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1134 __iget(old); 1320 __iget(old);
1135 spin_unlock(&inode_lock); 1321 spin_unlock(&inode_lock);
1136 wait_on_inode(old); 1322 wait_on_inode(old);
1137 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1323 if (unlikely(!inode_unhashed(old))) {
1138 iput(old); 1324 iput(old);
1139 return -EBUSY; 1325 return -EBUSY;
1140 } 1326 }
@@ -1143,36 +1329,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1143} 1329}
1144EXPORT_SYMBOL(insert_inode_locked4); 1330EXPORT_SYMBOL(insert_inode_locked4);
1145 1331
1146/**
1147 * __insert_inode_hash - hash an inode
1148 * @inode: unhashed inode
1149 * @hashval: unsigned long value used to locate this object in the
1150 * inode_hashtable.
1151 *
1152 * Add an inode to the inode hash for this superblock.
1153 */
1154void __insert_inode_hash(struct inode *inode, unsigned long hashval)
1155{
1156 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1157 spin_lock(&inode_lock);
1158 hlist_add_head(&inode->i_hash, head);
1159 spin_unlock(&inode_lock);
1160}
1161EXPORT_SYMBOL(__insert_inode_hash);
1162
1163/**
1164 * remove_inode_hash - remove an inode from the hash
1165 * @inode: inode to unhash
1166 *
1167 * Remove an inode from the superblock.
1168 */
1169void remove_inode_hash(struct inode *inode)
1170{
1171 spin_lock(&inode_lock);
1172 hlist_del_init(&inode->i_hash);
1173 spin_unlock(&inode_lock);
1174}
1175EXPORT_SYMBOL(remove_inode_hash);
1176 1332
1177int generic_delete_inode(struct inode *inode) 1333int generic_delete_inode(struct inode *inode)
1178{ 1334{
@@ -1187,7 +1343,7 @@ EXPORT_SYMBOL(generic_delete_inode);
1187 */ 1343 */
1188int generic_drop_inode(struct inode *inode) 1344int generic_drop_inode(struct inode *inode)
1189{ 1345{
1190 return !inode->i_nlink || hlist_unhashed(&inode->i_hash); 1346 return !inode->i_nlink || inode_unhashed(inode);
1191} 1347}
1192EXPORT_SYMBOL_GPL(generic_drop_inode); 1348EXPORT_SYMBOL_GPL(generic_drop_inode);
1193 1349
@@ -1213,10 +1369,11 @@ static void iput_final(struct inode *inode)
1213 drop = generic_drop_inode(inode); 1369 drop = generic_drop_inode(inode);
1214 1370
1215 if (!drop) { 1371 if (!drop) {
1216 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1217 list_move(&inode->i_list, &inode_unused);
1218 inodes_stat.nr_unused++;
1219 if (sb->s_flags & MS_ACTIVE) { 1372 if (sb->s_flags & MS_ACTIVE) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1375 inode_lru_list_add(inode);
1376 }
1220 spin_unlock(&inode_lock); 1377 spin_unlock(&inode_lock);
1221 return; 1378 return;
1222 } 1379 }
@@ -1227,19 +1384,23 @@ static void iput_final(struct inode *inode)
1227 spin_lock(&inode_lock); 1384 spin_lock(&inode_lock);
1228 WARN_ON(inode->i_state & I_NEW); 1385 WARN_ON(inode->i_state & I_NEW);
1229 inode->i_state &= ~I_WILL_FREE; 1386 inode->i_state &= ~I_WILL_FREE;
1230 inodes_stat.nr_unused--; 1387 __remove_inode_hash(inode);
1231 hlist_del_init(&inode->i_hash);
1232 } 1388 }
1233 list_del_init(&inode->i_list); 1389
1234 list_del_init(&inode->i_sb_list);
1235 WARN_ON(inode->i_state & I_NEW); 1390 WARN_ON(inode->i_state & I_NEW);
1236 inode->i_state |= I_FREEING; 1391 inode->i_state |= I_FREEING;
1237 inodes_stat.nr_inodes--; 1392
1393 /*
1394 * Move the inode off the IO lists and LRU once I_FREEING is
1395 * set so that it won't get moved back on there if it is dirty.
1396 */
1397 inode_lru_list_del(inode);
1398 list_del_init(&inode->i_wb_list);
1399
1400 __inode_sb_list_del(inode);
1238 spin_unlock(&inode_lock); 1401 spin_unlock(&inode_lock);
1239 evict(inode); 1402 evict(inode);
1240 spin_lock(&inode_lock); 1403 remove_inode_hash(inode);
1241 hlist_del_init(&inode->i_hash);
1242 spin_unlock(&inode_lock);
1243 wake_up_inode(inode); 1404 wake_up_inode(inode);
1244 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1405 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1245 destroy_inode(inode); 1406 destroy_inode(inode);
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,12 +63,17 @@ extern int copy_mount_string(const void __user *, char **);
63 63
64extern void free_vfsmnt(struct vfsmount *); 64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *); 65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt);
66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
68 struct vfsmount *); 69 struct vfsmount *);
69extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
70extern void umount_tree(struct vfsmount *, int, struct list_head *); 71extern void umount_tree(struct vfsmount *, int, struct list_head *);
71extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
73extern int finish_automount(struct vfsmount *, struct path *);
74
75extern void mnt_make_longterm(struct vfsmount *);
76extern void mnt_make_shortterm(struct vfsmount *);
72 77
73extern void __init mnt_init(void); 78extern void __init mnt_init(void);
74 79
@@ -101,3 +106,10 @@ extern void put_super(struct super_block *sb);
101struct nameidata; 106struct nameidata;
102extern struct file *nameidata_to_filp(struct nameidata *); 107extern struct file *nameidata_to_filp(struct nameidata *);
103extern void release_open_intent(struct nameidata *); 108extern void release_open_intent(struct nameidata *);
109
110/*
111 * inode.c
112 */
113extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f855ea4fc888..1eebeb72b202 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/syscalls.h> 7#include <linux/syscalls.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/smp_lock.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/file.h> 10#include <linux/file.h>
12#include <linux/fs.h> 11#include <linux/fs.h>
@@ -87,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
87 u64 phys, u64 len, u32 flags) 86 u64 phys, u64 len, u32 flags)
88{ 87{
89 struct fiemap_extent extent; 88 struct fiemap_extent extent;
90 struct fiemap_extent *dest = fieinfo->fi_extents_start; 89 struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
91 90
92 /* only count the extents */ 91 /* only count the extents */
93 if (fieinfo->fi_extents_max == 0) { 92 if (fieinfo->fi_extents_max == 0) {
@@ -174,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
174static int ioctl_fiemap(struct file *filp, unsigned long arg) 173static int ioctl_fiemap(struct file *filp, unsigned long arg)
175{ 174{
176 struct fiemap fiemap; 175 struct fiemap fiemap;
176 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
177 struct fiemap_extent_info fieinfo = { 0, }; 177 struct fiemap_extent_info fieinfo = { 0, };
178 struct inode *inode = filp->f_path.dentry->d_inode; 178 struct inode *inode = filp->f_path.dentry->d_inode;
179 struct super_block *sb = inode->i_sb; 179 struct super_block *sb = inode->i_sb;
@@ -183,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
183 if (!inode->i_op->fiemap) 183 if (!inode->i_op->fiemap)
184 return -EOPNOTSUPP; 184 return -EOPNOTSUPP;
185 185
186 if (copy_from_user(&fiemap, (struct fiemap __user *)arg, 186 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
187 sizeof(struct fiemap)))
188 return -EFAULT; 187 return -EFAULT;
189 188
190 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) 189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -197,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
197 196
198 fieinfo.fi_flags = fiemap.fm_flags; 197 fieinfo.fi_flags = fiemap.fm_flags;
199 fieinfo.fi_extents_max = fiemap.fm_extent_count; 198 fieinfo.fi_extents_max = fiemap.fm_extent_count;
200 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); 199 fieinfo.fi_extents_start = ufiemap->fm_extents;
201 200
202 if (fiemap.fm_extent_count != 0 && 201 if (fiemap.fm_extent_count != 0 &&
203 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, 202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -210,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
210 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); 209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
211 fiemap.fm_flags = fieinfo.fi_flags; 210 fiemap.fm_flags = fieinfo.fi_flags;
212 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; 211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
213 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) 212 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
214 error = -EFAULT; 213 error = -EFAULT;
215 214
216 return error; 215 return error;
@@ -274,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
274 len = isize; 273 len = isize;
275 } 274 }
276 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
277 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
278 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
279 285
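
The __user annotations and the ufiemap->fm_extents rewrite both describe the same layout: the caller hands in one struct fiemap whose variable-length fm_extents[] tail the kernel fills in place. A sketch of such a caller, assuming the standard uapi headers and with error handling trimmed:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
            struct fiemap *fm;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;

            /* header plus room for 32 extents directly after it */
            fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
            if (!fm)
                    return 1;
            fm->fm_start = 0;
            fm->fm_length = ~0ULL;          /* map the whole file */
            fm->fm_extent_count = 32;

            if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
                    printf("%u extents mapped\n", fm->fm_mapped_extents);

            free(fm);
            close(fd);
            return 0;
    }
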
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..7da2a06508e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
103 } 103 }
104 104
105 ret = -ESRCH; 105 ret = -ESRCH;
106 /* 106 rcu_read_lock();
107 * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
108 * so we can't use rcu_read_lock(). See re-copy of ->ioprio
109 * in copy_process().
110 */
111 read_lock(&tasklist_lock);
112 switch (which) { 107 switch (which) {
113 case IOPRIO_WHO_PROCESS: 108 case IOPRIO_WHO_PROCESS:
114 if (!who) 109 if (!who)
@@ -153,7 +148,7 @@ free_uid:
153 ret = -EINVAL; 148 ret = -EINVAL;
154 } 149 }
155 150
156 read_unlock(&tasklist_lock); 151 rcu_read_unlock();
157 return ret; 152 return ret;
158} 153}
159 154
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
197 int ret = -ESRCH; 192 int ret = -ESRCH;
198 int tmpio; 193 int tmpio;
199 194
200 read_lock(&tasklist_lock); 195 rcu_read_lock();
201 switch (which) { 196 switch (which) {
202 case IOPRIO_WHO_PROCESS: 197 case IOPRIO_WHO_PROCESS:
203 if (!who) 198 if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
250 ret = -EINVAL; 245 ret = -EINVAL;
251 } 246 }
252 247
253 read_unlock(&tasklist_lock); 248 rcu_read_unlock();
254 return ret; 249 return ret;
255} 250}
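
The tasklist_lock read lock can be dropped because task lookup and traversal are RCU-safe here, and ->ioprio no longer depends on the "atomic" re-copy that the deleted comment referred to. The pattern both syscalls now follow, sketched as a fragment in the context of ioprio_get() (p, who and tmpio are the function's own locals):

    rcu_read_lock();
    p = find_task_by_vpid(who);
    if (p)
            tmpio = get_task_ioprio(p);  /* must not sleep in the RCU section */
    rcu_read_unlock();
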
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac68..0542b6eedf80 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h>
14#include <linux/gfp.h> 13#include <linux/gfp.h>
15#include "isofs.h" 14#include "isofs.h"
16 15
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
255 char *tmpname; 254 char *tmpname;
256 struct iso_directory_record *tmpde; 255 struct iso_directory_record *tmpde;
257 struct inode *inode = filp->f_path.dentry->d_inode; 256 struct inode *inode = filp->f_path.dentry->d_inode;
257 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
258 258
259 tmpname = (char *)__get_free_page(GFP_KERNEL); 259 tmpname = (char *)__get_free_page(GFP_KERNEL);
260 if (tmpname == NULL) 260 if (tmpname == NULL)
261 return -ENOMEM; 261 return -ENOMEM;
262 262
263 lock_kernel(); 263 mutex_lock(&sbi->s_mutex);
264 tmpde = (struct iso_directory_record *) (tmpname+1024); 264 tmpde = (struct iso_directory_record *) (tmpname+1024);
265 265
266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
267 267
268 free_page((unsigned long) tmpname); 268 free_page((unsigned long) tmpname);
269 unlock_kernel(); 269 mutex_unlock(&sbi->s_mutex);
270 return result; 270 return result;
271} 271}
272 272
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5a44811b5027..a0f3833c0dbf 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/nls.h> 18#include <linux/nls.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
20#include <linux/smp_lock.h>
21#include <linux/statfs.h> 20#include <linux/statfs.h>
22#include <linux/cdrom.h> 21#include <linux/cdrom.h>
23#include <linux/parser.h> 22#include <linux/parser.h>
@@ -27,16 +26,32 @@
27 26
28#define BEQUIET 27#define BEQUIET
29 28
30static int isofs_hashi(struct dentry *parent, struct qstr *qstr); 29static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
31static int isofs_hash(struct dentry *parent, struct qstr *qstr); 30 struct qstr *qstr);
32static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); 31static int isofs_hash(const struct dentry *parent, const struct inode *inode,
33static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); 32 struct qstr *qstr);
33static int isofs_dentry_cmpi(const struct dentry *parent,
34 const struct inode *pinode,
35 const struct dentry *dentry, const struct inode *inode,
36 unsigned int len, const char *str, const struct qstr *name);
37static int isofs_dentry_cmp(const struct dentry *parent,
38 const struct inode *pinode,
39 const struct dentry *dentry, const struct inode *inode,
40 unsigned int len, const char *str, const struct qstr *name);
34 41
35#ifdef CONFIG_JOLIET 42#ifdef CONFIG_JOLIET
36static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); 43static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
37static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); 44 struct qstr *qstr);
38static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 45static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
39static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 46 struct qstr *qstr);
47static int isofs_dentry_cmpi_ms(const struct dentry *parent,
48 const struct inode *pinode,
49 const struct dentry *dentry, const struct inode *inode,
50 unsigned int len, const char *str, const struct qstr *name);
51static int isofs_dentry_cmp_ms(const struct dentry *parent,
52 const struct inode *pinode,
53 const struct dentry *dentry, const struct inode *inode,
54 unsigned int len, const char *str, const struct qstr *name);
40#endif 55#endif
41 56
42static void isofs_put_super(struct super_block *sb) 57static void isofs_put_super(struct super_block *sb)
@@ -44,11 +59,7 @@ static void isofs_put_super(struct super_block *sb)
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 59 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45 60
46#ifdef CONFIG_JOLIET 61#ifdef CONFIG_JOLIET
47 lock_kernel();
48
49 unload_nls(sbi->s_nls_iocharset); 62 unload_nls(sbi->s_nls_iocharset);
50
51 unlock_kernel();
52#endif 63#endif
53 64
54 kfree(sbi); 65 kfree(sbi);
@@ -70,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
70 return &ei->vfs_inode; 81 return &ei->vfs_inode;
71} 82}
72 83
73static void isofs_destroy_inode(struct inode *inode) 84static void isofs_i_callback(struct rcu_head *head)
74{ 85{
86 struct inode *inode = container_of(head, struct inode, i_rcu);
87 INIT_LIST_HEAD(&inode->i_dentry);
75 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 88 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
76} 89}
77 90
91static void isofs_destroy_inode(struct inode *inode)
92{
93 call_rcu(&inode->i_rcu, isofs_i_callback);
94}
95
78static void init_once(void *foo) 96static void init_once(void *foo)
79{ 97{
80 struct iso_inode_info *ei = foo; 98 struct iso_inode_info *ei = foo;
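
The two-step destroy_inode is the standard shape for RCU-freed inodes in this series: i_rcu shares a union with i_dentry, so the callback reinitializes the list head before returning the object to the slab. The same pattern for a hypothetical filesystem "foofs" (names are illustrative, not from this patch):

    static void foofs_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);

            /* i_rcu is union'd with i_dentry; restore it before reuse */
            INIT_LIST_HEAD(&inode->i_dentry);
            kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
    }

    static void foofs_destroy_inode(struct inode *inode)
    {
            /* defer the actual free until after an RCU grace period */
            call_rcu(&inode->i_rcu, foofs_i_callback);
    }
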
@@ -165,7 +183,7 @@ struct iso9660_options{
165 * Compute the hash for the isofs name corresponding to the dentry. 183 * Compute the hash for the isofs name corresponding to the dentry.
166 */ 184 */
167static int 185static int
168isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) 186isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
169{ 187{
170 const char *name; 188 const char *name;
171 int len; 189 int len;
@@ -186,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
186 * Compute the hash for the isofs name corresponding to the dentry. 204 * Compute the hash for the isofs name corresponding to the dentry.
187 */ 205 */
188static int 206static int
189isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) 207isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
190{ 208{
191 const char *name; 209 const char *name;
192 int len; 210 int len;
@@ -211,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
211} 229}
212 230
213/* 231/*
214 * Case insensitive compare of two isofs names. 232 * Compare of two isofs names.
215 */
216static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a,
217 struct qstr *b, int ms)
218{
219 int alen, blen;
220
221 /* A filename cannot end in '.' or we treat it like it has none */
222 alen = a->len;
223 blen = b->len;
224 if (ms) {
225 while (alen && a->name[alen-1] == '.')
226 alen--;
227 while (blen && b->name[blen-1] == '.')
228 blen--;
229 }
230 if (alen == blen) {
231 if (strnicmp(a->name, b->name, alen) == 0)
232 return 0;
233 }
234 return 1;
235}
236
237/*
238 * Case sensitive compare of two isofs names.
239 */ 233 */
240static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, 234static int isofs_dentry_cmp_common(
241 struct qstr *b, int ms) 235 unsigned int len, const char *str,
236 const struct qstr *name, int ms, int ci)
242{ 237{
243 int alen, blen; 238 int alen, blen;
244 239
245 /* A filename cannot end in '.' or we treat it like it has none */ 240 /* A filename cannot end in '.' or we treat it like it has none */
246 alen = a->len; 241 alen = name->len;
247 blen = b->len; 242 blen = len;
248 if (ms) { 243 if (ms) {
249 while (alen && a->name[alen-1] == '.') 244 while (alen && name->name[alen-1] == '.')
250 alen--; 245 alen--;
251 while (blen && b->name[blen-1] == '.') 246 while (blen && str[blen-1] == '.')
252 blen--; 247 blen--;
253 } 248 }
254 if (alen == blen) { 249 if (alen == blen) {
255 if (strncmp(a->name, b->name, alen) == 0) 250 if (ci) {
256 return 0; 251 if (strnicmp(name->name, str, alen) == 0)
252 return 0;
253 } else {
254 if (strncmp(name->name, str, alen) == 0)
255 return 0;
256 }
257 } 257 }
258 return 1; 258 return 1;
259} 259}
260 260
261static int 261static int
262isofs_hash(struct dentry *dentry, struct qstr *qstr) 262isofs_hash(const struct dentry *dentry, const struct inode *inode,
263 struct qstr *qstr)
263{ 264{
264 return isofs_hash_common(dentry, qstr, 0); 265 return isofs_hash_common(dentry, qstr, 0);
265} 266}
266 267
267static int 268static int
268isofs_hashi(struct dentry *dentry, struct qstr *qstr) 269isofs_hashi(const struct dentry *dentry, const struct inode *inode,
270 struct qstr *qstr)
269{ 271{
270 return isofs_hashi_common(dentry, qstr, 0); 272 return isofs_hashi_common(dentry, qstr, 0);
271} 273}
272 274
273static int 275static int
274isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) 276isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
277 const struct dentry *dentry, const struct inode *inode,
278 unsigned int len, const char *str, const struct qstr *name)
275{ 279{
276 return isofs_dentry_cmp_common(dentry, a, b, 0); 280 return isofs_dentry_cmp_common(len, str, name, 0, 0);
277} 281}
278 282
279static int 283static int
280isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) 284isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
285 const struct dentry *dentry, const struct inode *inode,
286 unsigned int len, const char *str, const struct qstr *name)
281{ 287{
282 return isofs_dentry_cmpi_common(dentry, a, b, 0); 288 return isofs_dentry_cmp_common(len, str, name, 0, 1);
283} 289}
284 290
285#ifdef CONFIG_JOLIET 291#ifdef CONFIG_JOLIET
286static int 292static int
287isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) 293isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
294 struct qstr *qstr)
288{ 295{
289 return isofs_hash_common(dentry, qstr, 1); 296 return isofs_hash_common(dentry, qstr, 1);
290} 297}
291 298
292static int 299static int
293isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) 300isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
301 struct qstr *qstr)
294{ 302{
295 return isofs_hashi_common(dentry, qstr, 1); 303 return isofs_hashi_common(dentry, qstr, 1);
296} 304}
297 305
298static int 306static int
299isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 307isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
308 const struct dentry *dentry, const struct inode *inode,
309 unsigned int len, const char *str, const struct qstr *name)
300{ 310{
301 return isofs_dentry_cmp_common(dentry, a, b, 1); 311 return isofs_dentry_cmp_common(len, str, name, 1, 0);
302} 312}
303 313
304static int 314static int
305isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 315isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
316 const struct dentry *dentry, const struct inode *inode,
317 unsigned int len, const char *str, const struct qstr *name)
306{ 318{
307 return isofs_dentry_cmpi_common(dentry, a, b, 1); 319 return isofs_dentry_cmp_common(len, str, name, 1, 1);
308} 320}
309#endif 321#endif
310 322
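
Under the new prototypes, d_compare() receives the candidate name as an explicit (len, str) pair plus the qstr being looked up, and may be called under RCU, so it should only examine its arguments. A minimal case-insensitive implementation under the new signature, for a hypothetical filesystem (strnicmp being the comparison helper of this era):

    static int example_d_compare(const struct dentry *parent,
                                 const struct inode *pinode,
                                 const struct dentry *dentry,
                                 const struct inode *inode,
                                 unsigned int len, const char *str,
                                 const struct qstr *name)
    {
            /* 0 means "match", anything else means "no match" */
            if (len != name->len)
                    return 1;
            return strnicmp(str, name->name, len) ? 1 : 0;
    }
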
@@ -549,6 +561,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
549} 561}
550 562
551/* 563/*
 564 * Check if the root directory is empty (has fewer than 3 files).
 565 *
 566 * Used to detect broken CDs where the ISO root directory is empty but the
 567 * Joliet root directory is OK. If such a CD has Rock Ridge extensions, they
 568 * are disabled (and Joliet used instead); otherwise no files would be visible.
569 */
570static bool rootdir_empty(struct super_block *sb, unsigned long block)
571{
572 int offset = 0, files = 0, de_len;
573 struct iso_directory_record *de;
574 struct buffer_head *bh;
575
576 bh = sb_bread(sb, block);
577 if (!bh)
578 return true;
579 while (files < 3) {
580 de = (struct iso_directory_record *) (bh->b_data + offset);
581 de_len = *(unsigned char *) de;
582 if (de_len == 0)
583 break;
584 files++;
585 offset += de_len;
586 }
587 brelse(bh);
588 return files < 3;
589}
590
591/*
552 * Initialize the superblock and read the root inode. 592 * Initialize the superblock and read the root inode.
553 * 593 *
554 * Note: a check_disk_change() has been done immediately prior 594 * Note: a check_disk_change() has been done immediately prior
@@ -823,6 +863,7 @@ root_found:
823 sbi->s_utf8 = opt.utf8; 863 sbi->s_utf8 = opt.utf8;
824 sbi->s_nocompress = opt.nocompress; 864 sbi->s_nocompress = opt.nocompress;
825 sbi->s_overriderockperm = opt.overriderockperm; 865 sbi->s_overriderockperm = opt.overriderockperm;
866 mutex_init(&sbi->s_mutex);
826 /* 867 /*
827 * It would be incredibly stupid to allow people to mark every file 868 * It would be incredibly stupid to allow people to mark every file
828 * on the disk as suid, so we merely allow them to set the default 869 * on the disk as suid, so we merely allow them to set the default
@@ -847,6 +888,18 @@ root_found:
847 goto out_no_root; 888 goto out_no_root;
848 889
849 /* 890 /*
891 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
892 * correct Joliet root directory.
893 */
894 if (sbi->s_rock == 1 && joliet_level &&
895 rootdir_empty(s, sbi->s_firstdatazone)) {
896 printk(KERN_NOTICE
897 "ISOFS: primary root directory is empty. "
898 "Disabling Rock Ridge and switching to Joliet.");
899 sbi->s_rock = 0;
900 }
901
902 /*
850 * If this disk has both Rock Ridge and Joliet on it, then we 903 * If this disk has both Rock Ridge and Joliet on it, then we
851 * want to use Rock Ridge by default. This can be overridden 904 * want to use Rock Ridge by default. This can be overridden
852 * by using the norock mount option. There is still one other 905 * by using the norock mount option. There is still one other
@@ -886,17 +939,18 @@ root_found:
886 goto out_iput; 939 goto out_iput;
887 } 940 }
888 941
889 /* get the root dentry */
890 s->s_root = d_alloc_root(inode);
891 if (!(s->s_root))
892 goto out_no_root;
893
894 table = 0; 942 table = 0;
895 if (joliet_level) 943 if (joliet_level)
896 table += 2; 944 table += 2;
897 if (opt.check == 'r') 945 if (opt.check == 'r')
898 table++; 946 table++;
899 s->s_root->d_op = &isofs_dentry_ops[table]; 947
948 s->s_d_op = &isofs_dentry_ops[table];
949
950 /* get the root dentry */
951 s->s_root = d_alloc_root(inode);
952 if (!(s->s_root))
953 goto out_no_root;
900 954
901 kfree(opt.iocharset); 955 kfree(opt.iocharset);
902 956
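
The reordering here is deliberate: s_d_op must be chosen before d_alloc_root() runs, because once the superblock carries a default dentry-operations table, d_alloc() and d_alloc_root() attach it to every new dentry automatically. That is also why the per-dentry "dentry->d_op = ..." assignment disappears from isofs_lookup() in the namei.c hunk further down. Reduced to its core:

    /* pick the table first ... */
    s->s_d_op = &isofs_dentry_ops[table];
    /* ... so the root dentry (and everything under it) inherits it */
    s->s_root = d_alloc_root(inode);
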
@@ -966,27 +1020,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
966 * or getblk() if they are not. Returns the number of blocks inserted 1020 * or getblk() if they are not. Returns the number of blocks inserted
967 * (-ve == error.) 1021 * (-ve == error.)
968 */ 1022 */
969int isofs_get_blocks(struct inode *inode, sector_t iblock_s, 1023int isofs_get_blocks(struct inode *inode, sector_t iblock,
970 struct buffer_head **bh, unsigned long nblocks) 1024 struct buffer_head **bh, unsigned long nblocks)
971{ 1025{
972 unsigned long b_off; 1026 unsigned long b_off = iblock;
973 unsigned offset, sect_size; 1027 unsigned offset, sect_size;
974 unsigned int firstext; 1028 unsigned int firstext;
975 unsigned long nextblk, nextoff; 1029 unsigned long nextblk, nextoff;
976 long iblock = (long)iblock_s;
977 int section, rv, error; 1030 int section, rv, error;
978 struct iso_inode_info *ei = ISOFS_I(inode); 1031 struct iso_inode_info *ei = ISOFS_I(inode);
979 1032
980 lock_kernel();
981
982 error = -EIO; 1033 error = -EIO;
983 rv = 0; 1034 rv = 0;
984 if (iblock < 0 || iblock != iblock_s) { 1035 if (iblock != b_off) {
985 printk(KERN_DEBUG "%s: block number too large\n", __func__); 1036 printk(KERN_DEBUG "%s: block number too large\n", __func__);
986 goto abort; 1037 goto abort;
987 } 1038 }
988 1039
989 b_off = iblock;
990 1040
991 offset = 0; 1041 offset = 0;
992 firstext = ei->i_first_extent; 1042 firstext = ei->i_first_extent;
@@ -1004,8 +1054,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1004 * I/O errors. 1054 * I/O errors.
1005 */ 1055 */
1006 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 1056 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
1007 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", 1057 printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
1008 __func__, iblock, (unsigned long) inode->i_size); 1058 __func__, b_off,
1059 (unsigned long long)inode->i_size);
1009 goto abort; 1060 goto abort;
1010 } 1061 }
1011 1062
@@ -1031,9 +1082,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1031 if (++section > 100) { 1082 if (++section > 100) {
1032 printk(KERN_DEBUG "%s: More than 100 file sections ?!?" 1083 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
1033 " aborting...\n", __func__); 1084 " aborting...\n", __func__);
1034 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " 1085 printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
1035 "nextblk=%lu nextoff=%lu\n", __func__, 1086 "nextblk=%lu nextoff=%lu\n", __func__,
1036 iblock, firstext, (unsigned) sect_size, 1087 b_off, firstext, (unsigned) sect_size,
1037 nextblk, nextoff); 1088 nextblk, nextoff);
1038 goto abort; 1089 goto abort;
1039 } 1090 }
@@ -1054,7 +1105,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1054 1105
1055 error = 0; 1106 error = 0;
1056abort: 1107abort:
1057 unlock_kernel();
1058 return rv != 0 ? rv : error; 1108 return rv != 0 ? rv : error;
1059} 1109}
1060 1110
@@ -1475,17 +1525,16 @@ struct inode *isofs_iget(struct super_block *sb,
1475 return inode; 1525 return inode;
1476} 1526}
1477 1527
1478static int isofs_get_sb(struct file_system_type *fs_type, 1528static struct dentry *isofs_mount(struct file_system_type *fs_type,
1479 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1529 int flags, const char *dev_name, void *data)
1480{ 1530{
1481 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1531 return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
1482 mnt);
1483} 1532}
1484 1533
1485static struct file_system_type iso9660_fs_type = { 1534static struct file_system_type iso9660_fs_type = {
1486 .owner = THIS_MODULE, 1535 .owner = THIS_MODULE,
1487 .name = "iso9660", 1536 .name = "iso9660",
1488 .get_sb = isofs_get_sb, 1537 .mount = isofs_mount,
1489 .kill_sb = kill_block_super, 1538 .kill_sb = kill_block_super,
1490 .fs_flags = FS_REQUIRES_DEV, 1539 .fs_flags = FS_REQUIRES_DEV,
1491}; 1540};
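
This is the mechanical part of the tree-wide .get_sb to .mount conversion: the new hook returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount. The same conversion for a hypothetical block-device filesystem, assuming a foofs_fill_super() of the usual fill_super type:

    static struct dentry *foofs_mount(struct file_system_type *fs_type,
                                      int flags, const char *dev_name,
                                      void *data)
    {
            return mount_bdev(fs_type, flags, dev_name, data,
                              foofs_fill_super);
    }

    static struct file_system_type foofs_fs_type = {
            .owner    = THIS_MODULE,
            .name     = "foofs",
            .mount    = foofs_mount,
            .kill_sb  = kill_block_super,
            .fs_flags = FS_REQUIRES_DEV,
    };
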
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..2882dc089f87 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
55 gid_t s_gid; 55 gid_t s_gid;
56 uid_t s_uid; 56 uid_t s_uid;
57 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
58 struct mutex s_mutex; /* replaces BKL, please remove if possible */
58}; 59};
59 60
60#define ISOFS_INVALID_MODE ((mode_t) -1) 61#define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867c..4fb3e8074fd4 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
6 * (C) 1991 Linus Torvalds - minix filesystem 6 * (C) 1991 Linus Torvalds - minix filesystem
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/gfp.h> 9#include <linux/gfp.h>
11#include "isofs.h" 10#include "isofs.h"
12 11
@@ -38,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
38 37
39 qstr.name = compare; 38 qstr.name = compare;
40 qstr.len = dlen; 39 qstr.len = dlen;
41 return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); 40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
41 dentry->d_name.len, dentry->d_name.name, &qstr);
42} 42}
43 43
44/* 44/*
@@ -168,16 +168,15 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
168 int found; 168 int found;
169 unsigned long uninitialized_var(block); 169 unsigned long uninitialized_var(block);
170 unsigned long uninitialized_var(offset); 170 unsigned long uninitialized_var(offset);
171 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
171 struct inode *inode; 172 struct inode *inode;
172 struct page *page; 173 struct page *page;
173 174
174 dentry->d_op = dir->i_sb->s_root->d_op;
175
176 page = alloc_page(GFP_USER); 175 page = alloc_page(GFP_USER);
177 if (!page) 176 if (!page)
178 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
179 178
180 lock_kernel(); 179 mutex_lock(&sbi->s_mutex);
181 found = isofs_find_entry(dir, dentry, 180 found = isofs_find_entry(dir, dentry,
182 &block, &offset, 181 &block, &offset,
183 page_address(page), 182 page_address(page),
@@ -188,10 +187,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
188 if (found) { 187 if (found) {
189 inode = isofs_iget(dir->i_sb, block, offset); 188 inode = isofs_iget(dir->i_sb, block, offset);
190 if (IS_ERR(inode)) { 189 if (IS_ERR(inode)) {
191 unlock_kernel(); 190 mutex_unlock(&sbi->s_mutex);
192 return ERR_CAST(inode); 191 return ERR_CAST(inode);
193 } 192 }
194 } 193 }
195 unlock_kernel(); 194 mutex_unlock(&sbi->s_mutex);
196 return d_splice_alias(inode, dentry); 195 return d_splice_alias(inode, dentry);
197} 196}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550fd..f9cd04db6eab 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/smp_lock.h>
12 11
13#include "isofs.h" 12#include "isofs.h"
14#include "rock.h" 13#include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
661{ 660{
662 struct inode *inode = page->mapping->host; 661 struct inode *inode = page->mapping->host;
663 struct iso_inode_info *ei = ISOFS_I(inode); 662 struct iso_inode_info *ei = ISOFS_I(inode);
663 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
664 char *link = kmap(page); 664 char *link = kmap(page);
665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
666 struct buffer_head *bh; 666 struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
673 struct rock_state rs; 673 struct rock_state rs;
674 int ret; 674 int ret;
675 675
676 if (!ISOFS_SB(inode->i_sb)->s_rock) 676 if (!sbi->s_rock)
677 goto error; 677 goto error;
678 678
679 init_rock_state(&rs, inode); 679 init_rock_state(&rs, inode);
680 block = ei->i_iget5_block; 680 block = ei->i_iget5_block;
681 lock_kernel(); 681 mutex_lock(&sbi->s_mutex);
682 bh = sb_bread(inode->i_sb, block); 682 bh = sb_bread(inode->i_sb, block);
683 if (!bh) 683 if (!bh)
684 goto out_noread; 684 goto out_noread;
@@ -748,7 +748,7 @@ repeat:
748 goto fail; 748 goto fail;
749 brelse(bh); 749 brelse(bh);
750 *rpnt = '\0'; 750 *rpnt = '\0';
751 unlock_kernel(); 751 mutex_unlock(&sbi->s_mutex);
752 SetPageUptodate(page); 752 SetPageUptodate(page);
753 kunmap(page); 753 kunmap(page);
754 unlock_page(page); 754 unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
765 printk("symlink spans iso9660 blocks\n"); 765 printk("symlink spans iso9660 blocks\n");
766fail: 766fail:
767 brelse(bh); 767 brelse(bh);
768 unlock_kernel(); 768 mutex_unlock(&sbi->s_mutex);
769error: 769error:
770 SetPageError(page); 770 SetPageError(page);
771 kunmap(page); 771 kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05a38b9c4c0e..e4b87bc1fa56 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
221 goto restart; 221 goto restart;
222 } 222 }
223 if (buffer_locked(bh)) { 223 if (buffer_locked(bh)) {
224 atomic_inc(&bh->b_count); 224 get_bh(bh);
225 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
226 jbd_unlock_bh_state(bh); 226 jbd_unlock_bh_state(bh);
227 wait_on_buffer(bh); 227 wait_on_buffer(bh);
@@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
283 int ret = 0; 283 int ret = 0;
284 284
285 if (buffer_locked(bh)) { 285 if (buffer_locked(bh)) {
286 atomic_inc(&bh->b_count); 286 get_bh(bh);
287 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
288 jbd_unlock_bh_state(bh); 288 jbd_unlock_bh_state(bh);
289 wait_on_buffer(bh); 289 wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..34a4861c14b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
137 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
138 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139 139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER)
141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
142 142 else
143 /*
144 * Is it possible for another commit to fail at roughly
145 * the same time as this one? If so, we don't want to
146 * trust the barrier flag in the super, but instead want
147 * to remember if we sent a barrier request
148 */
149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 143 ret = sync_dirty_buffer(bh);
167 }
168 144
169 put_bh(bh); /* One for getblk() */ 145 put_bh(bh); /* One for getblk() */
170 journal_put_journal_head(descriptor); 146 journal_put_journal_head(descriptor);
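
The -EOPNOTSUPP fallback can go because the old ordered WRITE_BARRIER semantics are gone: WRITE_FLUSH_FUA requests an explicit cache flush plus a forced-unit-access write, and the block layer degrades those flags itself on devices without a volatile write cache instead of failing the request. Roughly (the exact flag composition lives in the block headers):

    /* WRITE_FLUSH_FUA is approximately WRITE_SYNC | REQ_FLUSH | REQ_FUA:
     * REQ_FLUSH drains the device write cache before this I/O,
     * REQ_FUA forces the block itself to stable media. */
    ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
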
@@ -318,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
318 int first_tag = 0; 294 int first_tag = 0;
319 int tag_flag; 295 int tag_flag;
320 int i; 296 int i;
321 int write_op = WRITE; 297 int write_op = WRITE_SYNC;
322 298
323 /* 299 /*
324 * First job: lock down the current transaction and wait for 300 * First job: lock down the current transaction and wait for
@@ -611,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
611 /* Bump b_count to prevent truncate from stumbling over 587 /* Bump b_count to prevent truncate from stumbling over
612 the shadowed buffer! @@@ This can go if we ever get 588 the shadowed buffer! @@@ This can go if we ever get
613 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 589 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
614 atomic_inc(&jh2bh(jh)->b_count); 590 get_bh(jh2bh(jh));
615 591
616 /* Make a temporary IO buffer with which to write it out 592 /* Make a temporary IO buffer with which to write it out
617 (this will requeue both the metadata buffer and the 593 (this will requeue both the metadata buffer and the
618 temporary IO buffer). new_bh goes on BJ_IO*/ 594 temporary IO buffer). new_bh goes on BJ_IO*/
619 595
620 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 596 set_buffer_jwrite(jh2bh(jh));
621 /* 597 /*
622 * akpm: journal_write_metadata_buffer() sets 598 * akpm: journal_write_metadata_buffer() sets
623 * new_bh->b_transaction to commit_transaction. 599 * new_bh->b_transaction to commit_transaction.
@@ -627,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
627 JBUFFER_TRACE(jh, "ph3: write metadata"); 603 JBUFFER_TRACE(jh, "ph3: write metadata");
628 flags = journal_write_metadata_buffer(commit_transaction, 604 flags = journal_write_metadata_buffer(commit_transaction,
629 jh, &new_jh, blocknr); 605 jh, &new_jh, blocknr);
630 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 606 set_buffer_jwrite(jh2bh(new_jh));
631 wbuf[bufs++] = jh2bh(new_jh); 607 wbuf[bufs++] = jh2bh(new_jh);
632 608
633 /* Record the new block's tag in the current descriptor 609 /* Record the new block's tag in the current descriptor
@@ -737,7 +713,7 @@ wait_for_iobuf:
737 shadowed buffer */ 713 shadowed buffer */
738 jh = commit_transaction->t_shadow_list->b_tprev; 714 jh = commit_transaction->t_shadow_list->b_tprev;
739 bh = jh2bh(jh); 715 bh = jh2bh(jh);
740 clear_bit(BH_JWrite, &bh->b_state); 716 clear_buffer_jwrite(bh);
741 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 717 J_ASSERT_BH(bh, buffer_jbddirty(bh));
742 718
743 /* The metadata is now released for reuse, but we need 719 /* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2c4b1f109da9..da1b5e4ffce1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/ratelimit.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/page.h> 42#include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
84 85
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno); 87static void __journal_abort_soft (journal_t *journal, int errno);
88static const char *journal_dev_name(journal_t *journal, char *buffer);
87 89
88/* 90/*
89 * Helper function used to manage commit timeouts 91 * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
439 */ 441 */
440 if (!tid_geq(journal->j_commit_request, target)) { 442 if (!tid_geq(journal->j_commit_request, target)) {
441 /* 443 /*
442 * We want a new commit: OK, mark the request and wakup the 444 * We want a new commit: OK, mark the request and wakeup the
443 * commit thread. We do _not_ do the commit ourselves. 445 * commit thread. We do _not_ do the commit ourselves.
444 */ 446 */
445 447
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
950 if (err) 952 if (err)
951 return err; 953 return err;
952 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 954 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
955 if (unlikely(!bh))
956 return -ENOMEM;
953 lock_buffer(bh); 957 lock_buffer(bh);
954 memset (bh->b_data, 0, journal->j_blocksize); 958 memset (bh->b_data, 0, journal->j_blocksize);
955 BUFFER_TRACE(bh, "marking dirty"); 959 BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
1010 goto out; 1014 goto out;
1011 } 1015 }
1012 1016
1017 if (buffer_write_io_error(bh)) {
1018 char b[BDEVNAME_SIZE];
1019 /*
1020 * Oh, dear. A previous attempt to write the journal
1021 * superblock failed. This could happen because the
1022 * USB device was yanked out. Or it could happen to
1023 * be a transient write error and maybe the block will
1024 * be remapped. Nothing we can do but to retry the
1025 * write and hope for the best.
1026 */
1027 printk(KERN_ERR "JBD: previous I/O error detected "
1028 "for journal superblock update for %s.\n",
1029 journal_dev_name(journal, b));
1030 clear_buffer_write_io_error(bh);
1031 set_buffer_uptodate(bh);
1032 }
1033
1013 spin_lock(&journal->j_state_lock); 1034 spin_lock(&journal->j_state_lock);
1014 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", 1035 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
1015 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1036 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,9 +1042,17 @@ void journal_update_superblock(journal_t *journal, int wait)
1021 1042
1022 BUFFER_TRACE(bh, "marking dirty"); 1043 BUFFER_TRACE(bh, "marking dirty");
1023 mark_buffer_dirty(bh); 1044 mark_buffer_dirty(bh);
1024 if (wait) 1045 if (wait) {
1025 sync_dirty_buffer(bh); 1046 sync_dirty_buffer(bh);
1026 else 1047 if (buffer_write_io_error(bh)) {
1048 char b[BDEVNAME_SIZE];
1049 printk(KERN_ERR "JBD: I/O error detected "
1050 "when updating journal superblock for %s.\n",
1051 journal_dev_name(journal, b));
1052 clear_buffer_write_io_error(bh);
1053 set_buffer_uptodate(bh);
1054 }
1055 } else
1027 write_dirty_buffer(bh, WRITE); 1056 write_dirty_buffer(bh, WRITE);
1028 1057
1029out: 1058out:
@@ -1719,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
1719static struct journal_head *journal_alloc_journal_head(void) 1748static struct journal_head *journal_alloc_journal_head(void)
1720{ 1749{
1721 struct journal_head *ret; 1750 struct journal_head *ret;
1722 static unsigned long last_warning;
1723 1751
1724#ifdef CONFIG_JBD_DEBUG 1752#ifdef CONFIG_JBD_DEBUG
1725 atomic_inc(&nr_journal_heads); 1753 atomic_inc(&nr_journal_heads);
@@ -1727,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
1727 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1755 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1728 if (ret == NULL) { 1756 if (ret == NULL) {
1729 jbd_debug(1, "out of memory for journal_head\n"); 1757 jbd_debug(1, "out of memory for journal_head\n");
1730 if (time_after(jiffies, last_warning + 5*HZ)) { 1758 printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1731 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", 1759 __func__);
1732 __func__); 1760
1733 last_warning = jiffies;
1734 }
1735 while (ret == NULL) { 1761 while (ret == NULL) {
1736 yield(); 1762 yield();
1737 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1763 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
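
printk_ratelimited() (hence the new <linux/ratelimit.h> include at the top of this file) keeps static per-callsite ratelimit state, so the hand-rolled "last_warning + 5*HZ" throttle collapses to a single call:

    /* one line replaces the jiffies bookkeeping deleted above */
    printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", __func__);
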
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 81051dafebf5..5b43e96788e6 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal)
296#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
297 int dropped = info.end_transaction - 297 int dropped = info.end_transaction -
298 be32_to_cpu(journal->j_superblock->s_sequence); 298 be32_to_cpu(journal->j_superblock->s_sequence);
299#endif
300 jbd_debug(1, 299 jbd_debug(1,
301 "JBD: ignoring %d transaction%s from the journal.\n", 300 "JBD: ignoring %d transaction%s from the journal.\n",
302 dropped, (dropped == 1) ? "" : "s"); 301 dropped, (dropped == 1) ? "" : "s");
302#endif
303 journal->j_transaction_sequence = ++info.end_transaction; 303 journal->j_transaction_sequence = ++info.end_transaction;
304 } 304 }
305 305
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a491..5b2e4c30a2a1 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
207 * the committing transaction. Really, we only need to give it 207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for 208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks. 209 * the log control blocks.
210 * Also, this test is inconsitent with the matching one in 210 * Also, this test is inconsistent with the matching one in
211 * journal_extend(). 211 * journal_extend().
212 */ 212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) { 213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
293 jbd_free_handle(handle); 293 jbd_free_handle(handle);
294 current->journal_info = NULL; 294 current->journal_info = NULL;
295 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
296 goto out;
297 } 296 }
298out:
299 return handle; 297 return handle;
300} 298}
301 299
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
528 transaction = handle->h_transaction; 526 transaction = handle->h_transaction;
529 journal = transaction->t_journal; 527 journal = transaction->t_journal;
530 528
531 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 529 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
532 530
533 JBUFFER_TRACE(jh, "entry"); 531 JBUFFER_TRACE(jh, "entry");
534repeat: 532repeat:
@@ -713,7 +711,7 @@ done:
713 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 711 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
714 "Possible IO failure.\n"); 712 "Possible IO failure.\n");
715 page = jh2bh(jh)->b_page; 713 page = jh2bh(jh)->b_page;
716 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 714 offset = offset_in_page(jh2bh(jh)->b_data);
717 source = kmap_atomic(page, KM_USER0); 715 source = kmap_atomic(page, KM_USER0);
718 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 716 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
719 kunmap_atomic(source, KM_USER0); 717 kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6a79fd0a1a32 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
299 transaction->t_chp_stats.cs_forced_to_close++; 299 transaction->t_chp_stats.cs_forced_to_close++;
300 spin_unlock(&journal->j_list_lock); 300 spin_unlock(&journal->j_list_lock);
301 jbd_unlock_bh_state(bh); 301 jbd_unlock_bh_state(bh);
302 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
303 /*
 304 * The journal thread is dead, so starting and
305 * waiting for a commit to finish will cause
306 * us to wait for a _very_ long time.
307 */
308 printk(KERN_ERR "JBD2: %s: "
309 "Waiting for Godot: block %llu\n",
310 journal->j_devname,
311 (unsigned long long) bh->b_blocknr);
302 jbd2_log_start_commit(journal, tid); 312 jbd2_log_start_commit(journal, tid);
303 jbd2_log_wait_commit(journal, tid); 313 jbd2_log_wait_commit(journal, tid);
304 ret = 1; 314 ret = 1;
@@ -532,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
532 */ 542 */
533 if ((journal->j_fs_dev != journal->j_dev) && 543 if ((journal->j_fs_dev != journal->j_dev) &&
534 (journal->j_flags & JBD2_BARRIER)) 544 (journal->j_flags & JBD2_BARRIER))
535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 545 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
536 BLKDEV_IFL_WAIT);
537 if (!(journal->j_flags & JBD2_ABORT)) 546 if (!(journal->j_flags & JBD2_ABORT))
538 jbd2_journal_update_superblock(journal, 1); 547 jbd2_journal_update_superblock(journal, 1);
539 return 0; 548 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..f3ad1598b201 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/bitops.h>
29#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31#include <asm/system.h>
30 32
31/* 33/*
32 * Default IO end handler for temporary BJ_IO buffer_heads. 34 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -134,25 +136,11 @@ static int journal_submit_commit_record(journal_t *journal,
134 136
135 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); 140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
139 if (ret == -EOPNOTSUPP) { 141 else
140 printk(KERN_WARNING
141 "JBD2: Disabling barriers on %s, "
142 "not supported by device\n", journal->j_devname);
143 write_lock(&journal->j_state_lock);
144 journal->j_flags &= ~JBD2_BARRIER;
145 write_unlock(&journal->j_state_lock);
146
147 /* And try again, without the barrier */
148 lock_buffer(bh);
149 set_buffer_uptodate(bh);
150 clear_buffer_dirty(bh);
151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
152 }
153 } else {
154 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC_PLUG, bh);
155 } 143
156 *cbh = bh; 144 *cbh = bh;
157 return ret; 145 return ret;
158} 146}
@@ -166,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
166{ 154{
167 int ret = 0; 155 int ret = 0;
168 156
169retry:
170 clear_buffer_dirty(bh); 157 clear_buffer_dirty(bh);
171 wait_on_buffer(bh); 158 wait_on_buffer(bh);
172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
173 printk(KERN_WARNING
174 "JBD2: %s: disabling barries on %s - not supported "
175 "by device\n", __func__, journal->j_devname);
176 write_lock(&journal->j_state_lock);
177 journal->j_flags &= ~JBD2_BARRIER;
178 write_unlock(&journal->j_state_lock);
179
180 lock_buffer(bh);
181 clear_buffer_dirty(bh);
182 set_buffer_uptodate(bh);
183 bh->b_end_io = journal_end_buffer_io_sync;
184
185 ret = submit_bh(WRITE_SYNC_PLUG, bh);
186 if (ret) {
187 unlock_buffer(bh);
188 return ret;
189 }
190 goto retry;
191 }
192 159
193 if (unlikely(!buffer_uptodate(bh))) 160 if (unlikely(!buffer_uptodate(bh)))
194 ret = -EIO; 161 ret = -EIO;
@@ -236,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
236 spin_lock(&journal->j_list_lock); 203 spin_lock(&journal->j_list_lock);
237 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 204 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
238 mapping = jinode->i_vfs_inode->i_mapping; 205 mapping = jinode->i_vfs_inode->i_mapping;
239 jinode->i_flags |= JI_COMMIT_RUNNING; 206 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 spin_unlock(&journal->j_list_lock); 207 spin_unlock(&journal->j_list_lock);
241 /* 208 /*
242 * submit the inode data buffers. We use writepage 209 * submit the inode data buffers. We use writepage
@@ -251,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
251 spin_lock(&journal->j_list_lock); 218 spin_lock(&journal->j_list_lock);
252 J_ASSERT(jinode->i_transaction == commit_transaction); 219 J_ASSERT(jinode->i_transaction == commit_transaction);
253 commit_transaction->t_flushed_data_blocks = 1; 220 commit_transaction->t_flushed_data_blocks = 1;
254 jinode->i_flags &= ~JI_COMMIT_RUNNING; 221 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
222 smp_mb__after_clear_bit();
255 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 223 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
256 } 224 }
257 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
@@ -272,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
272 /* For locking, see the comment in journal_submit_data_buffers() */ 240 /* For locking, see the comment in journal_submit_data_buffers() */
273 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
274 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 242 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
275 jinode->i_flags |= JI_COMMIT_RUNNING; 243 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
276 spin_unlock(&journal->j_list_lock); 244 spin_unlock(&journal->j_list_lock);
277 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 245 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
278 if (err) { 246 if (err) {
@@ -288,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
288 ret = err; 256 ret = err;
289 } 257 }
290 spin_lock(&journal->j_list_lock); 258 spin_lock(&journal->j_list_lock);
291 jinode->i_flags &= ~JI_COMMIT_RUNNING; 259 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
260 smp_mb__after_clear_bit();
292 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 261 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
293 } 262 }
294 263
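
Making JI_COMMIT_RUNNING a real atomic bit (__JI_COMMIT_RUNNING) is what lets the wake_up_bit() pairing work; the smp_mb__after_clear_bit() is needed because clear_bit() is not a memory barrier, and the waiter must observe the clear before its test. A sketch of the waiter side, the shape jbd2_journal_release_jbd_inode() uses elsewhere in this file (locking around schedule() omitted):

    DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
    wait_queue_head_t *wq = bit_waitqueue(&jinode->i_flags,
                                          __JI_COMMIT_RUNNING);

    prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
    if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags))
            schedule();     /* woken by wake_up_bit() on the commit side */
    finish_wait(wq, &wait.wait);
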
@@ -360,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
360 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
361 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
362 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
363 int write_op = WRITE; 332 int write_op = WRITE_SYNC;
364 333
365 /* 334 /*
366 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -701,6 +670,16 @@ start_journal_io:
701 } 670 }
702 } 671 }
703 672
673 err = journal_finish_inode_data_buffers(journal, commit_transaction);
674 if (err) {
675 printk(KERN_WARNING
676 "JBD2: Detected IO errors while flushing file data "
677 "on %s\n", journal->j_devname);
678 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
679 jbd2_journal_abort(journal, err);
680 err = 0;
681 }
682
704 /* 683 /*
705 * If the journal is not located on the file system device, 684 * If the journal is not located on the file system device,
706 * then we must flush the file system device before we issue 685 * then we must flush the file system device before we issue
@@ -709,8 +688,7 @@ start_journal_io:
709 if (commit_transaction->t_flushed_data_blocks && 688 if (commit_transaction->t_flushed_data_blocks &&
710 (journal->j_fs_dev != journal->j_dev) && 689 (journal->j_fs_dev != journal->j_dev) &&
711 (journal->j_flags & JBD2_BARRIER)) 690 (journal->j_flags & JBD2_BARRIER))
712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 691 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
713 BLKDEV_IFL_WAIT);
714 692
715 /* Done it all: now write the commit record asynchronously. */ 693 /* Done it all: now write the commit record asynchronously. */
716 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 694 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -719,19 +697,6 @@ start_journal_io:
719 &cbh, crc32_sum); 697 &cbh, crc32_sum);
720 if (err) 698 if (err)
721 __jbd2_journal_abort_hard(journal); 699 __jbd2_journal_abort_hard(journal);
722 if (journal->j_flags & JBD2_BARRIER)
723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
725 }
726
727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
728 if (err) {
729 printk(KERN_WARNING
730 "JBD2: Detected IO errors while flushing file data "
731 "on %s\n", journal->j_devname);
732 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
733 jbd2_journal_abort(journal, err);
734 err = 0;
735 } 700 }
736 701
737 /* Lo and behold: we have just managed to send a transaction to 702 /* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +810,11 @@ wait_for_iobuf:
845 } 810 }
846 if (!err && !is_journal_aborted(journal)) 811 if (!err && !is_journal_aborted(journal))
847 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
815 journal->j_flags & JBD2_BARRIER) {
816 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
817 }
848 818
849 if (err) 819 if (err)
850 jbd2_journal_abort(journal, err); 820 jbd2_journal_abort(journal, err);
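
The JI_COMMIT_RUNNING changes above move from plain bit arithmetic on i_flags to the kernel's bit-waitqueue pattern: the waker clears the bit atomically, issues smp_mb__after_clear_bit() so the clear is ordered before any waiter's re-test, then calls wake_up_bit(); the waiter pairs test_bit() with the bit waitqueue, exactly as jbd2_journal_release_jbd_inode() does further down. A minimal sketch of the pattern, with struct obj and __OBJ_BUSY as illustrative stand-ins rather than names from this patch:

    #include <linux/bitops.h>
    #include <linux/wait.h>
    #include <linux/sched.h>

    #define __OBJ_BUSY 0 /* bit number within obj->flags (illustrative) */

    struct obj {
            unsigned long flags;
    };

    /* Waker: clear the bit, order the clear, then wake bit-waiters. */
    static void obj_done(struct obj *o)
    {
            clear_bit(__OBJ_BUSY, &o->flags);
            smp_mb__after_clear_bit(); /* pairs with the waiter's test_bit() */
            wake_up_bit(&o->flags, __OBJ_BUSY);
    }

    /* Waiter: sleep until the bit is clear. */
    static void obj_wait(struct obj *o)
    {
            wait_queue_head_t *wq = bit_waitqueue(&o->flags, __OBJ_BUSY);
            DEFINE_WAIT_BIT(wait, &o->flags, __OBJ_BUSY);

            for (;;) {
                    prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                    if (!test_bit(__OBJ_BUSY, &o->flags))
                            break;
                    schedule();
            }
            finish_wait(wq, &wait.wait);
    }

Without the barrier, the store clearing the bit could be reordered past the waiter's test on weakly ordered CPUs and the wakeup could be missed.
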
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..9e4686900f18 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -42,12 +42,15 @@
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h>
46#include <linux/ratelimit.h>
45 47
46#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
47#include <trace/events/jbd2.h> 49#include <trace/events/jbd2.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/page.h> 52#include <asm/page.h>
53#include <asm/system.h>
51 54
52EXPORT_SYMBOL(jbd2_journal_extend); 55EXPORT_SYMBOL(jbd2_journal_extend);
53EXPORT_SYMBOL(jbd2_journal_stop); 56EXPORT_SYMBOL(jbd2_journal_stop);
@@ -91,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
91EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
92EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
93EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache);
94 98
95static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
96static void __journal_abort_soft (journal_t *journal, int errno); 100static void __journal_abort_soft (journal_t *journal, int errno);
@@ -478,7 +482,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
478 */ 482 */
479 if (!tid_geq(journal->j_commit_request, target)) { 483 if (!tid_geq(journal->j_commit_request, target)) {
480 /* 484 /*
481 * We want a new commit: OK, mark the request and wakup the 485 * We want a new commit: OK, mark the request and wakeup the
482 * commit thread. We do _not_ do the commit ourselves. 486 * commit thread. We do _not_ do the commit ourselves.
483 */ 487 */
484 488
@@ -825,7 +829,7 @@ static journal_t * journal_init_common (void)
825 829
826 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 830 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
827 if (!journal) 831 if (!journal)
828 goto fail; 832 return NULL;
829 833
830 init_waitqueue_head(&journal->j_wait_transaction_locked); 834 init_waitqueue_head(&journal->j_wait_transaction_locked);
831 init_waitqueue_head(&journal->j_wait_logspace); 835 init_waitqueue_head(&journal->j_wait_logspace);
@@ -850,14 +854,12 @@ static journal_t * journal_init_common (void)
850 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 854 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
851 if (err) { 855 if (err) {
852 kfree(journal); 856 kfree(journal);
853 goto fail; 857 return NULL;
854 } 858 }
855 859
856 spin_lock_init(&journal->j_history_lock); 860 spin_lock_init(&journal->j_history_lock);
857 861
858 return journal; 862 return journal;
859fail:
860 return NULL;
861} 863}
862 864
863/* jbd2_journal_init_dev and jbd2_journal_init_inode: 865/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -897,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
897 899
898 /* journal descriptor can store up to n blocks -bzzz */ 900 /* journal descriptor can store up to n blocks -bzzz */
899 journal->j_blocksize = blocksize; 901 journal->j_blocksize = blocksize;
902 journal->j_dev = bdev;
903 journal->j_fs_dev = fs_dev;
904 journal->j_blk_offset = start;
905 journal->j_maxlen = len;
906 bdevname(journal->j_dev, journal->j_devname);
907 p = journal->j_devname;
908 while ((p = strchr(p, '/')))
909 *p = '!';
900 jbd2_stats_proc_init(journal); 910 jbd2_stats_proc_init(journal);
901 n = journal->j_blocksize / sizeof(journal_block_tag_t); 911 n = journal->j_blocksize / sizeof(journal_block_tag_t);
902 journal->j_wbufsize = n; 912 journal->j_wbufsize = n;
@@ -906,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
906 __func__); 916 __func__);
907 goto out_err; 917 goto out_err;
908 } 918 }
909 journal->j_dev = bdev;
910 journal->j_fs_dev = fs_dev;
911 journal->j_blk_offset = start;
912 journal->j_maxlen = len;
913 bdevname(journal->j_dev, journal->j_devname);
914 p = journal->j_devname;
915 while ((p = strchr(p, '/')))
916 *p = '!';
917 919
918 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 920 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
919 if (!bh) { 921 if (!bh) {
@@ -1371,6 +1373,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1371 1373
1372 if (!compat && !ro && !incompat) 1374 if (!compat && !ro && !incompat)
1373 return 1; 1375 return 1;
1376 /* Load journal superblock if it is not loaded yet. */
1377 if (journal->j_format_version == 0 &&
1378 journal_get_superblock(journal) != 0)
1379 return 0;
1374 if (journal->j_format_version == 1) 1380 if (journal->j_format_version == 1)
1375 return 0; 1381 return 0;
1376 1382
@@ -1832,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
1832 */ 1838 */
1833#define JBD2_MAX_SLABS 8 1839#define JBD2_MAX_SLABS 8
1834static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; 1840static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1835static DECLARE_MUTEX(jbd2_slab_create_sem);
1836 1841
1837static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { 1842static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1838 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", 1843 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1853,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
1853 1858
1854static int jbd2_journal_create_slab(size_t size) 1859static int jbd2_journal_create_slab(size_t size)
1855{ 1860{
1861 static DEFINE_MUTEX(jbd2_slab_create_mutex);
1856 int i = order_base_2(size) - 10; 1862 int i = order_base_2(size) - 10;
1857 size_t slab_size; 1863 size_t slab_size;
1858 1864
@@ -1864,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
1864 1870
1865 if (unlikely(i < 0)) 1871 if (unlikely(i < 0))
1866 i = 0; 1872 i = 0;
1867 down(&jbd2_slab_create_sem); 1873 mutex_lock(&jbd2_slab_create_mutex);
1868 if (jbd2_slab[i]) { 1874 if (jbd2_slab[i]) {
1869 up(&jbd2_slab_create_sem); 1875 mutex_unlock(&jbd2_slab_create_mutex);
1870 return 0; /* Already created */ 1876 return 0; /* Already created */
1871 } 1877 }
1872 1878
1873 slab_size = 1 << (i+10); 1879 slab_size = 1 << (i+10);
1874 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, 1880 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1875 slab_size, 0, NULL); 1881 slab_size, 0, NULL);
1876 up(&jbd2_slab_create_sem); 1882 mutex_unlock(&jbd2_slab_create_mutex);
1877 if (!jbd2_slab[i]) { 1883 if (!jbd2_slab[i]) {
1878 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); 1884 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1879 return -ENOMEM; 1885 return -ENOMEM;
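
DECLARE_MUTEX() actually declared a counting semaphore initialized to 1, so down()/up() carried none of the strict ownership semantics or lockdep coverage of a real mutex; the hunk above swaps it for a function-local static DEFINE_MUTEX(). The transformation, in sketch form (create_something() is an illustrative name):

    #include <linux/mutex.h>

    /* Before: static DECLARE_MUTEX(sem); down(&sem); ... up(&sem); */
    static int create_something(void)
    {
            static DEFINE_MUTEX(create_mutex); /* scoped to its only user */

            mutex_lock(&create_mutex);
            /* ...slow path that must be single-threaded... */
            mutex_unlock(&create_mutex);
            return 0;
    }
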
@@ -1976,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1976static struct journal_head *journal_alloc_journal_head(void) 1982static struct journal_head *journal_alloc_journal_head(void)
1977{ 1983{
1978 struct journal_head *ret; 1984 struct journal_head *ret;
1979 static unsigned long last_warning;
1980 1985
1981#ifdef CONFIG_JBD2_DEBUG 1986#ifdef CONFIG_JBD2_DEBUG
1982 atomic_inc(&nr_journal_heads); 1987 atomic_inc(&nr_journal_heads);
@@ -1984,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1984 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1989 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1985 if (!ret) { 1990 if (!ret) {
1986 jbd_debug(1, "out of memory for journal_head\n"); 1991 jbd_debug(1, "out of memory for journal_head\n");
1987 if (time_after(jiffies, last_warning + 5*HZ)) { 1992 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
1988 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1989 __func__);
1990 last_warning = jiffies;
1991 }
1992 while (!ret) { 1993 while (!ret) {
1993 yield(); 1994 yield();
1994 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1995 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
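
pr_notice_ratelimited() (hence the new <linux/ratelimit.h> include at the top of this file's diff) replaces the hand-rolled once-per-5*HZ jiffies throttle; the default ratelimit state allows a short burst of messages per interval and silently drops the rest. A sketch, with demo_cache as an illustrative cache pointer:

    #include <linux/ratelimit.h>
    #include <linux/slab.h>

    static void *alloc_with_warning(struct kmem_cache *demo_cache)
    {
            void *p = kmem_cache_alloc(demo_cache, GFP_NOFS);

            if (!p) /* throttled internally; no static jiffies bookkeeping */
                    pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
            return p;
    }
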
@@ -2206,7 +2207,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
2206restart: 2207restart:
2207 spin_lock(&journal->j_list_lock); 2208 spin_lock(&journal->j_list_lock);
2208 /* Is commit writing out inode - we have to wait */ 2209 /* Is commit writing out inode - we have to wait */
2209 if (jinode->i_flags & JI_COMMIT_RUNNING) { 2210 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
2210 wait_queue_head_t *wq; 2211 wait_queue_head_t *wq;
2211 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2212 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2212 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2213 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -2286,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2286 2287
2287#endif 2288#endif
2288 2289
2289struct kmem_cache *jbd2_handle_cache; 2290struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2290 2291
2291static int __init journal_init_handle_cache(void) 2292static int __init journal_init_handle_cache(void)
2292{ 2293{
2293 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2294 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2294 sizeof(handle_t),
2295 0, /* offset */
2296 SLAB_TEMPORARY, /* flags */
2297 NULL); /* ctor */
2298 if (jbd2_handle_cache == NULL) { 2295 if (jbd2_handle_cache == NULL) {
2299 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2296 printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2297 return -ENOMEM;
2298 }
2299 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2300 if (jbd2_inode_cache == NULL) {
2301 printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2302 kmem_cache_destroy(jbd2_handle_cache);
2300 return -ENOMEM; 2303 return -ENOMEM;
2301 } 2304 }
2302 return 0; 2305 return 0;
@@ -2306,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
2306{ 2309{
2307 if (jbd2_handle_cache) 2310 if (jbd2_handle_cache)
2308 kmem_cache_destroy(jbd2_handle_cache); 2311 kmem_cache_destroy(jbd2_handle_cache);
2312 if (jbd2_inode_cache)
2313 kmem_cache_destroy(jbd2_inode_cache);
2314
2309} 2315}
2310 2316
2311/* 2317/*
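
KMEM_CACHE(struct_name, flags) expands to kmem_cache_create() with the cache name, object size, and alignment all derived from the struct itself, which is why the five-argument call collapses to one line above. An equivalent sketch (struct jbd2_demo is illustrative):

    #include <linux/slab.h>

    struct jbd2_demo {
            int a;
            void *b;
    };

    static struct kmem_cache *demo_cache;

    static int __init demo_cache_init(void)
    {
            /* Same as kmem_cache_create("jbd2_demo", sizeof(struct jbd2_demo),
             * __alignof__(struct jbd2_demo), 0, NULL); */
            demo_cache = KMEM_CACHE(jbd2_demo, 0);
            return demo_cache ? 0 : -ENOMEM;
    }
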
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f1..1cad869494f0 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302#endif
303 jbd_debug(1, 302 jbd_debug(1,
304 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD: ignoring %d transaction%s from the journal.\n",
305 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
307 } 307 }
308 308
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3479d6e0a83..faad2bd787c7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -156,6 +156,7 @@ alloc_transaction:
156 */ 156 */
157repeat: 157repeat:
158 read_lock(&journal->j_state_lock); 158 read_lock(&journal->j_state_lock);
159 BUG_ON(journal->j_flags & JBD2_UNMOUNT);
159 if (is_journal_aborted(journal) || 160 if (is_journal_aborted(journal) ||
160 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 161 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
161 read_unlock(&journal->j_state_lock); 162 read_unlock(&journal->j_state_lock);
@@ -250,7 +251,7 @@ repeat:
250 * the committing transaction. Really, we only need to give it 251 * the committing transaction. Really, we only need to give it
251 * committing_transaction->t_outstanding_credits plus "enough" for 252 * committing_transaction->t_outstanding_credits plus "enough" for
252 * the log control blocks. 253 * the log control blocks.
253 * Also, this test is inconsitent with the matching one in 254 * Also, this test is inconsistent with the matching one in
254 * jbd2_journal_extend(). 255 * jbd2_journal_extend().
255 */ 256 */
256 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
@@ -339,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
339 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
340 current->journal_info = NULL; 341 current->journal_info = NULL;
341 handle = ERR_PTR(err); 342 handle = ERR_PTR(err);
342 goto out;
343 } 343 }
344out:
345 return handle; 344 return handle;
346} 345}
347EXPORT_SYMBOL(jbd2__journal_start); 346EXPORT_SYMBOL(jbd2__journal_start);
@@ -588,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
588 transaction = handle->h_transaction; 587 transaction = handle->h_transaction;
589 journal = transaction->t_journal; 588 journal = transaction->t_journal;
590 589
591 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 590 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
592 591
593 JBUFFER_TRACE(jh, "entry"); 592 JBUFFER_TRACE(jh, "entry");
594repeat: 593repeat:
@@ -773,7 +772,7 @@ done:
773 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 772 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
774 "Possible IO failure.\n"); 773 "Possible IO failure.\n");
775 page = jh2bh(jh)->b_page; 774 page = jh2bh(jh)->b_page;
776 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 775 offset = offset_in_page(jh2bh(jh)->b_data);
777 source = kmap_atomic(page, KM_USER0); 776 source = kmap_atomic(page, KM_USER0);
778 /* Fire data frozen trigger just before we copy the data */ 777 /* Fire data frozen trigger just before we copy the data */
779 jbd2_buffer_frozen_trigger(jh, source + offset, 778 jbd2_buffer_frozen_trigger(jh, source + offset,
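
offset_in_page(p) is simply ((unsigned long)p & ~PAGE_MASK), so the do_get_write_access() change above is cosmetic; with 4 KiB pages, offset_in_page((void *)0x12345678) is 0x678. Typical use next to an atomic kmap, sketched here (copy_from_bh() is an illustrative helper, and the two-argument kmap_atomic()/kunmap_atomic() form with KM_USER0 matches this kernel era):

    #include <linux/buffer_head.h>
    #include <linux/highmem.h>
    #include <linux/string.h>

    static void copy_from_bh(struct buffer_head *bh, void *dst, size_t len)
    {
            struct page *page = bh->b_page;
            size_t off = offset_in_page(bh->b_data); /* low bits of the address */
            char *src = kmap_atomic(page, KM_USER0);

            memcpy(dst, src + off, len);
            kunmap_atomic(src, KM_USER0);
    }
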
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bbd..95b79672150a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_check_acl(struct inode *inode, int mask) 262int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
266 266
267 if (flags & IPERM_FLAG_RCU)
268 return -ECHILD;
269
267 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); 270 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
268 if (IS_ERR(acl)) 271 if (IS_ERR(acl))
269 return PTR_ERR(acl); 272 return PTR_ERR(acl);
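
The new unsigned int flags argument exists because permission checks can now be called during RCU-walk path lookup, where the caller holds no locks or references and must not sleep; returning -ECHILD tells the VFS to retry in ref-walk mode. Every ->check_acl() touched by this patch follows the same shape, sketched here (demo_get_acl() stands in for the filesystem's own ACL fetch):

    #include <linux/fs.h>
    #include <linux/posix_acl.h>

    static int demo_check_acl(struct inode *inode, int mask, unsigned int flags)
    {
            struct posix_acl *acl;

            if (flags & IPERM_FLAG_RCU)
                    return -ECHILD; /* may block below; ask for ref-walk retry */

            acl = demo_get_acl(inode, ACL_TYPE_ACCESS); /* illustrative helper */
            if (IS_ERR(acl))
                    return PTR_ERR(acl);
            if (acl) {
                    int rc = posix_acl_permission(inode, acl, mask);
                    posix_acl_release(acl);
                    return rc;
            }
            return -EAGAIN; /* no ACL: fall back to mode-bit checks */
    }
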
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d9541..3119f59253d3 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_check_acl(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int, unsigned int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a906f538d11c..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
23static inline struct jffs2_inode_cache * 23static inline struct jffs2_inode_cache *
24first_inode_chain(int *i, struct jffs2_sb_info *c) 24first_inode_chain(int *i, struct jffs2_sb_info *c)
25{ 25{
26 for (; *i < INOCACHE_HASHSIZE; (*i)++) { 26 for (; *i < c->inocache_hashsize; (*i)++) {
27 if (c->inocache_list[*i]) 27 if (c->inocache_list[*i])
28 return c->inocache_list[*i]; 28 return c->inocache_list[*i];
29 } 29 }
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; 336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
337#ifndef __ECOS 337#ifndef __ECOS
338 if (jffs2_blocks_use_vmalloc(c)) 338 if (jffs2_blocks_use_vmalloc(c))
339 c->blocks = vmalloc(size); 339 c->blocks = vzalloc(size);
340 else 340 else
341#endif 341#endif
342 c->blocks = kmalloc(size, GFP_KERNEL); 342 c->blocks = kzalloc(size, GFP_KERNEL);
343 if (!c->blocks) 343 if (!c->blocks)
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 memset(c->blocks, 0, size);
347 for (i=0; i<c->nr_blocks; i++) { 346 for (i=0; i<c->nr_blocks; i++) {
348 INIT_LIST_HEAD(&c->blocks[i].list); 347 INIT_LIST_HEAD(&c->blocks[i].list);
349 c->blocks[i].offset = i * c->sector_size; 348 c->blocks[i].offset = i * c->sector_size;
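
vzalloc() and kzalloc() fold the zeroing into the allocation, which drops the separate memset() above and lets the allocator hand back already-clear pages where it can. The transformation, sketched as a helper (alloc_blocks() is illustrative):

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    static void *alloc_blocks(size_t size, bool use_vmalloc)
    {
            /* Before: p = vmalloc(size) or kmalloc(size, GFP_KERNEL),
             * then memset(p, 0, size); */
            return use_vmalloc ? vzalloc(size) : kzalloc(size, GFP_KERNEL);
    }
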
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 617a1e5694c1..de4247021d25 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
103 spin_unlock(&jffs2_compressor_list_lock); 103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen; 104 *datalen = orig_slen;
105 *cdatalen = orig_dlen; 105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL); 106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock); 107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--; 108 this->usecount--;
109 if (!compr_ret) { 109 if (!compr_ret) {
@@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
152 spin_unlock(&jffs2_compressor_list_lock); 152 spin_unlock(&jffs2_compressor_list_lock);
153 *datalen = orig_slen; 153 *datalen = orig_slen;
154 *cdatalen = orig_dlen; 154 *cdatalen = orig_dlen;
155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL); 155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
156 spin_lock(&jffs2_compressor_list_lock); 156 spin_lock(&jffs2_compressor_list_lock);
157 this->usecount--; 157 this->usecount--;
158 if (!compr_ret) { 158 if (!compr_ret) {
@@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
220 if (comprtype == this->compr) { 220 if (comprtype == this->compr) {
221 this->usecount++; 221 this->usecount++;
222 spin_unlock(&jffs2_compressor_list_lock); 222 spin_unlock(&jffs2_compressor_list_lock);
223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL); 223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
224 spin_lock(&jffs2_compressor_list_lock); 224 spin_lock(&jffs2_compressor_list_lock);
225 if (ret) { 225 if (ret) {
226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); 226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index e471a9106fd9..13bb7597ab39 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -49,9 +49,9 @@ struct jffs2_compressor {
49 char *name; 49 char *name;
50 char compr; /* JFFS2_COMPR_XXX */ 50 char compr; /* JFFS2_COMPR_XXX */
51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out, 51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
52 uint32_t *srclen, uint32_t *destlen, void *model); 52 uint32_t *srclen, uint32_t *destlen);
53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, 53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
54 uint32_t cdatalen, uint32_t datalen, void *model); 54 uint32_t cdatalen, uint32_t datalen);
55 int usecount; 55 int usecount;
56 int disabled; /* if set the compressor won't compress */ 56 int disabled; /* if set the compressor won't compress */
57 unsigned char *compr_buf; /* used by size compr. mode */ 57 unsigned char *compr_buf; /* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index ed25ae7c98eb..af186ee674d8 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -42,7 +42,7 @@ static int __init alloc_workspace(void)
42} 42}
43 43
44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, 44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
45 uint32_t *sourcelen, uint32_t *dstlen, void *model) 45 uint32_t *sourcelen, uint32_t *dstlen)
46{ 46{
47 size_t compress_size; 47 size_t compress_size;
48 int ret; 48 int ret;
@@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
67} 67}
68 68
69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, 69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
70 uint32_t srclen, uint32_t destlen, void *model) 70 uint32_t srclen, uint32_t destlen)
71{ 71{
72 size_t dl = destlen; 72 size_t dl = destlen;
73 int ret; 73 int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 9696ad9ef5f7..16a5047903a6 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -31,8 +31,7 @@
31/* _compress returns the compressed size, -1 if bigger */ 31/* _compress returns the compressed size, -1 if bigger */
32static int jffs2_rtime_compress(unsigned char *data_in, 32static int jffs2_rtime_compress(unsigned char *data_in,
33 unsigned char *cpage_out, 33 unsigned char *cpage_out,
34 uint32_t *sourcelen, uint32_t *dstlen, 34 uint32_t *sourcelen, uint32_t *dstlen)
35 void *model)
36{ 35{
37 short positions[256]; 36 short positions[256];
38 int outpos = 0; 37 int outpos = 0;
@@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
73 72
74static int jffs2_rtime_decompress(unsigned char *data_in, 73static int jffs2_rtime_decompress(unsigned char *data_in,
75 unsigned char *cpage_out, 74 unsigned char *cpage_out,
76 uint32_t srclen, uint32_t destlen, 75 uint32_t srclen, uint32_t destlen)
77 void *model)
78{ 76{
79 short positions[256]; 77 short positions[256];
80 int outpos = 0; 78 int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index a12b4f763373..9e7cec808c4c 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
298#if 0 298#if 0
299/* _compress returns the compressed size, -1 if bigger */ 299/* _compress returns the compressed size, -1 if bigger */
300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
301 uint32_t *sourcelen, uint32_t *dstlen, void *model) 301 uint32_t *sourcelen, uint32_t *dstlen)
302{ 302{
303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, 303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
304 cpage_out, sourcelen, dstlen); 304 cpage_out, sourcelen, dstlen);
@@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
306#endif 306#endif
307static int jffs2_dynrubin_compress(unsigned char *data_in, 307static int jffs2_dynrubin_compress(unsigned char *data_in,
308 unsigned char *cpage_out, 308 unsigned char *cpage_out,
309 uint32_t *sourcelen, uint32_t *dstlen, 309 uint32_t *sourcelen, uint32_t *dstlen)
310 void *model)
311{ 310{
312 int bits[8]; 311 int bits[8];
313 unsigned char histo[256]; 312 unsigned char histo[256];
@@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
387 386
388static int jffs2_rubinmips_decompress(unsigned char *data_in, 387static int jffs2_rubinmips_decompress(unsigned char *data_in,
389 unsigned char *cpage_out, 388 unsigned char *cpage_out,
390 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen)
391 void *model)
392{ 390{
393 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, 391 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
394 cpage_out, sourcelen, dstlen); 392 cpage_out, sourcelen, dstlen);
@@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
397 395
398static int jffs2_dynrubin_decompress(unsigned char *data_in, 396static int jffs2_dynrubin_decompress(unsigned char *data_in,
399 unsigned char *cpage_out, 397 unsigned char *cpage_out,
400 uint32_t sourcelen, uint32_t dstlen, 398 uint32_t sourcelen, uint32_t dstlen)
401 void *model)
402{ 399{
403 int bits[8]; 400 int bits[8];
404 int c; 401 int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 97fc45de6f81..fd05a0b9431d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -68,8 +68,7 @@ static void free_workspaces(void)
68 68
69static int jffs2_zlib_compress(unsigned char *data_in, 69static int jffs2_zlib_compress(unsigned char *data_in,
70 unsigned char *cpage_out, 70 unsigned char *cpage_out,
71 uint32_t *sourcelen, uint32_t *dstlen, 71 uint32_t *sourcelen, uint32_t *dstlen)
72 void *model)
73{ 72{
74 int ret; 73 int ret;
75 74
@@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
136 135
137static int jffs2_zlib_decompress(unsigned char *data_in, 136static int jffs2_zlib_decompress(unsigned char *data_in,
138 unsigned char *cpage_out, 137 unsigned char *cpage_out,
139 uint32_t srclen, uint32_t destlen, 138 uint32_t srclen, uint32_t destlen)
140 void *model)
141{ 139{
142 int ret; 140 int ret;
143 int wbits = MAX_WBITS; 141 int wbits = MAX_WBITS;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..92978658ed18 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
289 mutex_unlock(&f->sem); 289 mutex_unlock(&f->sem);
290 d_instantiate(dentry, old_dentry->d_inode); 290 d_instantiate(dentry, old_dentry->d_inode);
291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
292 atomic_inc(&old_dentry->d_inode->i_count); 292 ihold(old_dentry->d_inode);
293 } 293 }
294 return ret; 294 return ret;
295} 295}
@@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
367 } 367 }
368 368
369 /* We use f->target field to store the target path. */ 369 /* We use f->target field to store the target path. */
370 f->target = kmalloc(targetlen + 1, GFP_KERNEL); 370 f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
371 if (!f->target) { 371 if (!f->target) {
372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
373 mutex_unlock(&f->sem); 373 mutex_unlock(&f->sem);
@@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
376 goto fail; 376 goto fail;
377 } 377 }
378 378
379 memcpy(f->target, target, targetlen + 1);
380 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); 379 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
381 380
382 /* No data here. Only a metadata node, which will be 381 /* No data here. Only a metadata node, which will be
@@ -864,7 +863,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
864 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); 863 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
865 /* Might as well let the VFS know */ 864 /* Might as well let the VFS know */
866 d_instantiate(new_dentry, old_dentry->d_inode); 865 d_instantiate(new_dentry, old_dentry->d_inode);
867 atomic_inc(&old_dentry->d_inode->i_count); 866 ihold(old_dentry->d_inode);
868 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); 867 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
869 return ret; 868 return ret;
870 } 869 }
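
ihold() is now the sanctioned way to take an extra inode reference (i_count stopped being a bare atomic_t for filesystems to increment directly), and kmemdup() in the symlink hunk similarly folds a kmalloc()+memcpy() pair into one call. The link-path usage, sketched (demo_link() is illustrative):

    #include <linux/fs.h>

    static int demo_link(struct dentry *old_dentry, struct dentry *new_dentry)
    {
            struct inode *inode = old_dentry->d_inode;

            ihold(inode); /* was: atomic_inc(&inode->i_count) */
            d_instantiate(new_dentry, inode); /* consumes that reference */
            return 0;
    }
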
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index abac961f617b..e513f1913c15 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
151 } 151 }
152 152
153 /* Be nice */ 153 /* Be nice */
154 yield(); 154 cond_resched();
155 mutex_lock(&c->erase_free_sem); 155 mutex_lock(&c->erase_free_sem);
156 spin_lock(&c->erase_completion_lock); 156 spin_lock(&c->erase_completion_lock);
157 } 157 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..e896e67767eb 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -21,7 +21,6 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/vfs.h> 22#include <linux/vfs.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/smp_lock.h>
25#include "nodelist.h" 24#include "nodelist.h"
26 25
27static int jffs2_flash_setup(struct jffs2_sb_info *c); 26static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -391,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
391 This also catches the case where it was stopped and this 390 This also catches the case where it was stopped and this
392 is just a remount to restart it. 391 is just a remount to restart it.
393 Flush the writebuffer, if neccecary, else we loose it */ 392 Flush the writebuffer, if neccecary, else we loose it */
394 lock_kernel();
395 if (!(sb->s_flags & MS_RDONLY)) { 393 if (!(sb->s_flags & MS_RDONLY)) {
396 jffs2_stop_garbage_collect_thread(c); 394 jffs2_stop_garbage_collect_thread(c);
397 mutex_lock(&c->alloc_sem); 395 mutex_lock(&c->alloc_sem);
@@ -403,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
403 jffs2_start_garbage_collect_thread(c); 401 jffs2_start_garbage_collect_thread(c);
404 402
405 *flags |= MS_NOATIME; 403 *flags |= MS_NOATIME;
406
407 unlock_kernel();
408 return 0; 404 return 0;
409} 405}
410 406
@@ -478,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
478 return inode; 474 return inode;
479} 475}
480 476
477static int calculate_inocache_hashsize(uint32_t flash_size)
478{
479 /*
480 * Pick a inocache hash size based on the size of the medium.
481 * Count how many megabytes we're dealing with, apply a hashsize twice
482 * that size, but rounding down to the usual big powers of 2. And keep
483 * to sensible bounds.
484 */
485
486 int size_mb = flash_size / 1024 / 1024;
487 int hashsize = (size_mb * 2) & ~0x3f;
488
489 if (hashsize < INOCACHE_HASHSIZE_MIN)
490 return INOCACHE_HASHSIZE_MIN;
491 if (hashsize > INOCACHE_HASHSIZE_MAX)
492 return INOCACHE_HASHSIZE_MAX;
493
494 return hashsize;
495}
481 496
482int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) 497int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
483{ 498{
@@ -524,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
524 if (ret) 539 if (ret)
525 return ret; 540 return ret;
526 541
527 c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); 542 c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
543 c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
528 if (!c->inocache_list) { 544 if (!c->inocache_list) {
529 ret = -ENOMEM; 545 ret = -ENOMEM;
530 goto out_wbuf; 546 goto out_wbuf;
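
Worked through, calculate_inocache_hashsize() rounds twice the size in MiB down to a multiple of 64 and clamps: a 32 MiB medium gives 64, clamped up to 128; 256 MiB gives 512; 1 GiB gives 2048, clamped down to 1024. A quick userspace replica of the same arithmetic (not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define INOCACHE_HASHSIZE_MIN 128
    #define INOCACHE_HASHSIZE_MAX 1024

    static int hashsize(uint32_t flash_size)
    {
            int size_mb = flash_size / 1024 / 1024;
            int hs = (size_mb * 2) & ~0x3f; /* round down to a multiple of 64 */

            if (hs < INOCACHE_HASHSIZE_MIN)
                    return INOCACHE_HASHSIZE_MIN;
            if (hs > INOCACHE_HASHSIZE_MAX)
                    return INOCACHE_HASHSIZE_MAX;
            return hs;
    }

    int main(void)
    {
            uint32_t sizes[] = { 32u << 20, 256u << 20, 1024u << 20 };
            int i;

            for (i = 0; i < 3; i++) /* prints 128, 512, 1024 */
                    printf("%u MiB -> %d buckets\n",
                           (unsigned)(sizes[i] >> 20), hashsize(sizes[i]));
            return 0;
    }
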
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 846a79452497..31dce611337c 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
219 if (!list_empty(&c->erase_complete_list) || 219 if (!list_empty(&c->erase_complete_list) ||
220 !list_empty(&c->erase_pending_list)) { 220 !list_empty(&c->erase_pending_list)) {
221 spin_unlock(&c->erase_completion_lock); 221 spin_unlock(&c->erase_completion_lock);
222 mutex_unlock(&c->alloc_sem);
222 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); 223 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
223 if (jffs2_erase_pending_blocks(c, 1)) { 224 if (jffs2_erase_pending_blocks(c, 1))
224 mutex_unlock(&c->alloc_sem);
225 return 0; 225 return 0;
226 } 226
227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); 227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
228 spin_lock(&c->erase_completion_lock); 228 spin_lock(&c->erase_completion_lock);
229 mutex_lock(&c->alloc_sem);
229 } 230 }
230 231
231 /* First, work out which block we're garbage-collecting */ 232 /* First, work out which block we're garbage-collecting */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 6784bc89add1..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */ 100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */
101 101
102 wait_queue_head_t inocache_wq; 102 wait_queue_head_t inocache_wq;
103 int inocache_hashsize;
103 struct jffs2_inode_cache **inocache_list; 104 struct jffs2_inode_cache **inocache_list;
104 spinlock_t inocache_lock; 105 spinlock_t inocache_lock;
105 106
@@ -143,4 +144,4 @@ struct jffs2_sb_info {
143 void *os_priv; 144 void *os_priv;
144}; 145};
145 146
146#endif /* _JFFS2_FB_SB */ 147#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd138469..5e03233c2363 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
420{ 420{
421 struct jffs2_inode_cache *ret; 421 struct jffs2_inode_cache *ret;
422 422
423 ret = c->inocache_list[ino % INOCACHE_HASHSIZE]; 423 ret = c->inocache_list[ino % c->inocache_hashsize];
424 while (ret && ret->ino < ino) { 424 while (ret && ret->ino < ino) {
425 ret = ret->next; 425 ret = ret->next;
426 } 426 }
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
441 441
442 dbg_inocache("add %p (ino #%u)\n", new, new->ino); 442 dbg_inocache("add %p (ino #%u)\n", new, new->ino);
443 443
444 prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; 444 prev = &c->inocache_list[new->ino % c->inocache_hashsize];
445 445
446 while ((*prev) && (*prev)->ino < new->ino) { 446 while ((*prev) && (*prev)->ino < new->ino) {
447 prev = &(*prev)->next; 447 prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
462 dbg_inocache("del %p (ino #%u)\n", old, old->ino); 462 dbg_inocache("del %p (ino #%u)\n", old, old->ino);
463 spin_lock(&c->inocache_lock); 463 spin_lock(&c->inocache_lock);
464 464
465 prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; 465 prev = &c->inocache_list[old->ino % c->inocache_hashsize];
466 466
467 while ((*prev) && (*prev)->ino < old->ino) { 467 while ((*prev) && (*prev)->ino < old->ino) {
468 prev = &(*prev)->next; 468 prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
487 int i; 487 int i;
488 struct jffs2_inode_cache *this, *next; 488 struct jffs2_inode_cache *this, *next;
489 489
490 for (i=0; i<INOCACHE_HASHSIZE; i++) { 490 for (i=0; i < c->inocache_hashsize; i++) {
491 this = c->inocache_list[i]; 491 this = c->inocache_list[i];
492 while (this) { 492 while (this) {
493 next = this->next; 493 next = this->next;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 523a91691052..5a53d9bdb2b5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -199,7 +199,8 @@ struct jffs2_inode_cache {
199#define RAWNODE_CLASS_XATTR_DATUM 1 199#define RAWNODE_CLASS_XATTR_DATUM 1
200#define RAWNODE_CLASS_XATTR_REF 2 200#define RAWNODE_CLASS_XATTR_REF 2
201 201
202#define INOCACHE_HASHSIZE 128 202#define INOCACHE_HASHSIZE_MIN 128
203#define INOCACHE_HASHSIZE_MAX 1024
203 204
204#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) 205#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
205 206
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc36..b632dddcb482 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
20#include "summary.h" 20#include "summary.h"
21#include "debug.h" 21#include "debug.h"
22 22
23#define DEFAULT_EMPTY_SCAN_SIZE 1024 23#define DEFAULT_EMPTY_SCAN_SIZE 256
24 24
25#define noisy_printk(noise, args...) do { \ 25#define noisy_printk(noise, args...) do { \
26 if (*(noise)) { \ 26 if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { 435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
436 struct jffs2_unknown_node *node; 436 struct jffs2_unknown_node *node;
437 struct jffs2_unknown_node crcnode; 437 struct jffs2_unknown_node crcnode;
438 uint32_t ofs, prevofs; 438 uint32_t ofs, prevofs, max_ofs;
439 uint32_t hdr_crc, buf_ofs, buf_len; 439 uint32_t hdr_crc, buf_ofs, buf_len;
440 int err; 440 int err;
441 int noise = 0; 441 int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
550 550
551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ 551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */
552 ofs = 0; 552 ofs = 0;
553 553 max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
554 /* Scan only 4KiB of 0xFF before declaring it's empty */ 554 /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
555 while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) 555 while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
556 ofs += 4; 556 ofs += 4;
557 557
558 if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { 558 if (ofs == max_ofs) {
559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
560 if (jffs2_cleanmarker_oob(c)) { 560 if (jffs2_cleanmarker_oob(c)) {
561 /* scan oob, take care of cleanmarker */ 561 /* scan oob, take care of cleanmarker */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 662bba099501..853b8e300084 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/list.h> 16#include <linux/list.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -41,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
41 return &f->vfs_inode; 40 return &f->vfs_inode;
42} 41}
43 42
44static void jffs2_destroy_inode(struct inode *inode) 43static void jffs2_i_callback(struct rcu_head *head)
45{ 44{
45 struct inode *inode = container_of(head, struct inode, i_rcu);
46 INIT_LIST_HEAD(&inode->i_dentry);
46 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); 47 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
47} 48}
48 49
50static void jffs2_destroy_inode(struct inode *inode)
51{
52 call_rcu(&inode->i_rcu, jffs2_i_callback);
53}
54
49static void jffs2_i_init_once(void *foo) 55static void jffs2_i_init_once(void *foo)
50{ 56{
51 struct jffs2_inode_info *f = foo; 57 struct jffs2_inode_info *f = foo;
@@ -146,6 +152,7 @@ static const struct super_operations jffs2_super_operations =
146static int jffs2_fill_super(struct super_block *sb, void *data, int silent) 152static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
147{ 153{
148 struct jffs2_sb_info *c; 154 struct jffs2_sb_info *c;
155 int ret;
149 156
150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" 157 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
151 " New superblock for device %d (\"%s\")\n", 158 " New superblock for device %d (\"%s\")\n",
@@ -175,15 +182,15 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
175#ifdef CONFIG_JFFS2_FS_POSIX_ACL 182#ifdef CONFIG_JFFS2_FS_POSIX_ACL
176 sb->s_flags |= MS_POSIXACL; 183 sb->s_flags |= MS_POSIXACL;
177#endif 184#endif
178 return jffs2_do_fill_super(sb, data, silent); 185 ret = jffs2_do_fill_super(sb, data, silent);
186 return ret;
179} 187}
180 188
181static int jffs2_get_sb(struct file_system_type *fs_type, 189static struct dentry *jffs2_mount(struct file_system_type *fs_type,
182 int flags, const char *dev_name, 190 int flags, const char *dev_name,
183 void *data, struct vfsmount *mnt) 191 void *data)
184{ 192{
185 return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, 193 return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
186 mnt);
187} 194}
188 195
189static void jffs2_put_super (struct super_block *sb) 196static void jffs2_put_super (struct super_block *sb)
@@ -192,8 +199,6 @@ static void jffs2_put_super (struct super_block *sb)
192 199
193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); 200 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
194 201
195 lock_kernel();
196
197 if (sb->s_dirt) 202 if (sb->s_dirt)
198 jffs2_write_super(sb); 203 jffs2_write_super(sb);
199 204
@@ -215,8 +220,6 @@ static void jffs2_put_super (struct super_block *sb)
215 if (c->mtd->sync) 220 if (c->mtd->sync)
216 c->mtd->sync(c->mtd); 221 c->mtd->sync(c->mtd);
217 222
218 unlock_kernel();
219
220 D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); 223 D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
221} 224}
222 225
@@ -232,7 +235,7 @@ static void jffs2_kill_sb(struct super_block *sb)
232static struct file_system_type jffs2_fs_type = { 235static struct file_system_type jffs2_fs_type = {
233 .owner = THIS_MODULE, 236 .owner = THIS_MODULE,
234 .name = "jffs2", 237 .name = "jffs2",
235 .get_sb = jffs2_get_sb, 238 .mount = jffs2_mount,
236 .kill_sb = jffs2_kill_sb, 239 .kill_sb = jffs2_kill_sb,
237}; 240};
238 241
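
Freeing the inode through call_rcu() is what makes lockless RCU-walk lookups safe: a walker that raced with an unlink may still dereference the inode until a grace period has passed, so the actual kmem_cache_free() is deferred to an RCU callback. Both jffs2 above and jfs below adopt the same pattern, sketched here (demo_inode_cachep is illustrative):

    #include <linux/fs.h>
    #include <linux/slab.h>

    static struct kmem_cache *demo_inode_cachep;

    static void demo_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);

            INIT_LIST_HEAD(&inode->i_dentry); /* expected by the VFS of this era */
            kmem_cache_free(demo_inode_cachep, inode); /* or the fs's container */
    }

    static void demo_destroy_inode(struct inode *inode)
    {
            call_rcu(&inode->i_rcu, demo_i_callback); /* free after grace period */
    }
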
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
152 offset, je32_to_cpu(rx.hdr_crc), crc); 152 offset, je32_to_cpu(rx.hdr_crc), crc);
153 xd->flags |= JFFS2_XFLAGS_INVALID; 153 xd->flags |= JFFS2_XFLAGS_INVALID;
154 return EIO; 154 return -EIO;
155 } 155 }
156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); 156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK 157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
167 je32_to_cpu(rx.xid), xd->xid, 167 je32_to_cpu(rx.xid), xd->xid,
168 je32_to_cpu(rx.version), xd->version); 168 je32_to_cpu(rx.version), xd->version);
169 xd->flags |= JFFS2_XFLAGS_INVALID; 169 xd->flags |= JFFS2_XFLAGS_INVALID;
170 return EIO; 170 return -EIO;
171 } 171 }
172 xd->xprefix = rx.xprefix; 172 xd->xprefix = rx.xprefix;
173 xd->name_len = rx.name_len; 173 xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
230 ref_offset(xd->node), xd->data_crc, crc); 230 ref_offset(xd->node), xd->data_crc, crc);
231 kfree(data); 231 kfree(data);
232 xd->flags |= JFFS2_XFLAGS_INVALID; 232 xd->flags |= JFFS2_XFLAGS_INVALID;
233 return EIO; 233 return -EIO;
234 } 234 }
235 235
236 xd->flags |= JFFS2_XFLAGS_HOT; 236 xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
268 if (xd->xname) 268 if (xd->xname)
269 return 0; 269 return 0;
270 if (xd->flags & JFFS2_XFLAGS_INVALID) 270 if (xd->flags & JFFS2_XFLAGS_INVALID)
271 return EIO; 271 return -EIO;
272 if (unlikely(is_xattr_datum_unchecked(c, xd))) 272 if (unlikely(is_xattr_datum_unchecked(c, xd)))
273 rc = do_verify_xattr_datum(c, xd); 273 rc = do_verify_xattr_datum(c, xd);
274 if (!rc) 274 if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
460 if (crc != je32_to_cpu(rr.node_crc)) { 460 if (crc != je32_to_cpu(rr.node_crc)) {
461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
462 offset, je32_to_cpu(rr.node_crc), crc); 462 offset, je32_to_cpu(rr.node_crc), crc);
463 return EIO; 463 return -EIO;
464 } 464 }
465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK 465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF 466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, 470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, 471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
472 je32_to_cpu(rr.totlen), PAD(sizeof(rr))); 472 je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
473 return EIO; 473 return -EIO;
474 } 474 }
475 ref->ino = je32_to_cpu(rr.ino); 475 ref->ino = je32_to_cpu(rr.ino);
476 ref->xid = je32_to_cpu(rr.xid); 476 ref->xid = je32_to_cpu(rr.xid);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4e..e5de9422fa32 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl;
120
121 if (flags & IPERM_FLAG_RCU)
122 return -ECHILD;
120 123
124 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
121 if (IS_ERR(acl)) 125 if (IS_ERR(acl))
122 return PTR_ERR(acl); 126 return PTR_ERR(acl);
123 if (acl) { 127 if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878d..f9285c4900fa 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int, unsigned int flags);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_acl_chmod(struct inode *inode); 25int jfs_acl_chmod(struct inode *inode);
26 26
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
497 * appear hashed, but do not put on any lists. hlist_del() 497 * appear hashed, but do not put on any lists. hlist_del()
498 * will work fine and require no locking. 498 * will work fine and require no locking.
499 */ 499 */
500 ip->i_hash.pprev = &ip->i_hash.next; 500 hlist_add_fake(&ip->i_hash);
501 501
502 return (ip); 502 return (ip);
503} 503}
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a14516..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1010 * option 2 - shutdown file systems 1010 * option 2 - shutdown file systems
1011 * associated with log ? 1011 * associated with log ?
1012 * option 3 - extend log ? 1012 * option 3 - extend log ?
1013 */
1014 /*
1015 * option 4 - second chance 1013 * option 4 - second chance
1016 * 1014 *
1017 * mark log wrapped, and continue. 1015 * mark log wrapped, and continue.
1018 * when all active transactions are completed, 1016 * when all active transactions are completed,
1019 * mark log vaild for recovery. 1017 * mark log valid for recovery.
1020 * if crashed during invalid state, log state 1018 * if crashed during invalid state, log state
1021 * implies invald log, forcing fsck(). 1019 * implies invalid log, forcing fsck().
1022 */ 1020 */
1023 /* mark log state log wrap in log superblock */ 1021 /* mark log state log wrap in log superblock */
1024 /* log->state = LOGWRAP; */ 1022 /* log->state = LOGWRAP; */
@@ -1122,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
1122 * file systems to log may have n-to-1 relationship; 1120 * file systems to log may have n-to-1 relationship;
1123 */ 1121 */
1124 1122
1125 bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); 1123 bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1124 log);
1126 if (IS_ERR(bdev)) { 1125 if (IS_ERR(bdev)) {
1127 rc = -PTR_ERR(bdev); 1126 rc = -PTR_ERR(bdev);
1128 goto free; 1127 goto free;
1129 } 1128 }
1130 1129
1131 if ((rc = bd_claim(bdev, log))) {
1132 goto close;
1133 }
1134
1135 log->bdev = bdev; 1130 log->bdev = bdev;
1136 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); 1131 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1137 1132
@@ -1139,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
1139 * initialize log: 1134 * initialize log:
1140 */ 1135 */
1141 if ((rc = lmLogInit(log))) 1136 if ((rc = lmLogInit(log)))
1142 goto unclaim; 1137 goto close;
1143 1138
1144 list_add(&log->journal_list, &jfs_external_logs); 1139 list_add(&log->journal_list, &jfs_external_logs);
1145 1140
@@ -1165,11 +1160,8 @@ journal_found:
1165 list_del(&log->journal_list); 1160 list_del(&log->journal_list);
1166 lbmLogShutdown(log); 1161 lbmLogShutdown(log);
1167 1162
1168 unclaim:
1169 bd_release(bdev);
1170
1171 close: /* close external log device */ 1163 close: /* close external log device */
1172 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1164 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1173 1165
1174 free: /* free log descriptor */ 1166 free: /* free log descriptor */
1175 mutex_unlock(&jfs_log_mutex); 1167 mutex_unlock(&jfs_log_mutex);
@@ -1514,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
1514 bdev = log->bdev; 1506 bdev = log->bdev;
1515 rc = lmLogShutdown(log); 1507 rc = lmLogShutdown(log);
1516 1508
1517 bd_release(bdev); 1509 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1518 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1519 1510
1520 kfree(log); 1511 kfree(log);
1521 1512
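
blkdev_get_by_dev() with FMODE_EXCL folds the old open_by_devnum() + bd_claim() pair into a single call: the last argument is the holder cookie identifying the exclusive claim, and the matching blkdev_put() must pass FMODE_EXCL to drop it, which is why the unclaim: label disappears above. Sketched (open_log_dev()/close_log_dev() are illustrative names):

    #include <linux/fs.h>
    #include <linux/blkdev.h>

    static struct block_device *open_log_dev(dev_t devt, void *holder)
    {
            /* Before: bdev = open_by_devnum(devt, mode); bd_claim(bdev, holder); */
            return blkdev_get_by_dev(devt,
                                     FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                                     holder); /* IS_ERR() on failure */
    }

    static void close_log_dev(struct block_device *bdev)
    {
            /* FMODE_EXCL here also releases the claim
             * (was: bd_release() followed by blkdev_put()) */
            blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
    }
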
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45a..9895595fd2f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
97 97
98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); 98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
99 if (ipaimap == NULL) { 99 if (ipaimap == NULL) {
100 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 100 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
101 rc = -EIO; 101 rc = -EIO;
102 goto errout20; 102 goto errout20;
103 } 103 }
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (!ipaimap2) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
154 } 154 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1279 * lazy commit thread finishes processing 1279 * lazy commit thread finishes processing
1280 */ 1280 */
1281 if (tblk->xflag & COMMIT_DELETE) { 1281 if (tblk->xflag & COMMIT_DELETE) {
1282 atomic_inc(&tblk->u.ip->i_count); 1282 ihold(tblk->u.ip);
1283 /* 1283 /*
1284 * Avoid a rare deadlock 1284 * Avoid a rare deadlock
1285 * 1285 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..81ead850ddb6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/quotaops.h> 23#include <linux/quotaops.h>
23#include <linux/exportfs.h> 24#include <linux/exportfs.h>
@@ -839,7 +840,7 @@ static int jfs_link(struct dentry *old_dentry,
839 ip->i_ctime = CURRENT_TIME; 840 ip->i_ctime = CURRENT_TIME;
840 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 841 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
841 mark_inode_dirty(dir); 842 mark_inode_dirty(dir);
842 atomic_inc(&ip->i_count); 843 ihold(ip);
843 844
844 iplist[0] = ip; 845 iplist[0] = ip;
845 iplist[1] = dir; 846 iplist[1] = dir;
@@ -1464,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1464 1465
1465 jfs_info("jfs_lookup: name = %s", name); 1466 jfs_info("jfs_lookup: name = %s", name);
1466 1467
1467 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1468 dentry->d_op = &jfs_ci_dentry_operations;
1469
1470 if ((name[0] == '.') && (len == 1)) 1468 if ((name[0] == '.') && (len == 1))
1471 inum = dip->i_ino; 1469 inum = dip->i_ino;
1472 else if (strcmp(name, "..") == 0) 1470 else if (strcmp(name, "..") == 0)
@@ -1491,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1491 return ERR_CAST(ip); 1489 return ERR_CAST(ip);
1492 } 1490 }
1493 1491
1494 dentry = d_splice_alias(ip, dentry); 1492 return d_splice_alias(ip, dentry);
1495
1496 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1497 dentry->d_op = &jfs_ci_dentry_operations;
1498
1499 return dentry;
1500} 1493}
1501 1494
1502static struct inode *jfs_nfs_get_inode(struct super_block *sb, 1495static struct inode *jfs_nfs_get_inode(struct super_block *sb,
@@ -1573,7 +1566,8 @@ const struct file_operations jfs_dir_operations = {
1573 .llseek = generic_file_llseek, 1566 .llseek = generic_file_llseek,
1574}; 1567};
1575 1568
1576static int jfs_ci_hash(struct dentry *dir, struct qstr *this) 1569static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
1570 struct qstr *this)
1577{ 1571{
1578 unsigned long hash; 1572 unsigned long hash;
1579 int i; 1573 int i;
@@ -1586,32 +1580,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
1586 return 0; 1580 return 0;
1587} 1581}
1588 1582
1589static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) 1583static int jfs_ci_compare(const struct dentry *parent,
1584 const struct inode *pinode,
1585 const struct dentry *dentry, const struct inode *inode,
1586 unsigned int len, const char *str, const struct qstr *name)
1590{ 1587{
1591 int i, result = 1; 1588 int i, result = 1;
1592 1589
1593 if (a->len != b->len) 1590 if (len != name->len)
1594 goto out; 1591 goto out;
1595 for (i=0; i < a->len; i++) { 1592 for (i=0; i < len; i++) {
1596 if (tolower(a->name[i]) != tolower(b->name[i])) 1593 if (tolower(str[i]) != tolower(name->name[i]))
1597 goto out; 1594 goto out;
1598 } 1595 }
1599 result = 0; 1596 result = 0;
1597out:
1598 return result;
1599}
1600 1600
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{
1603 if (nd->flags & LOOKUP_RCU)
1604 return -ECHILD;
1601 /* 1605 /*
1602 * We want creates to preserve case. A negative dentry, a, that 1606 * This is not negative dentry. Always valid.
1603 * has a different case than b may cause a new entry to be created 1607 *
1604 * with the wrong case. Since we can't tell if a comes from a negative 1608 * Note, rename() to existing directory entry will have ->d_inode,
1605 * dentry, we blindly replace it with b. This should be harmless if 1609 * and will use existing name which isn't specified name by user.
1606 * a is not a negative dentry. 1610 *
1611 * We may be able to drop this positive dentry here. But dropping
1612 * positive dentry isn't good idea. So it's unsupported like
1613 * rename("filename", "FILENAME") for now.
1607 */ 1614 */
1608 memcpy((unsigned char *)a->name, b->name, a->len); 1615 if (dentry->d_inode)
1609out: 1616 return 1;
1610 return result; 1617
1618 /*
1619 * This may be nfsd (or something), anyway, we can't see the
1620 * intent of this. So, since this can be for creation, drop it.
1621 */
1622 if (!nd)
1623 return 0;
1624
1625 /*
1626 * Drop the negative dentry, in order to make sure to use the
1627 * case sensitive name which is specified by user if this is
1628 * for creation.
1629 */
1630 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
1631 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
1632 return 0;
1633 }
1634 return 1;
1611} 1635}
1612 1636
1613const struct dentry_operations jfs_ci_dentry_operations = 1637const struct dentry_operations jfs_ci_dentry_operations =
1614{ 1638{
1615 .d_hash = jfs_ci_hash, 1639 .d_hash = jfs_ci_hash,
1616 .d_compare = jfs_ci_compare, 1640 .d_compare = jfs_ci_compare,
1641 .d_revalidate = jfs_ci_revalidate,
1617}; 1642};
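
[editor's note] The jfs_ci_* rework above tracks two VFS changes at once: ->d_hash and ->d_compare now take const arguments because they may run during lockless rcu-walk path lookup, and a new ->d_revalidate replaces the old case-preserving trick of memcpy()ing the requested name over the dentry's name in d_compare. A minimal sketch of the rcu-walk guard that any such d_revalidate needs; foo_d_revalidate and the slow-path comment are illustrative, not from the patch:

	static int foo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
	{
		/* rcu-walk holds no references and may not sleep;
		 * -ECHILD tells the VFS to retry in ref-walk mode */
		if (nd && (nd->flags & LOOKUP_RCU))
			return -ECHILD;
		/* ... ordinary validation under normal refcounting ... */
		return 1;
	}
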
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ec8c3e4baca3..eeca48a031ab 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <asm/uaccess.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/smp_lock.h>
37 36
38#include "jfs_incore.h" 37#include "jfs_incore.h"
39#include "jfs_filsys.h" 38#include "jfs_filsys.h"
@@ -116,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
116 return &jfs_inode->vfs_inode; 115 return &jfs_inode->vfs_inode;
117} 116}
118 117
118static void jfs_i_callback(struct rcu_head *head)
119{
120 struct inode *inode = container_of(head, struct inode, i_rcu);
121 struct jfs_inode_info *ji = JFS_IP(inode);
122 INIT_LIST_HEAD(&inode->i_dentry);
123 kmem_cache_free(jfs_inode_cachep, ji);
124}
125
119static void jfs_destroy_inode(struct inode *inode) 126static void jfs_destroy_inode(struct inode *inode)
120{ 127{
121 struct jfs_inode_info *ji = JFS_IP(inode); 128 struct jfs_inode_info *ji = JFS_IP(inode);
@@ -129,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
129 ji->active_ag = -1; 136 ji->active_ag = -1;
130 } 137 }
131 spin_unlock_irq(&ji->ag_lock); 138 spin_unlock_irq(&ji->ag_lock);
132 kmem_cache_free(jfs_inode_cachep, ji); 139 call_rcu(&inode->i_rcu, jfs_i_callback);
133} 140}
134 141
135static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 142static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -176,8 +183,6 @@ static void jfs_put_super(struct super_block *sb)
176 183
177 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 184 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
178 185
179 lock_kernel();
180
181 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
182 if (rc) 187 if (rc)
183 jfs_err("jfs_umount failed with return code %d", rc); 188 jfs_err("jfs_umount failed with return code %d", rc);
@@ -188,8 +193,6 @@ static void jfs_put_super(struct super_block *sb)
188 iput(sbi->direct_inode); 193 iput(sbi->direct_inode);
189 194
190 kfree(sbi); 195 kfree(sbi);
191
192 unlock_kernel();
193} 196}
194 197
195enum { 198enum {
@@ -369,19 +372,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
369 if (!parse_options(data, sb, &newLVSize, &flag)) { 372 if (!parse_options(data, sb, &newLVSize, &flag)) {
370 return -EINVAL; 373 return -EINVAL;
371 } 374 }
372 lock_kernel(); 375
373 if (newLVSize) { 376 if (newLVSize) {
374 if (sb->s_flags & MS_RDONLY) { 377 if (sb->s_flags & MS_RDONLY) {
375 printk(KERN_ERR 378 printk(KERN_ERR
376 "JFS: resize requires volume to be mounted read-write\n"); 379 "JFS: resize requires volume to be mounted read-write\n");
377 unlock_kernel();
378 return -EROFS; 380 return -EROFS;
379 } 381 }
380 rc = jfs_extendfs(sb, newLVSize, 0); 382 rc = jfs_extendfs(sb, newLVSize, 0);
381 if (rc) { 383 if (rc)
382 unlock_kernel();
383 return rc; 384 return rc;
384 }
385 } 385 }
386 386
387 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 387 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -397,36 +397,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
397 /* mark the fs r/w for quota activity */ 397 /* mark the fs r/w for quota activity */
398 sb->s_flags &= ~MS_RDONLY; 398 sb->s_flags &= ~MS_RDONLY;
399 399
400 unlock_kernel();
401 dquot_resume(sb, -1); 400 dquot_resume(sb, -1);
402 return ret; 401 return ret;
403 } 402 }
404 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 403 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
405 rc = dquot_suspend(sb, -1); 404 rc = dquot_suspend(sb, -1);
406 if (rc < 0) { 405 if (rc < 0) {
407 unlock_kernel();
408 return rc; 406 return rc;
409 } 407 }
410 rc = jfs_umount_rw(sb); 408 rc = jfs_umount_rw(sb);
411 JFS_SBI(sb)->flag = flag; 409 JFS_SBI(sb)->flag = flag;
412 unlock_kernel();
413 return rc; 410 return rc;
414 } 411 }
415 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) 412 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
416 if (!(sb->s_flags & MS_RDONLY)) { 413 if (!(sb->s_flags & MS_RDONLY)) {
417 rc = jfs_umount_rw(sb); 414 rc = jfs_umount_rw(sb);
418 if (rc) { 415 if (rc)
419 unlock_kernel();
420 return rc; 416 return rc;
421 } 417
422 JFS_SBI(sb)->flag = flag; 418 JFS_SBI(sb)->flag = flag;
423 ret = jfs_mount_rw(sb, 1); 419 ret = jfs_mount_rw(sb, 1);
424 unlock_kernel();
425 return ret; 420 return ret;
426 } 421 }
427 JFS_SBI(sb)->flag = flag; 422 JFS_SBI(sb)->flag = flag;
428 423
429 unlock_kernel();
430 return 0; 424 return 0;
431} 425}
432 426
@@ -446,6 +440,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
446 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); 440 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
447 if (!sbi) 441 if (!sbi)
448 return -ENOMEM; 442 return -ENOMEM;
443
449 sb->s_fs_info = sbi; 444 sb->s_fs_info = sbi;
450 sbi->sb = sb; 445 sbi->sb = sb;
451 sbi->uid = sbi->gid = sbi->umask = -1; 446 sbi->uid = sbi->gid = sbi->umask = -1;
@@ -520,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
520 515
521 sb->s_magic = JFS_SUPER_MAGIC; 516 sb->s_magic = JFS_SUPER_MAGIC;
522 517
518 if (sbi->mntflag & JFS_OS2)
519 sb->s_d_op = &jfs_ci_dentry_operations;
520
523 inode = jfs_iget(sb, ROOT_I); 521 inode = jfs_iget(sb, ROOT_I);
524 if (IS_ERR(inode)) { 522 if (IS_ERR(inode)) {
525 ret = PTR_ERR(inode); 523 ret = PTR_ERR(inode);
@@ -529,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
529 if (!sb->s_root) 527 if (!sb->s_root)
530 goto out_no_root; 528 goto out_no_root;
531 529
532 if (sbi->mntflag & JFS_OS2)
533 sb->s_root->d_op = &jfs_ci_dentry_operations;
534
535 /* logical blocks are represented by 40 bits in pxd_t, etc. */ 530 /* logical blocks are represented by 40 bits in pxd_t, etc. */
536 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; 531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
537#if BITS_PER_LONG == 32 532#if BITS_PER_LONG == 32
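
[editor's note] Moving the case-insensitive dentry_operations out of jfs_lookup() and off the root dentry onto sb->s_d_op means d_alloc() now tags every dentry on the superblock automatically, instead of each lookup path doing it by hand:

	/* before: every lookup had to remember to set it */
	dentry->d_op = &jfs_ci_dentry_operations;

	/* after: set once in jfs_fill_super(); d_alloc() copies
	 * sb->s_d_op into each newly allocated dentry */
	sb->s_d_op = &jfs_ci_dentry_operations;
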
@@ -596,11 +591,10 @@ static int jfs_unfreeze(struct super_block *sb)
596 return 0; 591 return 0;
597} 592}
598 593
599static int jfs_get_sb(struct file_system_type *fs_type, 594static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
600 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 595 int flags, const char *dev_name, void *data)
601{ 596{
602 return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, 597 return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
603 mnt);
604} 598}
605 599
606static int jfs_sync_fs(struct super_block *sb, int wait) 600static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -783,7 +777,7 @@ static const struct export_operations jfs_export_operations = {
783static struct file_system_type jfs_fs_type = { 777static struct file_system_type jfs_fs_type = {
784 .owner = THIS_MODULE, 778 .owner = THIS_MODULE,
785 .name = "jfs", 779 .name = "jfs",
786 .get_sb = jfs_get_sb, 780 .mount = jfs_do_mount,
787 .kill_sb = kill_block_super, 781 .kill_sb = kill_block_super,
788 .fs_flags = FS_REQUIRES_DEV, 782 .fs_flags = FS_REQUIRES_DEV,
789}; 783};
diff --git a/fs/libfs.c b/fs/libfs.c
index 0a9da95317f7..c88eab55aec9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
16 16
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19static inline int simple_positive(struct dentry *dentry)
20{
21 return dentry->d_inode && !d_unhashed(dentry);
22}
23
19int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, 24int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
20 struct kstat *stat) 25 struct kstat *stat)
21{ 26{
@@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
37 * Retaining negative dentries for an in-memory filesystem just wastes 42 * Retaining negative dentries for an in-memory filesystem just wastes
38 * memory and lookup time: arrange for them to be deleted immediately. 43 * memory and lookup time: arrange for them to be deleted immediately.
39 */ 44 */
40static int simple_delete_dentry(struct dentry *dentry) 45static int simple_delete_dentry(const struct dentry *dentry)
41{ 46{
42 return 1; 47 return 1;
43} 48}
@@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
54 59
55 if (dentry->d_name.len > NAME_MAX) 60 if (dentry->d_name.len > NAME_MAX)
56 return ERR_PTR(-ENAMETOOLONG); 61 return ERR_PTR(-ENAMETOOLONG);
57 dentry->d_op = &simple_dentry_operations; 62 d_set_d_op(dentry, &simple_dentry_operations);
58 d_add(dentry, NULL); 63 d_add(dentry, NULL);
59 return NULL; 64 return NULL;
60} 65}
@@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
76 81
77loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 82loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
78{ 83{
79 mutex_lock(&file->f_path.dentry->d_inode->i_mutex); 84 struct dentry *dentry = file->f_path.dentry;
85 mutex_lock(&dentry->d_inode->i_mutex);
80 switch (origin) { 86 switch (origin) {
81 case 1: 87 case 1:
82 offset += file->f_pos; 88 offset += file->f_pos;
@@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
84 if (offset >= 0) 90 if (offset >= 0)
85 break; 91 break;
86 default: 92 default:
87 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 93 mutex_unlock(&dentry->d_inode->i_mutex);
88 return -EINVAL; 94 return -EINVAL;
89 } 95 }
90 if (offset != file->f_pos) { 96 if (offset != file->f_pos) {
@@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
94 struct dentry *cursor = file->private_data; 100 struct dentry *cursor = file->private_data;
95 loff_t n = file->f_pos - 2; 101 loff_t n = file->f_pos - 2;
96 102
97 spin_lock(&dcache_lock); 103 spin_lock(&dentry->d_lock);
104 /* d_lock not required for cursor */
98 list_del(&cursor->d_u.d_child); 105 list_del(&cursor->d_u.d_child);
99 p = file->f_path.dentry->d_subdirs.next; 106 p = dentry->d_subdirs.next;
100 while (n && p != &file->f_path.dentry->d_subdirs) { 107 while (n && p != &dentry->d_subdirs) {
101 struct dentry *next; 108 struct dentry *next;
102 next = list_entry(p, struct dentry, d_u.d_child); 109 next = list_entry(p, struct dentry, d_u.d_child);
103 if (!d_unhashed(next) && next->d_inode) 110 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
111 if (simple_positive(next))
104 n--; 112 n--;
113 spin_unlock(&next->d_lock);
105 p = p->next; 114 p = p->next;
106 } 115 }
107 list_add_tail(&cursor->d_u.d_child, p); 116 list_add_tail(&cursor->d_u.d_child, p);
108 spin_unlock(&dcache_lock); 117 spin_unlock(&dentry->d_lock);
109 } 118 }
110 } 119 }
111 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 120 mutex_unlock(&dentry->d_inode->i_mutex);
112 return offset; 121 return offset;
113} 122}
114 123
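
[editor's note] With dcache_lock gone, a directory's child list is walked under the parent's d_lock, and each child's d_lock is taken with the DENTRY_D_LOCK_NESTED subclass so lockdep accepts the parent-then-child ordering. The idiom used throughout the new dcache_dir_lseek(), dcache_readdir() and simple_empty(), as a standalone sketch:

	spin_lock(&parent->d_lock);
	list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
		/* child's name and inode are stable here */
		spin_unlock(&child->d_lock);
	}
	spin_unlock(&parent->d_lock);
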
@@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
148 i++; 157 i++;
149 /* fallthrough */ 158 /* fallthrough */
150 default: 159 default:
151 spin_lock(&dcache_lock); 160 spin_lock(&dentry->d_lock);
152 if (filp->f_pos == 2) 161 if (filp->f_pos == 2)
153 list_move(q, &dentry->d_subdirs); 162 list_move(q, &dentry->d_subdirs);
154 163
155 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 164 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
156 struct dentry *next; 165 struct dentry *next;
157 next = list_entry(p, struct dentry, d_u.d_child); 166 next = list_entry(p, struct dentry, d_u.d_child);
158 if (d_unhashed(next) || !next->d_inode) 167 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 if (!simple_positive(next)) {
169 spin_unlock(&next->d_lock);
159 continue; 170 continue;
171 }
160 172
161 spin_unlock(&dcache_lock); 173 spin_unlock(&next->d_lock);
174 spin_unlock(&dentry->d_lock);
162 if (filldir(dirent, next->d_name.name, 175 if (filldir(dirent, next->d_name.name,
163 next->d_name.len, filp->f_pos, 176 next->d_name.len, filp->f_pos,
164 next->d_inode->i_ino, 177 next->d_inode->i_ino,
165 dt_type(next->d_inode)) < 0) 178 dt_type(next->d_inode)) < 0)
166 return 0; 179 return 0;
167 spin_lock(&dcache_lock); 180 spin_lock(&dentry->d_lock);
181 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 /* next is still alive */ 182 /* next is still alive */
169 list_move(q, p); 183 list_move(q, p);
184 spin_unlock(&next->d_lock);
170 p = q; 185 p = q;
171 filp->f_pos++; 186 filp->f_pos++;
172 } 187 }
173 spin_unlock(&dcache_lock); 188 spin_unlock(&dentry->d_lock);
174 } 189 }
175 return 0; 190 return 0;
176} 191}
@@ -201,9 +216,9 @@ static const struct super_operations simple_super_operations = {
201 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 216 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
202 * will never be mountable) 217 * will never be mountable)
203 */ 218 */
204int get_sb_pseudo(struct file_system_type *fs_type, char *name, 219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
205 const struct super_operations *ops, unsigned long magic, 220 const struct super_operations *ops,
206 struct vfsmount *mnt) 221 const struct dentry_operations *dops, unsigned long magic)
207{ 222{
208 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 223 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
209 struct dentry *dentry; 224 struct dentry *dentry;
@@ -211,7 +226,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
211 struct qstr d_name = {.name = name, .len = strlen(name)}; 226 struct qstr d_name = {.name = name, .len = strlen(name)};
212 227
213 if (IS_ERR(s)) 228 if (IS_ERR(s))
214 return PTR_ERR(s); 229 return ERR_CAST(s);
215 230
216 s->s_flags = MS_NOUSER; 231 s->s_flags = MS_NOUSER;
217 s->s_maxbytes = MAX_LFS_FILESIZE; 232 s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -240,13 +255,13 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
240 dentry->d_parent = dentry; 255 dentry->d_parent = dentry;
241 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
242 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops;
243 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
244 simple_set_mnt(mnt, s); 260 return dget(s->s_root);
245 return 0;
246 261
247Enomem: 262Enomem:
248 deactivate_locked_super(s); 263 deactivate_locked_super(s);
249 return -ENOMEM; 264 return ERR_PTR(-ENOMEM);
250} 265}
251 266
252int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 267int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
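
[editor's note] Callers of the old get_sb_pseudo() convert the same way as block filesystems: return the dentry from mount_pseudo() instead of filling a vfsmount. A sketch of a converted pseudo filesystem's ->mount; the foofs names and FOOFS_MAGIC are hypothetical. Note the new dops argument, which lets pseudo filesystems install superblock-wide dentry_operations via s_d_op:

	static struct dentry *foofs_mount(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data)
	{
		return mount_pseudo(fs_type, "foofs:", &foofs_super_ops,
				    &foofs_dentry_ops, FOOFS_MAGIC);
	}
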
@@ -255,29 +270,29 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
255 270
256 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 271 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
257 inc_nlink(inode); 272 inc_nlink(inode);
258 atomic_inc(&inode->i_count); 273 ihold(inode);
259 dget(dentry); 274 dget(dentry);
260 d_instantiate(dentry, inode); 275 d_instantiate(dentry, inode);
261 return 0; 276 return 0;
262} 277}
263 278
264static inline int simple_positive(struct dentry *dentry)
265{
266 return dentry->d_inode && !d_unhashed(dentry);
267}
268
269int simple_empty(struct dentry *dentry) 279int simple_empty(struct dentry *dentry)
270{ 280{
271 struct dentry *child; 281 struct dentry *child;
272 int ret = 0; 282 int ret = 0;
273 283
274 spin_lock(&dcache_lock); 284 spin_lock(&dentry->d_lock);
275 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) 285 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
276 if (simple_positive(child)) 286 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
287 if (simple_positive(child)) {
288 spin_unlock(&child->d_lock);
277 goto out; 289 goto out;
290 }
291 spin_unlock(&child->d_lock);
292 }
278 ret = 1; 293 ret = 1;
279out: 294out:
280 spin_unlock(&dcache_lock); 295 spin_unlock(&dentry->d_lock);
281 return ret; 296 return ret;
282} 297}
283 298
@@ -892,10 +907,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
892 */ 907 */
893int generic_file_fsync(struct file *file, int datasync) 908int generic_file_fsync(struct file *file, int datasync)
894{ 909{
895 struct writeback_control wbc = {
896 .sync_mode = WB_SYNC_ALL,
897 .nr_to_write = 0, /* metadata-only; caller takes care of data */
898 };
899 struct inode *inode = file->f_mapping->host; 910 struct inode *inode = file->f_mapping->host;
900 int err; 911 int err;
901 int ret; 912 int ret;
@@ -906,13 +917,42 @@ int generic_file_fsync(struct file *file, int datasync)
906 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 917 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
907 return ret; 918 return ret;
908 919
909 err = sync_inode(inode, &wbc); 920 err = sync_inode_metadata(inode, 1);
910 if (ret == 0) 921 if (ret == 0)
911 ret = err; 922 ret = err;
912 return ret; 923 return ret;
913} 924}
914EXPORT_SYMBOL(generic_file_fsync); 925EXPORT_SYMBOL(generic_file_fsync);
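
[editor's note] sync_inode_metadata() wraps the writeback_control boilerplate that generic_file_fsync() used to build by hand; roughly, its fs/fs-writeback.c form is:

	int sync_inode_metadata(struct inode *inode, int wait)
	{
		struct writeback_control wbc = {
			.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
			.nr_to_write = 0, /* metadata-only */
		};

		return sync_inode(inode, &wbc);
	}
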
915 926
927/**
928 * generic_check_addressable - Check addressability of file system
929 * @blocksize_bits: log of file system block size
930 * @num_blocks: number of blocks in file system
931 *
932 * Determine whether a file system with @num_blocks blocks (and a
933 * block size of 2**@blocksize_bits) is addressable by the sector_t
934 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
935 */
936int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
937{
938 u64 last_fs_block = num_blocks - 1;
939 u64 last_fs_page =
940 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
941
942 if (unlikely(num_blocks == 0))
943 return 0;
944
945 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
946 return -EINVAL;
947
948 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
949 (last_fs_page > (pgoff_t)(~0ULL))) {
950 return -EFBIG;
951 }
952 return 0;
953}
954EXPORT_SYMBOL(generic_check_addressable);
955
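
[editor's note] generic_check_addressable() consolidates an overflow check that disk filesystems such as ext3/ext4 previously carried privately: a filesystem whose last block does not fit in sector_t, or whose last page index does not fit in pgoff_t, cannot be addressed safely on the running kernel. A hedged example of a fill_super caller; the es/s_blocks_count names are illustrative:

	err = generic_check_addressable(sb->s_blocksize_bits,
					le64_to_cpu(es->s_blocks_count));
	if (err) {
		printk(KERN_ERR "filesystem too large to mount safely\n");
		goto failed_mount;
	}
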
916/* 956/*
917 * No-op implementation of ->fsync for in-memory filesystems. 957 * No-op implementation of ->fsync for in-memory filesystems.
918 */ 958 */
@@ -926,7 +966,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
926EXPORT_SYMBOL(dcache_dir_open); 966EXPORT_SYMBOL(dcache_dir_open);
927EXPORT_SYMBOL(dcache_readdir); 967EXPORT_SYMBOL(dcache_readdir);
928EXPORT_SYMBOL(generic_read_dir); 968EXPORT_SYMBOL(generic_read_dir);
929EXPORT_SYMBOL(get_sb_pseudo); 969EXPORT_SYMBOL(mount_pseudo);
930EXPORT_SYMBOL(simple_write_begin); 970EXPORT_SYMBOL(simple_write_begin);
931EXPORT_SYMBOL(simple_write_end); 971EXPORT_SYMBOL(simple_write_end);
932EXPORT_SYMBOL(simple_dir_inode_operations); 972EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab339..ca58d64374ca 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 000000000000..f848b52c67b1
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
1/*
2 * linux/fs/lockd/clnt4xdr.c
3 *
4 * XDR functions to encode/decode NLM version 4 RPC arguments and results.
5 *
6 * NLM client-side only.
7 *
8 * Copyright (C) 2010, Oracle. All rights reserved.
9 */
10
11#include <linux/types.h>
12#include <linux/sunrpc/xdr.h>
13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/stats.h>
15#include <linux/lockd/lockd.h>
16
17#define NLMDBG_FACILITY NLMDBG_XDR
18
19#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
20# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
21#endif
22
23#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
24# error "NLM host name cannot be larger than NLM's maximum string length!"
25#endif
26
27/*
28 * Declare the space requirements for NLM arguments and replies as
29 * number of 32bit-words
30 */
31#define NLM4_void_sz (0)
32#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
33#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2))
34#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2))
35#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2))
36#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
37#define NLM4_holder_sz (6+NLM4_owner_sz)
38
39#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz)
40#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz)
41#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz)
42#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz)
43
44#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz)
45#define NLM4_res_sz (NLM4_cookie_sz+1)
46#define NLM4_norep_sz (0)
47
48
49static s64 loff_t_to_s64(loff_t offset)
50{
51 s64 res;
52
53 if (offset >= NLM4_OFFSET_MAX)
54 res = NLM4_OFFSET_MAX;
55 else if (offset <= -NLM4_OFFSET_MAX)
56 res = -NLM4_OFFSET_MAX;
57 else
58 res = offset;
59 return res;
60}
61
62static void nlm4_compute_offsets(const struct nlm_lock *lock,
63 u64 *l_offset, u64 *l_len)
64{
65 const struct file_lock *fl = &lock->fl;
66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0;
74 else
75 *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
76}
77
78/*
79 * Handle decode buffer overflows out-of-line.
80 */
81static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
82{
83 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
84 "Remaining buffer length is %tu words.\n",
85 func, xdr->end - xdr->p);
86}
87
88
89/*
90 * Encode/decode NLMv4 basic data types
91 *
92 * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
93 * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
94 * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
95 *
96 * Not all basic data types have their own encoding and decoding
97 * functions. For run-time efficiency, some data types are encoded
98 * or decoded inline.
99 */
100
101static void encode_bool(struct xdr_stream *xdr, const int value)
102{
103 __be32 *p;
104
105 p = xdr_reserve_space(xdr, 4);
106 *p = value ? xdr_one : xdr_zero;
107}
108
109static void encode_int32(struct xdr_stream *xdr, const s32 value)
110{
111 __be32 *p;
112
113 p = xdr_reserve_space(xdr, 4);
114 *p = cpu_to_be32(value);
115}
116
117/*
118 * typedef opaque netobj<MAXNETOBJ_SZ>
119 */
120static void encode_netobj(struct xdr_stream *xdr,
121 const u8 *data, const unsigned int length)
122{
123 __be32 *p;
124
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length);
128}
129
130static int decode_netobj(struct xdr_stream *xdr,
131 struct xdr_netobj *obj)
132{
133 u32 length;
134 __be32 *p;
135
136 p = xdr_inline_decode(xdr, 4);
137 if (unlikely(p == NULL))
138 goto out_overflow;
139 length = be32_to_cpup(p++);
140 if (unlikely(length > XDR_MAX_NETOBJ))
141 goto out_size;
142 obj->len = length;
143 obj->data = (u8 *)p;
144 return 0;
145out_size:
146 dprintk("NFS: returned netobj was too long: %u\n", length);
147 return -EIO;
148out_overflow:
149 print_overflow_msg(__func__, xdr);
150 return -EIO;
151}
152
153/*
154 * netobj cookie;
155 */
156static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie)
158{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161}
162
163static int decode_cookie(struct xdr_stream *xdr,
164 struct nlm_cookie *cookie)
165{
166 u32 length;
167 __be32 *p;
168
169 p = xdr_inline_decode(xdr, 4);
170 if (unlikely(p == NULL))
171 goto out_overflow;
172 length = be32_to_cpup(p++);
173 /* apparently HPUX can return empty cookies */
174 if (length == 0)
175 goto out_hpux;
176 if (length > NLM_MAXCOOKIELEN)
177 goto out_size;
178 p = xdr_inline_decode(xdr, length);
179 if (unlikely(p == NULL))
180 goto out_overflow;
181 cookie->len = length;
182 memcpy(cookie->data, p, length);
183 return 0;
184out_hpux:
185 cookie->len = 4;
186 memset(cookie->data, 0, 4);
187 return 0;
188out_size:
189 dprintk("NFS: returned cookie was too long: %u\n", length);
190 return -EIO;
191out_overflow:
192 print_overflow_msg(__func__, xdr);
193 return -EIO;
194}
195
196/*
197 * netobj fh;
198 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203}
204
205/*
206 * enum nlm4_stats {
207 * NLM4_GRANTED = 0,
208 * NLM4_DENIED = 1,
209 * NLM4_DENIED_NOLOCKS = 2,
210 * NLM4_BLOCKED = 3,
211 * NLM4_DENIED_GRACE_PERIOD = 4,
212 * NLM4_DEADLCK = 5,
213 * NLM4_ROFS = 6,
214 * NLM4_STALE_FH = 7,
215 * NLM4_FBIG = 8,
216 * NLM4_FAILED = 9
217 * };
218 *
219 * struct nlm4_stat {
220 * nlm4_stats stat;
221 * };
222 *
223 * NB: we don't swap bytes for the NLM status values. The upper
224 * layers deal directly with the status value in network byte
225 * order.
226 */
227static void encode_nlm4_stat(struct xdr_stream *xdr,
228 const __be32 stat)
229{
230 __be32 *p;
231
232 BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
233 p = xdr_reserve_space(xdr, 4);
234 *p = stat;
235}
236
237static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
238{
239 __be32 *p;
240
241 p = xdr_inline_decode(xdr, 4);
242 if (unlikely(p == NULL))
243 goto out_overflow;
244 if (unlikely(*p > nlm4_failed))
245 goto out_bad_xdr;
246 *stat = *p;
247 return 0;
248out_bad_xdr:
249 dprintk("%s: server returned invalid nlm4_stats value: %u\n",
250 __func__, be32_to_cpup(p));
251 return -EIO;
252out_overflow:
253 print_overflow_msg(__func__, xdr);
254 return -EIO;
255}
256
257/*
258 * struct nlm4_holder {
259 * bool exclusive;
260 * int32 svid;
261 * netobj oh;
262 * uint64 l_offset;
263 * uint64 l_len;
264 * };
265 */
266static void encode_nlm4_holder(struct xdr_stream *xdr,
267 const struct nlm_res *result)
268{
269 const struct nlm_lock *lock = &result->lock;
270 u64 l_offset, l_len;
271 __be32 *p;
272
273 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
274 encode_int32(xdr, lock->svid);
275 encode_netobj(xdr, lock->oh.data, lock->oh.len);
276
277 p = xdr_reserve_space(xdr, 4 + 4);
278 nlm4_compute_offsets(lock, &l_offset, &l_len);
279 p = xdr_encode_hyper(p, l_offset);
280 xdr_encode_hyper(p, l_len);
281}
282
283static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
284{
285 struct nlm_lock *lock = &result->lock;
286 struct file_lock *fl = &lock->fl;
287 u64 l_offset, l_len;
288 u32 exclusive;
289 int error;
290 __be32 *p;
291 s32 end;
292
293 memset(lock, 0, sizeof(*lock));
294 locks_init_lock(fl);
295
296 p = xdr_inline_decode(xdr, 4 + 4);
297 if (unlikely(p == NULL))
298 goto out_overflow;
299 exclusive = be32_to_cpup(p++);
300 lock->svid = be32_to_cpup(p);
301 fl->fl_pid = (pid_t)lock->svid;
302
303 error = decode_netobj(xdr, &lock->oh);
304 if (unlikely(error))
305 goto out;
306
307 p = xdr_inline_decode(xdr, 8 + 8);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310
311 fl->fl_flags = FL_POSIX;
312 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
313 p = xdr_decode_hyper(p, &l_offset);
314 xdr_decode_hyper(p, &l_len);
315 end = l_offset + l_len - 1;
316
317 fl->fl_start = (loff_t)l_offset;
318 if (l_len == 0 || end < 0)
319 fl->fl_end = OFFSET_MAX;
320 else
321 fl->fl_end = (loff_t)end;
322 error = 0;
323out:
324 return error;
325out_overflow:
326 print_overflow_msg(__func__, xdr);
327 return -EIO;
328}
329
330/*
331 * string caller_name<LM_MAXSTRLEN>;
332 */
333static void encode_caller_name(struct xdr_stream *xdr, const char *name)
334{
335 /* NB: client-side does not set lock->len */
336 u32 length = strlen(name);
337 __be32 *p;
338
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length);
342}
343
344/*
345 * struct nlm4_lock {
346 * string caller_name<LM_MAXSTRLEN>;
347 * netobj fh;
348 * netobj oh;
349 * int32 svid;
350 * uint64 l_offset;
351 * uint64 l_len;
352 * };
353 */
354static void encode_nlm4_lock(struct xdr_stream *xdr,
355 const struct nlm_lock *lock)
356{
357 u64 l_offset, l_len;
358 __be32 *p;
359
360 encode_caller_name(xdr, lock->caller);
361 encode_fh(xdr, &lock->fh);
362 encode_netobj(xdr, lock->oh.data, lock->oh.len);
363
364 p = xdr_reserve_space(xdr, 4 + 8 + 8);
365 *p++ = cpu_to_be32(lock->svid);
366
367 nlm4_compute_offsets(lock, &l_offset, &l_len);
368 p = xdr_encode_hyper(p, l_offset);
369 xdr_encode_hyper(p, l_len);
370}
371
372
373/*
374 * NLMv4 XDR encode functions
375 *
376 * NLMv4 argument types are defined in Appendix II of RFC 1813:
377 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
378 * "Protocols for Interworking: XNFS, Version 3W".
379 */
380
381/*
382 * struct nlm4_testargs {
383 * netobj cookie;
384 * bool exclusive;
385 * struct nlm4_lock alock;
386 * };
387 */
388static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
389 struct xdr_stream *xdr,
390 const struct nlm_args *args)
391{
392 const struct nlm_lock *lock = &args->lock;
393
394 encode_cookie(xdr, &args->cookie);
395 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
396 encode_nlm4_lock(xdr, lock);
397}
398
399/*
400 * struct nlm4_lockargs {
401 * netobj cookie;
402 * bool block;
403 * bool exclusive;
404 * struct nlm4_lock alock;
405 * bool reclaim;
406 * int state;
407 * };
408 */
409static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
410 struct xdr_stream *xdr,
411 const struct nlm_args *args)
412{
413 const struct nlm_lock *lock = &args->lock;
414
415 encode_cookie(xdr, &args->cookie);
416 encode_bool(xdr, args->block);
417 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
418 encode_nlm4_lock(xdr, lock);
419 encode_bool(xdr, args->reclaim);
420 encode_int32(xdr, args->state);
421}
422
423/*
424 * struct nlm4_cancargs {
425 * netobj cookie;
426 * bool block;
427 * bool exclusive;
428 * struct nlm4_lock alock;
429 * };
430 */
431static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
432 struct xdr_stream *xdr,
433 const struct nlm_args *args)
434{
435 const struct nlm_lock *lock = &args->lock;
436
437 encode_cookie(xdr, &args->cookie);
438 encode_bool(xdr, args->block);
439 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
440 encode_nlm4_lock(xdr, lock);
441}
442
443/*
444 * struct nlm4_unlockargs {
445 * netobj cookie;
446 * struct nlm4_lock alock;
447 * };
448 */
449static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
450 struct xdr_stream *xdr,
451 const struct nlm_args *args)
452{
453 const struct nlm_lock *lock = &args->lock;
454
455 encode_cookie(xdr, &args->cookie);
456 encode_nlm4_lock(xdr, lock);
457}
458
459/*
460 * struct nlm4_res {
461 * netobj cookie;
462 * nlm4_stat stat;
463 * };
464 */
465static void nlm4_xdr_enc_res(struct rpc_rqst *req,
466 struct xdr_stream *xdr,
467 const struct nlm_res *result)
468{
469 encode_cookie(xdr, &result->cookie);
470 encode_nlm4_stat(xdr, result->status);
471}
472
473/*
474 * union nlm4_testrply switch (nlm4_stats stat) {
475 * case NLM4_DENIED:
476 * struct nlm4_holder holder;
477 * default:
478 * void;
479 * };
480 *
481 * struct nlm4_testres {
482 * netobj cookie;
483 * nlm4_testrply test_stat;
484 * };
485 */
486static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm4_stat(xdr, result->status);
492 if (result->status == nlm_lck_denied)
493 encode_nlm4_holder(xdr, result);
494}
495
496
497/*
498 * NLMv4 XDR decode functions
499 *
500 * NLMv4 argument types are defined in Appendix II of RFC 1813:
501 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
502 * "Protocols for Interworking: XNFS, Version 3W".
503 */
504
505/*
506 * union nlm4_testrply switch (nlm4_stats stat) {
507 * case NLM4_DENIED:
508 * struct nlm4_holder holder;
509 * default:
510 * void;
511 * };
512 *
513 * struct nlm4_testres {
514 * netobj cookie;
515 * nlm4_testrply test_stat;
516 * };
517 */
518static int decode_nlm4_testrply(struct xdr_stream *xdr,
519 struct nlm_res *result)
520{
521 int error;
522
523 error = decode_nlm4_stat(xdr, &result->status);
524 if (unlikely(error))
525 goto out;
526 if (result->status == nlm_lck_denied)
527 error = decode_nlm4_holder(xdr, result);
528out:
529 return error;
530}
531
532static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
533 struct xdr_stream *xdr,
534 struct nlm_res *result)
535{
536 int error;
537
538 error = decode_cookie(xdr, &result->cookie);
539 if (unlikely(error))
540 goto out;
541 error = decode_nlm4_testrply(xdr, result);
542out:
543 return error;
544}
545
546/*
547 * struct nlm4_res {
548 * netobj cookie;
549 * nlm4_stat stat;
550 * };
551 */
552static int nlm4_xdr_dec_res(struct rpc_rqst *req,
553 struct xdr_stream *xdr,
554 struct nlm_res *result)
555{
556 int error;
557
558 error = decode_cookie(xdr, &result->cookie);
559 if (unlikely(error))
560 goto out;
561 error = decode_nlm4_stat(xdr, &result->status);
562out:
563 return error;
564}
565
566
567/*
568 * For NLM, a void procedure really returns nothing
569 */
570#define nlm4_xdr_dec_norep NULL
571
572#define PROC(proc, argtype, restype) \
573[NLMPROC_##proc] = { \
574 .p_proc = NLMPROC_##proc, \
575 .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \
576 .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \
577 .p_arglen = NLM4_##argtype##_sz, \
578 .p_replen = NLM4_##restype##_sz, \
579 .p_statidx = NLMPROC_##proc, \
580 .p_name = #proc, \
581 }
582
583static struct rpc_procinfo nlm4_procedures[] = {
584 PROC(TEST, testargs, testres),
585 PROC(LOCK, lockargs, res),
586 PROC(CANCEL, cancargs, res),
587 PROC(UNLOCK, unlockargs, res),
588 PROC(GRANTED, testargs, res),
589 PROC(TEST_MSG, testargs, norep),
590 PROC(LOCK_MSG, lockargs, norep),
591 PROC(CANCEL_MSG, cancargs, norep),
592 PROC(UNLOCK_MSG, unlockargs, norep),
593 PROC(GRANTED_MSG, testargs, norep),
594 PROC(TEST_RES, testres, norep),
595 PROC(LOCK_RES, res, norep),
596 PROC(CANCEL_RES, res, norep),
597 PROC(UNLOCK_RES, res, norep),
598 PROC(GRANTED_RES, res, norep),
599};
600
601struct rpc_version nlm_version4 = {
602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures,
605};
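
[editor's note] The _sz macros above are buffer budgets in 32-bit XDR words; the RPC client multiplies p_arglen/p_replen by four when sizing send and receive buffers. With NLM_MAXCOOKIELEN at 32, for instance, NLM4_cookie_sz is 1 + (32 >> 2) = 9 words, so an NLM4_res reply (cookie plus status) budgets 10 words, i.e. 40 bytes on the wire. Dispatch through this table looks roughly like the sketch below (req and host are illustrative; in practice lockd reaches the entry through the rpc_clnt's cl_procinfo rather than indexing nlm4_procedures directly):

	struct rpc_message msg = {
		.rpc_proc = &nlm4_procedures[NLMPROC_TEST],
		.rpc_argp = &req->a_args,
		.rpc_resp = &req->a_res,
	};

	status = rpc_call_sync(host->h_rpcclnt, &msg, 0);
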
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..8d4ea8351e3d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 16#include <linux/lockd/lockd.h>
17#include <linux/smp_lock.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19 18
20#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
@@ -42,6 +41,7 @@ struct nlm_wait {
42}; 41};
43 42
44static LIST_HEAD(nlm_blocked); 43static LIST_HEAD(nlm_blocked);
44static DEFINE_SPINLOCK(nlm_blocked_lock);
45 45
46/** 46/**
47 * nlmclnt_init - Set up per-NFS mount point lockd data structures 47 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
79 */ 79 */
80void nlmclnt_done(struct nlm_host *host) 80void nlmclnt_done(struct nlm_host *host)
81{ 81{
82 nlm_release_host(host); 82 nlmclnt_release_host(host);
83 lockd_down(); 83 lockd_down();
84} 84}
85EXPORT_SYMBOL_GPL(nlmclnt_done); 85EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -97,7 +97,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
97 block->b_lock = fl; 97 block->b_lock = fl;
98 init_waitqueue_head(&block->b_wait); 98 init_waitqueue_head(&block->b_wait);
99 block->b_status = nlm_lck_blocked; 99 block->b_status = nlm_lck_blocked;
100
101 spin_lock(&nlm_blocked_lock);
100 list_add(&block->b_list, &nlm_blocked); 102 list_add(&block->b_list, &nlm_blocked);
103 spin_unlock(&nlm_blocked_lock);
101 } 104 }
102 return block; 105 return block;
103} 106}
@@ -106,7 +109,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
106{ 109{
107 if (block == NULL) 110 if (block == NULL)
108 return; 111 return;
112 spin_lock(&nlm_blocked_lock);
109 list_del(&block->b_list); 113 list_del(&block->b_list);
114 spin_unlock(&nlm_blocked_lock);
110 kfree(block); 115 kfree(block);
111} 116}
112 117
@@ -154,6 +159,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
154 * Look up blocked request based on arguments. 159 * Look up blocked request based on arguments.
155 * Warning: must not use cookie to match it! 160 * Warning: must not use cookie to match it!
156 */ 161 */
162 spin_lock(&nlm_blocked_lock);
157 list_for_each_entry(block, &nlm_blocked, b_list) { 163 list_for_each_entry(block, &nlm_blocked, b_list) {
158 struct file_lock *fl_blocked = block->b_lock; 164 struct file_lock *fl_blocked = block->b_lock;
159 165
@@ -178,6 +184,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
178 wake_up(&block->b_wait); 184 wake_up(&block->b_wait);
179 res = nlm_granted; 185 res = nlm_granted;
180 } 186 }
187 spin_unlock(&nlm_blocked_lock);
181 return res; 188 return res;
182} 189}
183 190
@@ -216,10 +223,6 @@ reclaimer(void *ptr)
216 allow_signal(SIGKILL); 223 allow_signal(SIGKILL);
217 224
218 down_write(&host->h_rwsem); 225 down_write(&host->h_rwsem);
219
220 /* This one ensures that our parent doesn't terminate while the
221 * reclaim is in progress */
222 lock_kernel();
223 lockd_up(); /* note: this cannot fail as lockd is already running */ 226 lockd_up(); /* note: this cannot fail as lockd is already running */
224 227
225 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 228 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +263,17 @@ restart:
260 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); 263 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
261 264
262 /* Now, wake up all processes that sleep on a blocked lock */ 265 /* Now, wake up all processes that sleep on a blocked lock */
266 spin_lock(&nlm_blocked_lock);
263 list_for_each_entry(block, &nlm_blocked, b_list) { 267 list_for_each_entry(block, &nlm_blocked, b_list) {
264 if (block->b_host == host) { 268 if (block->b_host == host) {
265 block->b_status = nlm_lck_denied_grace_period; 269 block->b_status = nlm_lck_denied_grace_period;
266 wake_up(&block->b_wait); 270 wake_up(&block->b_wait);
267 } 271 }
268 } 272 }
273 spin_unlock(&nlm_blocked_lock);
269 274
270 /* Release host handle after use */ 275 /* Release host handle after use */
271 nlm_release_host(host); 276 nlmclnt_release_host(host);
272 lockd_down(); 277 lockd_down();
273 unlock_kernel();
274 return 0; 278 return 0;
275} 279}
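
[editor's note] clntlock.c is part of the BKL-removal arc of this series: nlm_blocked was previously serialized by lock_kernel(), so a dedicated nlm_blocked_lock now covers every list_add()/list_del() and the traversals in nlmclnt_grant() and the reclaimer. Waking sleepers while holding the spinlock is fine because waitqueues take their own internal lock; the resulting pattern, restated compactly:

	spin_lock(&nlm_blocked_lock);
	list_for_each_entry(block, &nlm_blocked, b_list) {
		if (block->b_host == host) {
			block->b_status = nlm_lck_denied_grace_period;
			wake_up(&block->b_wait);	/* safe under spinlock */
		}
	}
	spin_unlock(&nlm_blocked_lock);
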
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..adb45ec9038c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/types.h> 11#include <linux/types.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
@@ -59,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
59 return; 58 return;
60 list_del(&lockowner->list); 59 list_del(&lockowner->list);
61 spin_unlock(&lockowner->host->h_lock); 60 spin_unlock(&lockowner->host->h_lock);
62 nlm_release_host(lockowner->host); 61 nlmclnt_release_host(lockowner->host);
63 kfree(lockowner); 62 kfree(lockowner);
64} 63}
65 64
@@ -166,7 +165,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
166 /* Set up the argument struct */ 165 /* Set up the argument struct */
167 nlmclnt_setlockargs(call, fl); 166 nlmclnt_setlockargs(call, fl);
168 167
169 lock_kernel();
170 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { 168 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
171 if (fl->fl_type != F_UNLCK) { 169 if (fl->fl_type != F_UNLCK) {
172 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; 170 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +175,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
177 status = nlmclnt_test(call, fl); 175 status = nlmclnt_test(call, fl);
178 else 176 else
179 status = -EINVAL; 177 status = -EINVAL;
180
181 fl->fl_ops->fl_release_private(fl); 178 fl->fl_ops->fl_release_private(fl);
182 fl->fl_ops = NULL; 179 fl->fl_ops = NULL;
183 unlock_kernel();
184 180
185 dprintk("lockd: clnt proc returns %d\n", status); 181 dprintk("lockd: clnt proc returns %d\n", status);
186 return status; 182 return status;
@@ -211,24 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
211 printk("nlm_alloc_call: failed, waiting for memory\n"); 207 printk("nlm_alloc_call: failed, waiting for memory\n");
212 schedule_timeout_interruptible(5*HZ); 208 schedule_timeout_interruptible(5*HZ);
213 } 209 }
214 nlm_release_host(host); 210 nlmclnt_release_host(host);
215 return NULL; 211 return NULL;
216} 212}
217 213
218void nlm_release_call(struct nlm_rqst *call) 214void nlmclnt_release_call(struct nlm_rqst *call)
219{ 215{
220 if (!atomic_dec_and_test(&call->a_count)) 216 if (!atomic_dec_and_test(&call->a_count))
221 return; 217 return;
222 nlm_release_host(call->a_host); 218 nlmclnt_release_host(call->a_host);
223 nlmclnt_release_lockargs(call); 219 nlmclnt_release_lockargs(call);
224 kfree(call); 220 kfree(call);
225} 221}
226 222
227static void nlmclnt_rpc_release(void *data) 223static void nlmclnt_rpc_release(void *data)
228{ 224{
229 lock_kernel(); 225 nlmclnt_release_call(data);
230 nlm_release_call(data);
231 unlock_kernel();
232} 226}
233 227
234static int nlm_wait_on_grace(wait_queue_head_t *queue) 228static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -442,20 +436,24 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
442 status = nlm_stat_to_errno(req->a_res.status); 436 status = nlm_stat_to_errno(req->a_res.status);
443 } 437 }
444out: 438out:
445 nlm_release_call(req); 439 nlmclnt_release_call(req);
446 return status; 440 return status;
447} 441}
448 442
449static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) 443static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
450{ 444{
445 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
451 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; 446 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
452 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); 447 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
453 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); 448 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
449 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
454} 450}
455 451
456static void nlmclnt_locks_release_private(struct file_lock *fl) 452static void nlmclnt_locks_release_private(struct file_lock *fl)
457{ 453{
454 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
458 list_del(&fl->fl_u.nfs_fl.list); 455 list_del(&fl->fl_u.nfs_fl.list);
456 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
459 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 457 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
460} 458}
461 459
@@ -595,7 +593,7 @@ again:
595out_unblock: 593out_unblock:
596 nlmclnt_finish_block(block); 594 nlmclnt_finish_block(block);
597out: 595out:
598 nlm_release_call(req); 596 nlmclnt_release_call(req);
599 return status; 597 return status;
600out_unlock: 598out_unlock:
601 /* Fatal error: ensure that we remove the lock altogether */ 599 /* Fatal error: ensure that we remove the lock altogether */
@@ -696,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
696 /* What to do now? I'm out of my depth... */ 694 /* What to do now? I'm out of my depth... */
697 status = -ENOLCK; 695 status = -ENOLCK;
698out: 696out:
699 nlm_release_call(req); 697 nlmclnt_release_call(req);
700 return status; 698 return status;
701} 699}
702 700
@@ -721,9 +719,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
721die: 719die:
722 return; 720 return;
723 retry_rebind: 721 retry_rebind:
724 lock_kernel();
725 nlm_rebind_host(req->a_host); 722 nlm_rebind_host(req->a_host);
726 unlock_kernel();
727 retry_unlock: 723 retry_unlock:
728 rpc_restart_call(task); 724 rpc_restart_call(task);
729} 725}
@@ -759,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
759 NLMPROC_CANCEL, &nlmclnt_cancel_ops); 755 NLMPROC_CANCEL, &nlmclnt_cancel_ops);
760 if (status == 0 && req->a_res.status == nlm_lck_denied) 756 if (status == 0 && req->a_res.status == nlm_lck_denied)
761 status = -ENOLCK; 757 status = -ENOLCK;
762 nlm_release_call(req); 758 nlmclnt_release_call(req);
763 return status; 759 return status;
764} 760}
765 761
@@ -801,9 +797,7 @@ retry_cancel:
801 /* Don't ever retry more than 3 times */ 797 /* Don't ever retry more than 3 times */
802 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 798 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
803 goto die; 799 goto die;
804 lock_kernel();
805 nlm_rebind_host(req->a_host); 800 nlm_rebind_host(req->a_host);
806 unlock_kernel();
807 rpc_restart_call(task); 801 rpc_restart_call(task);
808 rpc_delay(task, 30 * HZ); 802 rpc_delay(task, 30 * HZ);
809} 803}
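
[editor's note] Two related cleanups land here together: nlm_release_call()/nlm_release_host() become nlmclnt_release_call()/nlmclnt_release_host(), separating client-side nlm_host refcounting from the server side, and the lock_kernel()/unlock_kernel() pairs around rebinds and the RPC release callback disappear because the data they guarded is now covered by the per-host h_lock. The h_granted list handling shows the replacement locking; restated as a sketch:

	spin_lock(&host->h_lock);
	list_add_tail(&new->fl_u.nfs_fl.list, &host->h_granted);
	spin_unlock(&host->h_lock);
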
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 000000000000..180ac34feb9a
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
1/*
2 * linux/fs/lockd/clntxdr.c
3 *
4 * XDR functions to encode/decode NLM version 3 RPC arguments and results.
5 * NLM version 3 is backwards compatible with NLM versions 1 and 2.
6 *
7 * NLM client-side only.
8 *
9 * Copyright (C) 2010, Oracle. All rights reserved.
10 */
11
12#include <linux/types.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/stats.h>
16#include <linux/lockd/lockd.h>
17
18#define NLMDBG_FACILITY NLMDBG_XDR
19
20#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
21# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
22#endif
23
24/*
25 * Declare the space requirements for NLM arguments and replies as
26 * number of 32bit-words
27 */
28#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
29#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
30#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
31#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
32#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
33#define NLM_holder_sz (4+NLM_owner_sz)
34
35#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz)
36#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz)
37#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz)
38#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz)
39
40#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz)
41#define NLM_res_sz (NLM_cookie_sz+1)
42#define NLM_norep_sz (0)
43
44
45static s32 loff_t_to_s32(loff_t offset)
46{
47 s32 res;
48
49 if (offset >= NLM_OFFSET_MAX)
50 res = NLM_OFFSET_MAX;
51 else if (offset <= -NLM_OFFSET_MAX)
52 res = -NLM_OFFSET_MAX;
53 else
54 res = offset;
55 return res;
56}
57
58static void nlm_compute_offsets(const struct nlm_lock *lock,
59 u32 *l_offset, u32 *l_len)
60{
61 const struct file_lock *fl = &lock->fl;
62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0;
70 else
71 *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
72}
73
74/*
75 * Handle decode buffer overflows out-of-line.
76 */
77static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
78{
79 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
80 "Remaining buffer length is %tu words.\n",
81 func, xdr->end - xdr->p);
82}
83
84
85/*
86 * Encode/decode NLMv3 basic data types
87 *
88 * Basic NLMv3 data types are not defined in an IETF standards
89 * document. X/Open has a description of these data types that
90 * is useful. See Chapter 10 of "Protocols for Interworking:
91 * XNFS, Version 3W".
92 *
93 * Not all basic data types have their own encoding and decoding
94 * functions. For run-time efficiency, some data types are encoded
95 * or decoded inline.
96 */
97
98static void encode_bool(struct xdr_stream *xdr, const int value)
99{
100 __be32 *p;
101
102 p = xdr_reserve_space(xdr, 4);
103 *p = value ? xdr_one : xdr_zero;
104}
105
106static void encode_int32(struct xdr_stream *xdr, const s32 value)
107{
108 __be32 *p;
109
110 p = xdr_reserve_space(xdr, 4);
111 *p = cpu_to_be32(value);
112}
113
114/*
115 * typedef opaque netobj<MAXNETOBJ_SZ>
116 */
117static void encode_netobj(struct xdr_stream *xdr,
118 const u8 *data, const unsigned int length)
119{
120 __be32 *p;
121
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length);
125}
126
127static int decode_netobj(struct xdr_stream *xdr,
128 struct xdr_netobj *obj)
129{
130 u32 length;
131 __be32 *p;
132
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(p == NULL))
135 goto out_overflow;
136 length = be32_to_cpup(p++);
137 if (unlikely(length > XDR_MAX_NETOBJ))
138 goto out_size;
139 obj->len = length;
140 obj->data = (u8 *)p;
141 return 0;
142out_size:
143 dprintk("NFS: returned netobj was too long: %u\n", length);
144 return -EIO;
145out_overflow:
146 print_overflow_msg(__func__, xdr);
147 return -EIO;
148}
149
150/*
151 * netobj cookie;
152 */
153static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie)
155{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158}
159
160static int decode_cookie(struct xdr_stream *xdr,
161 struct nlm_cookie *cookie)
162{
163 u32 length;
164 __be32 *p;
165
166 p = xdr_inline_decode(xdr, 4);
167 if (unlikely(p == NULL))
168 goto out_overflow;
169 length = be32_to_cpup(p++);
170 /* apparently HPUX can return empty cookies */
171 if (length == 0)
172 goto out_hpux;
173 if (length > NLM_MAXCOOKIELEN)
174 goto out_size;
175 p = xdr_inline_decode(xdr, length);
176 if (unlikely(p == NULL))
177 goto out_overflow;
178 cookie->len = length;
179 memcpy(cookie->data, p, length);
180 return 0;
181out_hpux:
182 cookie->len = 4;
183 memset(cookie->data, 0, 4);
184 return 0;
185out_size:
186 dprintk("NFS: returned cookie was too long: %u\n", length);
187 return -EIO;
188out_overflow:
189 print_overflow_msg(__func__, xdr);
190 return -EIO;
191}
192
193/*
194 * netobj fh;
195 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200}
201
202/*
203 * enum nlm_stats {
204 * LCK_GRANTED = 0,
205 * LCK_DENIED = 1,
206 * LCK_DENIED_NOLOCKS = 2,
207 * LCK_BLOCKED = 3,
208 * LCK_DENIED_GRACE_PERIOD = 4
209 * };
210 *
211 *
212 * struct nlm_stat {
213 * nlm_stats stat;
214 * };
215 *
216 * NB: we don't swap bytes for the NLM status values. The upper
217 * layers deal directly with the status value in network byte
218 * order.
219 */
220
221static void encode_nlm_stat(struct xdr_stream *xdr,
222 const __be32 stat)
223{
224 __be32 *p;
225
226 BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
227 p = xdr_reserve_space(xdr, 4);
228 *p = stat;
229}
230
231static int decode_nlm_stat(struct xdr_stream *xdr,
232 __be32 *stat)
233{
234 __be32 *p;
235
236 p = xdr_inline_decode(xdr, 4);
237 if (unlikely(p == NULL))
238 goto out_overflow;
239 if (unlikely(*p > nlm_lck_denied_grace_period))
240 goto out_enum;
241 *stat = *p;
242 return 0;
243out_enum:
244 dprintk("%s: server returned invalid nlm_stats value: %u\n",
245 __func__, be32_to_cpup(p));
246 return -EIO;
247out_overflow:
248 print_overflow_msg(__func__, xdr);
249 return -EIO;
250}
251
252/*
253 * struct nlm_holder {
254 * bool exclusive;
255 * int uppid;
256 * netobj oh;
257 * unsigned l_offset;
258 * unsigned l_len;
259 * };
260 */
261static void encode_nlm_holder(struct xdr_stream *xdr,
262 const struct nlm_res *result)
263{
264 const struct nlm_lock *lock = &result->lock;
265 u32 l_offset, l_len;
266 __be32 *p;
267
268 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
269 encode_int32(xdr, lock->svid);
270 encode_netobj(xdr, lock->oh.data, lock->oh.len);
271
272 p = xdr_reserve_space(xdr, 4 + 4);
273 nlm_compute_offsets(lock, &l_offset, &l_len);
274 *p++ = cpu_to_be32(l_offset);
275 *p = cpu_to_be32(l_len);
276}
277
278static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
279{
280 struct nlm_lock *lock = &result->lock;
281 struct file_lock *fl = &lock->fl;
282 u32 exclusive, l_offset, l_len;
283 int error;
284 __be32 *p;
285 s32 end;
286
287 memset(lock, 0, sizeof(*lock));
288 locks_init_lock(fl);
289
290 p = xdr_inline_decode(xdr, 4 + 4);
291 if (unlikely(p == NULL))
292 goto out_overflow;
293 exclusive = be32_to_cpup(p++);
294 lock->svid = be32_to_cpup(p);
295 fl->fl_pid = (pid_t)lock->svid;
296
297 error = decode_netobj(xdr, &lock->oh);
298 if (unlikely(error))
299 goto out;
300
301 p = xdr_inline_decode(xdr, 4 + 4);
302 if (unlikely(p == NULL))
303 goto out_overflow;
304
305 fl->fl_flags = FL_POSIX;
306 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
307 l_offset = be32_to_cpup(p++);
308 l_len = be32_to_cpup(p);
309 end = l_offset + l_len - 1;
310
311 fl->fl_start = (loff_t)l_offset;
312 if (l_len == 0 || end < 0)
313 fl->fl_end = OFFSET_MAX;
314 else
315 fl->fl_end = (loff_t)end;
316 error = 0;
317out:
318 return error;
319out_overflow:
320 print_overflow_msg(__func__, xdr);
321 return -EIO;
322}
323
324/*
325 * string caller_name<LM_MAXSTRLEN>;
326 */
327static void encode_caller_name(struct xdr_stream *xdr, const char *name)
328{
329 /* NB: client-side does not set lock->len */
330 u32 length = strlen(name);
331 __be32 *p;
332
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length);
336}
337
338/*
339 * struct nlm_lock {
340 * string caller_name<LM_MAXSTRLEN>;
341 * netobj fh;
342 * netobj oh;
343 * int uppid;
344 * unsigned l_offset;
345 * unsigned l_len;
346 * };
347 */
348static void encode_nlm_lock(struct xdr_stream *xdr,
349 const struct nlm_lock *lock)
350{
351 u32 l_offset, l_len;
352 __be32 *p;
353
354 encode_caller_name(xdr, lock->caller);
355 encode_fh(xdr, &lock->fh);
356 encode_netobj(xdr, lock->oh.data, lock->oh.len);
357
358 p = xdr_reserve_space(xdr, 4 + 4 + 4);
359 *p++ = cpu_to_be32(lock->svid);
360
361 nlm_compute_offsets(lock, &l_offset, &l_len);
362 *p++ = cpu_to_be32(l_offset);
363 *p = cpu_to_be32(l_len);
364}
365
366
367/*
368 * NLMv3 XDR encode functions
369 *
370 * NLMv3 argument types are defined in Chapter 10 of The Open Group's
371 * "Protocols for Interworking: XNFS, Version 3W".
372 */
373
374/*
375 * struct nlm_testargs {
376 * netobj cookie;
377 * bool exclusive;
378 * struct nlm_lock alock;
379 * };
380 */
381static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
382 struct xdr_stream *xdr,
383 const struct nlm_args *args)
384{
385 const struct nlm_lock *lock = &args->lock;
386
387 encode_cookie(xdr, &args->cookie);
388 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
389 encode_nlm_lock(xdr, lock);
390}
391
392/*
393 * struct nlm_lockargs {
394 * netobj cookie;
395 * bool block;
396 * bool exclusive;
397 * struct nlm_lock alock;
398 * bool reclaim;
399 * int state;
400 * };
401 */
402static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
403 struct xdr_stream *xdr,
404 const struct nlm_args *args)
405{
406 const struct nlm_lock *lock = &args->lock;
407
408 encode_cookie(xdr, &args->cookie);
409 encode_bool(xdr, args->block);
410 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
411 encode_nlm_lock(xdr, lock);
412 encode_bool(xdr, args->reclaim);
413 encode_int32(xdr, args->state);
414}
415
416/*
417 * struct nlm_cancargs {
418 * netobj cookie;
419 * bool block;
420 * bool exclusive;
421 * struct nlm_lock alock;
422 * };
423 */
424static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
425 struct xdr_stream *xdr,
426 const struct nlm_args *args)
427{
428 const struct nlm_lock *lock = &args->lock;
429
430 encode_cookie(xdr, &args->cookie);
431 encode_bool(xdr, args->block);
432 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
433 encode_nlm_lock(xdr, lock);
434}
435
436/*
437 * struct nlm_unlockargs {
438 * netobj cookie;
439 * struct nlm_lock alock;
440 * };
441 */
442static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
443 struct xdr_stream *xdr,
444 const struct nlm_args *args)
445{
446 const struct nlm_lock *lock = &args->lock;
447
448 encode_cookie(xdr, &args->cookie);
449 encode_nlm_lock(xdr, lock);
450}
451
452/*
453 * struct nlm_res {
454 * netobj cookie;
455 * nlm_stat stat;
456 * };
457 */
458static void nlm_xdr_enc_res(struct rpc_rqst *req,
459 struct xdr_stream *xdr,
460 const struct nlm_res *result)
461{
462 encode_cookie(xdr, &result->cookie);
463 encode_nlm_stat(xdr, result->status);
464}
465
466/*
467 * union nlm_testrply switch (nlm_stats stat) {
468 * case LCK_DENIED:
469 * struct nlm_holder holder;
470 * default:
471 * void;
472 * };
473 *
474 * struct nlm_testres {
475 * netobj cookie;
476 * nlm_testrply test_stat;
477 * };
478 */
479static void encode_nlm_testrply(struct xdr_stream *xdr,
480 const struct nlm_res *result)
481{
482 if (result->status == nlm_lck_denied)
483 encode_nlm_holder(xdr, result);
484}
485
486static void nlm_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm_stat(xdr, result->status);
492 encode_nlm_testrply(xdr, result);
493}
494
495
496/*
497 * NLMv3 XDR decode functions
498 *
499 * NLMv3 result types are defined in Chapter 10 of The Open Group's
500 * "Protocols for Interworking: XNFS, Version 3W".
501 */
502
503/*
504 * union nlm_testrply switch (nlm_stats stat) {
505 * case LCK_DENIED:
506 * struct nlm_holder holder;
507 * default:
508 * void;
509 * };
510 *
511 * struct nlm_testres {
512 * netobj cookie;
513 * nlm_testrply test_stat;
514 * };
515 */
516static int decode_nlm_testrply(struct xdr_stream *xdr,
517 struct nlm_res *result)
518{
519 int error;
520
521 error = decode_nlm_stat(xdr, &result->status);
522 if (unlikely(error))
523 goto out;
524 if (result->status == nlm_lck_denied)
525 error = decode_nlm_holder(xdr, result);
526out:
527 return error;
528}
529
530static int nlm_xdr_dec_testres(struct rpc_rqst *req,
531 struct xdr_stream *xdr,
532 struct nlm_res *result)
533{
534 int error;
535
536 error = decode_cookie(xdr, &result->cookie);
537 if (unlikely(error))
538 goto out;
539 error = decode_nlm_testrply(xdr, result);
540out:
541 return error;
542}
543
544/*
545 * struct nlm_res {
546 * netobj cookie;
547 * nlm_stat stat;
548 * };
549 */
550static int nlm_xdr_dec_res(struct rpc_rqst *req,
551 struct xdr_stream *xdr,
552 struct nlm_res *result)
553{
554 int error;
555
556 error = decode_cookie(xdr, &result->cookie);
557 if (unlikely(error))
558 goto out;
559 error = decode_nlm_stat(xdr, &result->status);
560out:
561 return error;
562}
563
564
565/*
566 * For NLM, a void procedure really returns nothing
567 */
568#define nlm_xdr_dec_norep NULL
569
570#define PROC(proc, argtype, restype) \
571[NLMPROC_##proc] = { \
572 .p_proc = NLMPROC_##proc, \
573 .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \
574 .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \
575 .p_arglen = NLM_##argtype##_sz, \
576 .p_replen = NLM_##restype##_sz, \
577 .p_statidx = NLMPROC_##proc, \
578 .p_name = #proc, \
579 }
580
581static struct rpc_procinfo nlm_procedures[] = {
582 PROC(TEST, testargs, testres),
583 PROC(LOCK, lockargs, res),
584 PROC(CANCEL, cancargs, res),
585 PROC(UNLOCK, unlockargs, res),
586 PROC(GRANTED, testargs, res),
587 PROC(TEST_MSG, testargs, norep),
588 PROC(LOCK_MSG, lockargs, norep),
589 PROC(CANCEL_MSG, cancargs, norep),
590 PROC(UNLOCK_MSG, unlockargs, norep),
591 PROC(GRANTED_MSG, testargs, norep),
592 PROC(TEST_RES, testres, norep),
593 PROC(LOCK_RES, res, norep),
594 PROC(CANCEL_RES, res, norep),
595 PROC(UNLOCK_RES, res, norep),
596 PROC(GRANTED_RES, res, norep),
597};
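
[Editorial aside, not part of the source file: the PROC() macro above relies on token pasting (##), stringification (#) and designated array initializers, so each table slot is indexed by its own procedure number. A small, runnable user-space analogue with invented names, showing how one PROC() invocation stamps out a whole initializer:]

	#include <stdio.h>

	struct procinfo {
		unsigned int	num;
		const char	*name;
	};

	#define MYPROC_TEST	0
	#define MYPROC_LOCK	1

	/* Same shape as the kernel's PROC() macro: paste the procedure
	 * name to build both the index and .num, stringify it for .name. */
	#define PROC(proc)				\
	[MYPROC_##proc] = {				\
		.num	= MYPROC_##proc,		\
		.name	= #proc,			\
		}

	static struct procinfo procedures[] = {
		PROC(TEST),
		PROC(LOCK),
	};

	int main(void)
	{
		printf("%u %s\n", procedures[1].num, procedures[1].name);
		return 0;	/* prints: 1 LOCK */
	}
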
598
599static struct rpc_version nlm_version1 = {
600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures,
603};
604
605static struct rpc_version nlm_version3 = {
606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures,
609};
610
611static struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1,
613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4
615 [4] = &nlm_version4,
616#endif
617};
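
[Editorial aside, not part of the source file: nlm_versions[] is deliberately sparse. Designated initializers leave the unlisted slots ([0] and [2], and [4] unless CONFIG_LOCKD_V4 is set) as NULL, so the RPC core can index the table directly by protocol version and reject the gaps. Illustration in user-space C:]

	#include <stdio.h>

	static const char *nlm_version_names[] = {
		[1] = "NLMv1",
		[3] = "NLMv3",
		/* [0] and [2] are implicitly NULL, as is any other gap. */
	};

	int main(void)
	{
		for (unsigned int i = 0; i < 4; i++)
			printf("vers %u -> %s\n", i,
			       nlm_version_names[i] ? nlm_version_names[i]
						    : "unsupported");
		return 0;
	}
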
618
619static struct rpc_stat nlm_rpc_stats;
620
621struct rpc_program nlm_program = {
622 .name = "lockd",
623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions),
625 .version = nlm_versions,
626 .stats = &nlm_rpc_stats,
627};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..b7c99bfb3da6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
 #define NLM_HOST_EXPIRE		(300 * HZ)
 #define NLM_HOST_COLLECT	(120 * HZ)
 
-static struct hlist_head	nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_server_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_client_hosts[NLM_HOST_NRHASH];
+
+#define for_each_host(host, pos, chain, table) \
+	for ((chain) = (table); \
+	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+		hlist_for_each_entry((host), (pos), (chain), h_hash)
+
+#define for_each_host_safe(host, pos, next, chain, table) \
+	for ((chain) = (table); \
+	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+		hlist_for_each_entry_safe((host), (pos), (next), \
+						(chain), h_hash)
+
 static unsigned long		next_gc;
-static int			nrhosts;
+static unsigned long		nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 static void			nlm_gc_hosts(void);
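
[Editorial aside on the hunk above, not part of the patch: for_each_host() nests hlist_for_each_entry() inside a for loop over the NLM_HOST_NRHASH buckets, so one macro walks the entire hash table. A runnable user-space analogue of the same double loop over an array of singly linked chains; the toy types here are not the kernel's:]

	#include <stdio.h>

	#define NRHASH 4

	struct node {
		int		val;
		struct node	*next;
	};

	/* Walk every chain, then every node within the chain: the same
	 * shape as the kernel's for_each_host() macro. */
	#define for_each_node(n, chain, table)				\
		for ((chain) = (table);					\
		     (chain) < (table) + NRHASH; ++(chain))		\
			for ((n) = *(chain); (n) != NULL; (n) = (n)->next)

	int main(void)
	{
		struct node a = { 1, NULL }, b = { 2, NULL }, c = { 3, &b };
		struct node *table[NRHASH] = { &a, NULL, &c, NULL };
		struct node **chain, *n;

		for_each_node(n, chain, table)
			printf("%d\n", n->val);		/* 1 3 2 */
		return 0;
	}
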
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
 	const u32		version;	/* NLM version to search for */
 	const char		*hostname;	/* remote's hostname */
 	const size_t		hostname_len;	/* it's length */
-	const struct sockaddr	*src_sap;	/* our address (optional) */
-	const size_t		src_len;	/* it's length */
 	const int		noresvport;	/* use non-priv port */
 };
 
@@ -88,126 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
 }
 
 /*
- * Common host lookup routine for server & client
+ * Allocate and initialize an nlm_host.  Common to both client and server.
  */
-static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
+static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
+				       struct nsm_handle *nsm)
 {
-	struct hlist_head *chain;
-	struct hlist_node *pos;
-	struct nlm_host	*host;
-	struct nsm_handle *nsm = NULL;
-
-	mutex_lock(&nlm_host_mutex);
-
-	if (time_after_eq(jiffies, next_gc))
-		nlm_gc_hosts();
-
-	/* We may keep several nlm_host objects for a peer, because each
-	 * nlm_host is identified by
-	 * (address, protocol, version, server/client)
-	 * We could probably simplify this a little by putting all those
-	 * different NLM rpc_clients into one single nlm_host object.
-	 * This would allow us to have one nlm_host per address.
-	 */
-	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
-	hlist_for_each_entry(host, pos, chain, h_hash) {
-		if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
-			continue;
-
-		/* See if we have an NSM handle for this client */
-		if (!nsm)
-			nsm = host->h_nsmhandle;
-
-		if (host->h_proto != ni->protocol)
-			continue;
-		if (host->h_version != ni->version)
-			continue;
-		if (host->h_server != ni->server)
-			continue;
-		if (ni->server &&
-		    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
-			continue;
-
-		/* Move to head of hash chain. */
-		hlist_del(&host->h_hash);
-		hlist_add_head(&host->h_hash, chain);
-
-		nlm_get_host(host);
-		dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
-				host->h_name, host->h_addrbuf);
-		goto out;
-	}
+	struct nlm_host *host = NULL;
+	unsigned long now = jiffies;
 
-	/*
-	 * The host wasn't in our hash table.  If we don't
-	 * have an NSM handle for it yet, create one.
-	 */
-	if (nsm)
+	if (nsm != NULL)
 		atomic_inc(&nsm->sm_count);
 	else {
 		host = NULL;
 		nsm = nsm_get_handle(ni->sap, ni->salen,
					ni->hostname, ni->hostname_len);
-		if (!nsm) {
-			dprintk("lockd: nlm_lookup_host failed; "
-				"no nsm handle\n");
+		if (unlikely(nsm == NULL)) {
+			dprintk("lockd: %s failed; no nsm handle\n",
+				__func__);
 			goto out;
 		}
 	}
 
-	host = kzalloc(sizeof(*host), GFP_KERNEL);
-	if (!host) {
+	host = kmalloc(sizeof(*host), GFP_KERNEL);
+	if (unlikely(host == NULL)) {
+		dprintk("lockd: %s failed; no memory\n", __func__);
 		nsm_release(nsm);
-		dprintk("lockd: nlm_lookup_host failed; no memory\n");
 		goto out;
 	}
-	host->h_name	   = nsm->sm_name;
-	host->h_addrbuf    = nsm->sm_addrbuf;
+
 	memcpy(nlm_addr(host), ni->sap, ni->salen);
 	host->h_addrlen = ni->salen;
 	rpc_set_port(nlm_addr(host), 0);
-	memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+	host->h_srcaddrlen = 0;
+
+	host->h_rpcclnt = NULL;
+	host->h_name = nsm->sm_name;
 	host->h_version = ni->version;
 	host->h_proto = ni->protocol;
-	host->h_rpcclnt = NULL;
-	mutex_init(&host->h_mutex);
-	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
-	host->h_expires = jiffies + NLM_HOST_EXPIRE;
-	atomic_set(&host->h_count, 1);
+	host->h_reclaiming = 0;
+	host->h_server = ni->server;
+	host->h_noresvport = ni->noresvport;
+	host->h_inuse = 0;
 	init_waitqueue_head(&host->h_gracewait);
 	init_rwsem(&host->h_rwsem);
-	host->h_state = 0;		/* pseudo NSM state */
-	host->h_nsmstate = 0;		/* real NSM state */
-	host->h_nsmhandle = nsm;
-	host->h_server = ni->server;
-	host->h_noresvport = ni->noresvport;
-	hlist_add_head(&host->h_hash, chain);
+	host->h_state = 0;
+	host->h_nsmstate = 0;
+	host->h_pidcount = 0;
+	atomic_set(&host->h_count, 1);
+	mutex_init(&host->h_mutex);
+	host->h_nextrebind = now + NLM_HOST_REBIND;
+	host->h_expires = now + NLM_HOST_EXPIRE;
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
 	INIT_LIST_HEAD(&host->h_granted);
 	INIT_LIST_HEAD(&host->h_reclaim);
-
-	nrhosts++;
-
-	dprintk("lockd: nlm_lookup_host created host %s\n",
-			host->h_name);
+	host->h_nsmhandle = nsm;
+	host->h_addrbuf = nsm->sm_addrbuf;
 
 out:
-	mutex_unlock(&nlm_host_mutex);
 	return host;
 }
 
 /*
- * Destroy a host
+ * Destroy an nlm_host and free associated resources
+ *
+ * Caller must hold nlm_host_mutex.
  */
-static void
-nlm_destroy_host(struct nlm_host *host)
+static void nlm_destroy_host_locked(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
+	dprintk("lockd: destroy host %s\n", host->h_name);
+
 	BUG_ON(!list_empty(&host->h_lockowners));
 	BUG_ON(atomic_read(&host->h_count));
 
+	hlist_del_init(&host->h_hash);
+
 	nsm_unmonitor(host);
 	nsm_release(host->h_nsmhandle);
 
@@ -215,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
 	if (clnt != NULL)
 		rpc_shutdown_client(clnt);
 	kfree(host);
+
+	nrhosts--;
 }
 
 /**
@@ -238,9 +208,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 				     const char *hostname,
 				     int noresvport)
 {
-	const struct sockaddr source = {
-		.sa_family	= AF_UNSPEC,
-	};
 	struct nlm_lookup_host_info ni = {
 		.server		= 0,
 		.sap		= sap,
@@ -249,16 +216,78 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 		.version	= version,
 		.hostname	= hostname,
 		.hostname_len	= strlen(hostname),
-		.src_sap	= &source,
-		.src_len	= sizeof(source),
 		.noresvport	= noresvport,
 	};
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host *host;
+	struct nsm_handle *nsm = NULL;
 
 	dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
 			(hostname ? hostname : "<none>"), version,
 			(protocol == IPPROTO_UDP ? "udp" : "tcp"));
 
-	return nlm_lookup_host(&ni);
+	mutex_lock(&nlm_host_mutex);
+
+	chain = &nlm_client_hosts[nlm_hash_address(sap)];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!rpc_cmp_addr(nlm_addr(host), sap))
+			continue;
+
+		/* Same address. Share an NSM handle if we already have one */
+		if (nsm == NULL)
+			nsm = host->h_nsmhandle;
+
+		if (host->h_proto != protocol)
+			continue;
+		if (host->h_version != version)
+			continue;
+
+		nlm_get_host(host);
+		dprintk("lockd: %s found host %s (%s)\n", __func__,
+			host->h_name, host->h_addrbuf);
+		goto out;
+	}
+
+	host = nlm_alloc_host(&ni, nsm);
+	if (unlikely(host == NULL))
+		goto out;
+
+	hlist_add_head(&host->h_hash, chain);
+	nrhosts++;
+
+	dprintk("lockd: %s created host %s (%s)\n", __func__,
+		host->h_name, host->h_addrbuf);
+
+out:
+	mutex_unlock(&nlm_host_mutex);
+	return host;
+}
+
+/**
+ * nlmclnt_release_host - release client nlm_host
+ * @host: nlm_host to release
+ *
+ */
+void nlmclnt_release_host(struct nlm_host *host)
+{
+	if (host == NULL)
+		return;
+
+	dprintk("lockd: release client host %s\n", host->h_name);
+
+	BUG_ON(atomic_read(&host->h_count) < 0);
+	BUG_ON(host->h_server);
+
+	if (atomic_dec_and_test(&host->h_count)) {
+		BUG_ON(!list_empty(&host->h_lockowners));
+		BUG_ON(!list_empty(&host->h_granted));
+		BUG_ON(!list_empty(&host->h_reclaim));
+
+		mutex_lock(&nlm_host_mutex);
+		nlm_destroy_host_locked(host);
+		mutex_unlock(&nlm_host_mutex);
+	}
 }
 
 /**
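
[Editorial aside on nlmclnt_release_host() above, not part of the patch: this is the common "drop a reference, and only the thread that drops the last one takes the table mutex and destroys the object" idiom. A runnable user-space sketch with C11 atomics; the table unlink is elided and the names are invented:]

	#include <stdatomic.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;

	struct host {
		atomic_int refcount;
		/* ... hash-table linkage would live here ... */
	};

	static void release_host(struct host *h)
	{
		if (h == NULL)
			return;
		/* Only the caller who drops the count to zero destroys;
		 * the mutex protects the (elided) unlink from the table. */
		if (atomic_fetch_sub(&h->refcount, 1) == 1) {
			pthread_mutex_lock(&table_mutex);
			/* unlink from the table here */
			pthread_mutex_unlock(&table_mutex);
			free(h);
		}
	}

	int main(void)
	{
		struct host *h = malloc(sizeof(*h));

		atomic_init(&h->refcount, 2);	/* two users */
		release_host(h);		/* 2 -> 1: no destroy */
		release_host(h);		/* 1 -> 0: destroyed */
		printf("done\n");
		return 0;
	}
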
@@ -283,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 				    const char *hostname,
 				    const size_t hostname_len)
 {
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host *host = NULL;
+	struct nsm_handle *nsm = NULL;
 	struct sockaddr_in sin = {
 		.sin_family	= AF_INET,
 	};
 	struct sockaddr_in6 sin6 = {
 		.sin6_family	= AF_INET6,
 	};
+	struct sockaddr *src_sap;
+	size_t src_len = rqstp->rq_addrlen;
 	struct nlm_lookup_host_info ni = {
 		.server		= 1,
 		.sap		= svc_addr(rqstp),
@@ -297,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 		.version	= rqstp->rq_vers,
 		.hostname	= hostname,
 		.hostname_len	= hostname_len,
-		.src_len	= rqstp->rq_addrlen,
 	};
 
 	dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
 			(int)hostname_len, hostname, rqstp->rq_vers,
 			(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
 
+	mutex_lock(&nlm_host_mutex);
+
 	switch (ni.sap->sa_family) {
 	case AF_INET:
 		sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
-		ni.src_sap = (struct sockaddr *)&sin;
+		src_sap = (struct sockaddr *)&sin;
 		break;
 	case AF_INET6:
 		ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
-		ni.src_sap = (struct sockaddr *)&sin6;
+		src_sap = (struct sockaddr *)&sin6;
 		break;
 	default:
-		return NULL;
+		dprintk("lockd: %s failed; unrecognized address family\n",
+			__func__);
+		goto out;
 	}
 
-	return nlm_lookup_host(&ni);
+	if (time_after_eq(jiffies, next_gc))
+		nlm_gc_hosts();
+
+	chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
+			continue;
+
+		/* Same address. Share an NSM handle if we already have one */
+		if (nsm == NULL)
+			nsm = host->h_nsmhandle;
+
+		if (host->h_proto != ni.protocol)
+			continue;
+		if (host->h_version != ni.version)
+			continue;
+		if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
+			continue;
+
+		/* Move to head of hash chain. */
+		hlist_del(&host->h_hash);
+		hlist_add_head(&host->h_hash, chain);
+
+		nlm_get_host(host);
+		dprintk("lockd: %s found host %s (%s)\n",
+			__func__, host->h_name, host->h_addrbuf);
+		goto out;
+	}
+
+	host = nlm_alloc_host(&ni, nsm);
+	if (unlikely(host == NULL))
+		goto out;
+
+	memcpy(nlm_srcaddr(host), src_sap, src_len);
+	host->h_srcaddrlen = src_len;
+	hlist_add_head(&host->h_hash, chain);
+	nrhosts++;
+
+	dprintk("lockd: %s created host %s (%s)\n",
+		__func__, host->h_name, host->h_addrbuf);
+
+out:
+	mutex_unlock(&nlm_host_mutex);
+	return host;
+}
+
+/**
+ * nlmsvc_release_host - release server nlm_host
+ * @host: nlm_host to release
+ *
+ * Host is destroyed later in nlm_gc_host().
+ */
+void nlmsvc_release_host(struct nlm_host *host)
+{
+	if (host == NULL)
+		return;
+
+	dprintk("lockd: release server host %s\n", host->h_name);
+
+	BUG_ON(atomic_read(&host->h_count) < 0);
+	BUG_ON(!host->h_server);
+	atomic_dec(&host->h_count);
 }
 
 /*
@@ -353,10 +452,10 @@ nlm_bind_host(struct nlm_host *host)
 		.to_retries	= 5U,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= host->h_proto,
 		.address	= nlm_addr(host),
 		.addrsize	= host->h_addrlen,
-		.saddress	= nlm_srcaddr(host),
 		.timeout	= &timeparms,
 		.servername	= host->h_name,
 		.program	= &nlm_program,
@@ -375,6 +474,8 @@ nlm_bind_host(struct nlm_host *host)
 			args.flags |= RPC_CLNT_CREATE_HARDRTRY;
 		if (host->h_noresvport)
 			args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+		if (host->h_srcaddrlen)
+			args.saddress = nlm_srcaddr(host);
 
 		clnt = rpc_create(&args);
 		if (!IS_ERR(clnt))
@@ -415,20 +516,29 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
 	return host;
 }
 
-/*
- * Release NLM host after use
- */
-void nlm_release_host(struct nlm_host *host)
+static struct nlm_host *next_host_state(struct hlist_head *cache,
+					struct nsm_handle *nsm,
+					const struct nlm_reboot *info)
 {
-	if (host != NULL) {
-		dprintk("lockd: release host %s\n", host->h_name);
-		BUG_ON(atomic_read(&host->h_count) < 0);
-		if (atomic_dec_and_test(&host->h_count)) {
-			BUG_ON(!list_empty(&host->h_lockowners));
-			BUG_ON(!list_empty(&host->h_granted));
-			BUG_ON(!list_empty(&host->h_reclaim));
+	struct nlm_host *host;
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+
+	mutex_lock(&nlm_host_mutex);
+	for_each_host(host, pos, chain, cache) {
+		if (host->h_nsmhandle == nsm
+		    && host->h_nsmstate != info->state) {
+			host->h_nsmstate = info->state;
+			host->h_state++;
+
+			nlm_get_host(host);
+			mutex_unlock(&nlm_host_mutex);
+			return host;
 		}
 	}
+
+	mutex_unlock(&nlm_host_mutex);
+	return NULL;
 }
 
 /**
@@ -440,8 +550,6 @@ void nlm_release_host(struct nlm_host *host)
  */
 void nlm_host_rebooted(const struct nlm_reboot *info)
 {
-	struct hlist_head *chain;
-	struct hlist_node *pos;
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
@@ -454,32 +562,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
 	 * lock for this.
 	 * To avoid processing a host several times, we match the nsmstate.
 	 */
-again:	mutex_lock(&nlm_host_mutex);
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash) {
-			if (host->h_nsmhandle == nsm
-			 && host->h_nsmstate != info->state) {
-				host->h_nsmstate = info->state;
-				host->h_state++;
-
-				nlm_get_host(host);
-				mutex_unlock(&nlm_host_mutex);
-
-				if (host->h_server) {
-					/* We're server for this guy, just ditch
-					 * all the locks he held. */
-					nlmsvc_free_host_resources(host);
-				} else {
-					/* He's the server, initiate lock recovery. */
-					nlmclnt_recovery(host);
-				}
-
-				nlm_release_host(host);
-				goto again;
-			}
-		}
+	while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
+		nlmsvc_free_host_resources(host);
+		nlmsvc_release_host(host);
 	}
-	mutex_unlock(&nlm_host_mutex);
+	while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
+		nlmclnt_recovery(host);
+		nlmclnt_release_host(host);
+	}
+
 	nsm_release(nsm);
 }
 
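
[Editorial aside on the rewritten nlm_host_rebooted() above, not part of the patch: instead of restarting a full table scan after every drop of the mutex (the old goto again), next_host_state() hands back one matching, already-marked host at a time, with the lock released in between, so the caller's loop stays flat. A runnable user-space sketch of that "pop one item at a time under a lock" pattern, with a toy array in place of the hash table:]

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t list_mutex = PTHREAD_MUTEX_INITIALIZER;

	struct item {
		int	state;
		int	seen;
	};

	static struct item items[] = { {1, 0}, {0, 0}, {1, 0} };

	/* Return the next unprocessed item matching 'state', marking it
	 * under the lock so each item is handed out exactly once. */
	static struct item *next_item(int state)
	{
		pthread_mutex_lock(&list_mutex);
		for (size_t i = 0; i < sizeof(items)/sizeof(items[0]); i++) {
			if (items[i].state == state && !items[i].seen) {
				items[i].seen = 1;
				pthread_mutex_unlock(&list_mutex);
				return &items[i];
			}
		}
		pthread_mutex_unlock(&list_mutex);
		return NULL;
	}

	int main(void)
	{
		struct item *it;

		/* Caller may sleep or take other locks between items. */
		while ((it = next_item(1)) != NULL)
			printf("processed item at %p\n", (void *)it);
		return 0;
	}
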
@@ -499,13 +590,11 @@ nlm_shutdown_hosts(void)
 
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash) {
-			host->h_expires = jiffies - 1;
-			if (host->h_rpcclnt) {
-				rpc_shutdown_client(host->h_rpcclnt);
-				host->h_rpcclnt = NULL;
-			}
+	for_each_host(host, pos, chain, nlm_server_hosts) {
+		host->h_expires = jiffies - 1;
+		if (host->h_rpcclnt) {
+			rpc_shutdown_client(host->h_rpcclnt);
+			host->h_rpcclnt = NULL;
 		}
 	}
 
@@ -514,15 +603,13 @@ nlm_shutdown_hosts(void)
 	mutex_unlock(&nlm_host_mutex);
 
 	/* complain if any hosts are left */
-	if (nrhosts) {
+	if (nrhosts != 0) {
 		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
-		dprintk("lockd: %d hosts left:\n", nrhosts);
-		for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-			hlist_for_each_entry(host, pos, chain, h_hash) {
-				dprintk("       %s (cnt %d use %d exp %ld)\n",
-					host->h_name, atomic_read(&host->h_count),
-					host->h_inuse, host->h_expires);
-			}
+		dprintk("lockd: %lu hosts left:\n", nrhosts);
+		for_each_host(host, pos, chain, nlm_server_hosts) {
+			dprintk("       %s (cnt %d use %d exp %ld)\n",
+				host->h_name, atomic_read(&host->h_count),
+				host->h_inuse, host->h_expires);
 		}
 	}
 }
@@ -540,29 +627,22 @@ nlm_gc_hosts(void)
 	struct nlm_host	*host;
 
 	dprintk("lockd: host garbage collection\n");
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash)
-			host->h_inuse = 0;
-	}
+	for_each_host(host, pos, chain, nlm_server_hosts)
+		host->h_inuse = 0;
 
 	/* Mark all hosts that hold locks, blocks or shares */
 	nlmsvc_mark_resources();
 
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
-			if (atomic_read(&host->h_count) || host->h_inuse
-			 || time_before(jiffies, host->h_expires)) {
-				dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
-					host->h_name, atomic_read(&host->h_count),
-					host->h_inuse, host->h_expires);
-				continue;
-			}
-			dprintk("lockd: delete host %s\n", host->h_name);
-			hlist_del_init(&host->h_hash);
-
-			nlm_destroy_host(host);
-			nrhosts--;
+	for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
+		if (atomic_read(&host->h_count) || host->h_inuse
+		    || time_before(jiffies, host->h_expires)) {
+			dprintk("nlm_gc_hosts skipping %s "
+				"(cnt %d use %d exp %ld)\n",
+				host->h_name, atomic_read(&host->h_count),
+				host->h_inuse, host->h_expires);
+			continue;
 		}
+		nlm_destroy_host_locked(host);
 	}
 
 	next_gc = jiffies + NLM_HOST_COLLECT;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..23d7451b2938 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
 		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
 	};
 	struct rpc_create_args args = {
+		.net			= &init_net,
 		.protocol		= XPRT_TRANSPORT_UDP,
 		.address		= (struct sockaddr *)&sin,
 		.addrsize		= sizeof(sin),
@@ -400,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
  * Status Monitor wire protocol.
  */
 
-static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
+static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
 	const u32 len = strlen(string);
 	__be32 *p;
 
-	if (unlikely(len > SM_MAXSTRLEN))
-		return -EIO;
-	p = xdr_reserve_space(xdr, sizeof(u32) + len);
-	if (unlikely(p == NULL))
-		return -EIO;
+	BUG_ON(len > SM_MAXSTRLEN);
+	p = xdr_reserve_space(xdr, 4 + len);
 	xdr_encode_opaque(p, string, len);
-	return 0;
 }
 
 /*
  * "mon_name" specifies the host to be monitored.
  */
-static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	return encode_nsm_string(xdr, argp->mon_name);
+	encode_nsm_string(xdr, argp->mon_name);
 }
 
 /*
@@ -428,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
  * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
  * has changed.
  */
-static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	int status;
 	__be32 *p;
 
-	status = encode_nsm_string(xdr, utsname()->nodename);
-	if (unlikely(status != 0))
-		return status;
-	p = xdr_reserve_space(xdr, 3 * sizeof(u32));
-	if (unlikely(p == NULL))
-		return -EIO;
-	*p++ = htonl(argp->prog);
-	*p++ = htonl(argp->vers);
-	*p++ = htonl(argp->proc);
-	return 0;
+	encode_nsm_string(xdr, utsname()->nodename);
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	*p++ = cpu_to_be32(argp->prog);
+	*p++ = cpu_to_be32(argp->vers);
+	*p = cpu_to_be32(argp->proc);
 }
 
 /*
  * The "mon_id" argument specifies the non-private arguments
  * of an NSMPROC_MON or NSMPROC_UNMON call.
  */
-static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	int status;
-
-	status = encode_mon_name(xdr, argp);
-	if (unlikely(status != 0))
-		return status;
-	return encode_my_id(xdr, argp);
+	encode_mon_name(xdr, argp);
+	encode_my_id(xdr, argp);
 }
 
 /*
@@ -464,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
  * by the NSMPROC_MON call. This information will be supplied in the
  * NLMPROC_SM_NOTIFY call.
  */
-static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
-	if (unlikely(p == NULL))
-		return -EIO;
 	xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
-	return 0;
 }
 
-static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
-		       const struct nsm_args *argp)
+static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
+			    const struct nsm_args *argp)
 {
-	struct xdr_stream xdr;
-	int status;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	status = encode_mon_id(&xdr, argp);
-	if (unlikely(status))
-		return status;
-	return encode_priv(&xdr, argp);
+	encode_mon_id(xdr, argp);
+	encode_priv(xdr, argp);
 }
 
-static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
-			 const struct nsm_args *argp)
+static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      const struct nsm_args *argp)
 {
-	struct xdr_stream xdr;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	return encode_mon_id(&xdr, argp);
+	encode_mon_id(xdr, argp);
 }
 
-static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
-			    struct nsm_res *resp)
+static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nsm_res *resp)
 {
-	struct xdr_stream xdr;
+	__be32 *p;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
+	p = xdr_inline_decode(xdr, 4 + 4);
 	if (unlikely(p == NULL))
 		return -EIO;
-	resp->status = ntohl(*p++);
-	resp->state = ntohl(*p);
+	resp->status = be32_to_cpup(p++);
+	resp->state = be32_to_cpup(p);
 
-	dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
-		resp->status, resp->state);
+	dprintk("lockd: %s status %d state %d\n",
+		__func__, resp->status, resp->state);
 	return 0;
 }
 
-static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
-			struct nsm_res *resp)
+static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
+			    struct xdr_stream *xdr,
+			    struct nsm_res *resp)
 {
-	struct xdr_stream xdr;
+	__be32 *p;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	p = xdr_inline_decode(&xdr, sizeof(u32));
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
		return -EIO;
-	resp->state = ntohl(*p);
+	resp->state = be32_to_cpup(p);
 
-	dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
+	dprintk("lockd: %s state %d\n", __func__, resp->state);
 	return 0;
 }
 
@@ -541,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
 static struct rpc_procinfo	nsm_procedures[] = {
 [NSMPROC_MON] = {
 		.p_proc		= NSMPROC_MON,
-		.p_encode	= (kxdrproc_t)xdr_enc_mon,
-		.p_decode	= (kxdrproc_t)xdr_dec_stat_res,
+		.p_encode	= (kxdreproc_t)nsm_xdr_enc_mon,
+		.p_decode	= (kxdrdproc_t)nsm_xdr_dec_stat_res,
 		.p_arglen	= SM_mon_sz,
 		.p_replen	= SM_monres_sz,
 		.p_statidx	= NSMPROC_MON,
@@ -550,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = {
 	},
 [NSMPROC_UNMON] = {
 		.p_proc		= NSMPROC_UNMON,
-		.p_encode	= (kxdrproc_t)xdr_enc_unmon,
-		.p_decode	= (kxdrproc_t)xdr_dec_stat,
+		.p_encode	= (kxdreproc_t)nsm_xdr_enc_unmon,
+		.p_decode	= (kxdrdproc_t)nsm_xdr_dec_stat,
 		.p_arglen	= SM_mon_id_sz,
 		.p_replen	= SM_unmonres_sz,
 		.p_statidx	= NSMPROC_UNMON,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/smp.h> 24#include <linux/smp.h>
25#include <linux/smp_lock.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/kthread.h> 26#include <linux/kthread.h>
28#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
130 129
131 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 130 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
132 131
133 /*
134 * FIXME: it would be nice if lockd didn't spend its entire life
135 * running under the BKL. At the very least, it would be good to
136 * have someone clarify what it's intended to protect here. I've
137 * seen some handwavy posts about posix locking needing to be
138 * done under the BKL, but it's far from clear.
139 */
140 lock_kernel();
141
142 if (!nlm_timeout) 132 if (!nlm_timeout)
143 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
144 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
195 if (nlmsvc_ops) 185 if (nlmsvc_ops)
196 nlmsvc_invalidate_all(); 186 nlmsvc_invalidate_all();
197 nlm_shutdown_hosts(); 187 nlm_shutdown_hosts();
198 unlock_kernel();
199 return 0; 188 return 0;
200} 189}
201 190
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
206 195
207 xprt = svc_find_xprt(serv, name, family, 0); 196 xprt = svc_find_xprt(serv, name, family, 0);
208 if (xprt == NULL) 197 if (xprt == NULL)
209 return svc_create_xprt(serv, name, family, port, 198 return svc_create_xprt(serv, name, &init_net, family, port,
210 SVC_SOCK_DEFAULTS); 199 SVC_SOCK_DEFAULTS);
211 svc_xprt_put(xprt); 200 svc_xprt_put(xprt);
212 return 0; 201 return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..9a41fdc19511 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
@@ -52,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
52 return 0; 51 return 0;
53 52
54no_locks: 53no_locks:
55 nlm_release_host(host); 54 nlmsvc_release_host(host);
56 if (error) 55 if (error)
57 return error; 56 return error;
58 return nlm_lck_denied_nolocks; 57 return nlm_lck_denied_nolocks;
@@ -93,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
93 else 92 else
94 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
95 94
96 nlm_release_host(host); 95 nlmsvc_release_host(host);
97 nlm_release_file(file); 96 nlm_release_file(file);
98 return rc; 97 return rc;
99} 98}
@@ -135,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
135 else 134 else
136 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
137 136
138 nlm_release_host(host); 137 nlmsvc_release_host(host);
139 nlm_release_file(file); 138 nlm_release_file(file);
140 return rc; 139 return rc;
141} 140}
@@ -165,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
165 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
166 165
167 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
168 nlm_release_host(host); 167 nlmsvc_release_host(host);
169 nlm_release_file(file); 168 nlm_release_file(file);
170 return rpc_success; 169 return rpc_success;
171} 170}
@@ -198,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
198 resp->status = nlmsvc_unlock(file, &argp->lock); 197 resp->status = nlmsvc_unlock(file, &argp->lock);
199 198
200 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
201 nlm_release_host(host); 200 nlmsvc_release_host(host);
202 nlm_release_file(file); 201 nlm_release_file(file);
203 return rpc_success; 202 return rpc_success;
204} 203}
@@ -230,9 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
230 229
231static void nlm4svc_callback_release(void *data) 230static void nlm4svc_callback_release(void *data)
232{ 231{
233 lock_kernel(); 232 nlmsvc_release_call(data);
234 nlm_release_call(data);
235 unlock_kernel();
236} 233}
237 234
238static const struct rpc_call_ops nlm4svc_callback_ops = { 235static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -264,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
264 261
265 stat = func(rqstp, argp, &call->a_res); 262 stat = func(rqstp, argp, &call->a_res);
266 if (stat != 0) { 263 if (stat != 0) {
267 nlm_release_call(call); 264 nlmsvc_release_call(call);
268 return stat; 265 return stat;
269 } 266 }
270 267
@@ -337,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
337 resp->status = nlmsvc_share_file(host, file, argp); 334 resp->status = nlmsvc_share_file(host, file, argp);
338 335
339 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
340 nlm_release_host(host); 337 nlmsvc_release_host(host);
341 nlm_release_file(file); 338 nlm_release_file(file);
342 return rpc_success; 339 return rpc_success;
343} 340}
@@ -370,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
370 resp->status = nlmsvc_unshare_file(host, file, argp); 367 resp->status = nlmsvc_unshare_file(host, file, argp);
371 368
372 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
373 nlm_release_host(host); 370 nlmsvc_release_host(host);
374 nlm_release_file(file); 371 nlm_release_file(file);
375 return rpc_success; 372 return rpc_success;
376} 373}
@@ -402,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
402 return rpc_success; 399 return rpc_success;
403 400
404 nlmsvc_free_host_resources(host); 401 nlmsvc_free_host_resources(host);
405 nlm_release_host(host); 402 nlmsvc_release_host(host);
406 return rpc_success; 403 return rpc_success;
407} 404}
408 405
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..6e31695d046f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/smp_lock.h>
29#include <linux/sunrpc/clnt.h> 28#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/svc.h> 29#include <linux/sunrpc/svc.h>
31#include <linux/lockd/nlm.h> 30#include <linux/lockd/nlm.h>
@@ -47,17 +46,19 @@ static void nlmsvc_remove_block(struct nlm_block *block);
47static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
48static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
49static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
50 50
51/* 51/*
52 * The list of blocked locks to retry 52 * The list of blocked locks to retry
53 */ 53 */
54static LIST_HEAD(nlm_blocked); 54static LIST_HEAD(nlm_blocked);
55static DEFINE_SPINLOCK(nlm_blocked_lock);
55 56
56/* 57/*
57 * Insert a blocked lock into the global list 58 * Insert a blocked lock into the global list
58 */ 59 */
59static void 60static void
60nlmsvc_insert_block(struct nlm_block *block, unsigned long when) 61nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
61{ 62{
62 struct nlm_block *b; 63 struct nlm_block *b;
63 struct list_head *pos; 64 struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
87 block->b_when = when; 88 block->b_when = when;
88} 89}
89 90
91static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
92{
93 spin_lock(&nlm_blocked_lock);
94 nlmsvc_insert_block_locked(block, when);
95 spin_unlock(&nlm_blocked_lock);
96}
97
90/* 98/*
91 * Remove a block from the global list 99 * Remove a block from the global list
92 */ 100 */
@@ -94,7 +102,9 @@ static inline void
94nlmsvc_remove_block(struct nlm_block *block) 102nlmsvc_remove_block(struct nlm_block *block)
95{ 103{
96 if (!list_empty(&block->b_list)) { 104 if (!list_empty(&block->b_list)) {
105 spin_lock(&nlm_blocked_lock);
97 list_del_init(&block->b_list); 106 list_del_init(&block->b_list);
107 spin_unlock(&nlm_blocked_lock);
98 nlmsvc_release_block(block); 108 nlmsvc_release_block(block);
99 } 109 }
100} 110}
@@ -224,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
224failed_free: 234failed_free:
225 kfree(block); 235 kfree(block);
226failed: 236failed:
227 nlm_release_call(call); 237 nlmsvc_release_call(call);
228 return NULL; 238 return NULL;
229} 239}
230 240
@@ -257,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
257 mutex_unlock(&file->f_mutex); 267 mutex_unlock(&file->f_mutex);
258 268
259 nlmsvc_freegrantargs(block->b_call); 269 nlmsvc_freegrantargs(block->b_call);
260 nlm_release_call(block->b_call); 270 nlmsvc_release_call(block->b_call);
261 nlm_release_file(block->b_file); 271 nlm_release_file(block->b_file);
262 kfree(block->b_fl); 272 kfree(block->b_fl);
263 kfree(block); 273 kfree(block);
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
651 struct nlm_block *block; 661 struct nlm_block *block;
652 int rc = -ENOENT; 662 int rc = -ENOENT;
653 663
654 lock_kernel(); 664 spin_lock(&nlm_blocked_lock);
655 list_for_each_entry(block, &nlm_blocked, b_list) { 665 list_for_each_entry(block, &nlm_blocked, b_list) {
656 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 666 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
657 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", 667 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
665 } else if (result == 0) 675 } else if (result == 0)
666 block->b_granted = 1; 676 block->b_granted = 1;
667 677
668 nlmsvc_insert_block(block, 0); 678 nlmsvc_insert_block_locked(block, 0);
669 svc_wake_up(block->b_daemon); 679 svc_wake_up(block->b_daemon);
670 rc = 0; 680 rc = 0;
671 break; 681 break;
672 } 682 }
673 } 683 }
674 unlock_kernel(); 684 spin_unlock(&nlm_blocked_lock);
675 if (rc == -ENOENT) 685 if (rc == -ENOENT)
676 printk(KERN_WARNING "lockd: grant for unknown block\n"); 686 printk(KERN_WARNING "lockd: grant for unknown block\n");
677 return rc; 687 return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
690 struct nlm_block *block; 700 struct nlm_block *block;
691 701
692 dprintk("lockd: VFS unblock notification for block %p\n", fl); 702 dprintk("lockd: VFS unblock notification for block %p\n", fl);
703 spin_lock(&nlm_blocked_lock);
693 list_for_each_entry(block, &nlm_blocked, b_list) { 704 list_for_each_entry(block, &nlm_blocked, b_list) {
694 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 705 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
695 nlmsvc_insert_block(block, 0); 706 nlmsvc_insert_block_locked(block, 0);
707 spin_unlock(&nlm_blocked_lock);
696 svc_wake_up(block->b_daemon); 708 svc_wake_up(block->b_daemon);
697 return; 709 return;
698 } 710 }
699 } 711 }
700 712 spin_unlock(&nlm_blocked_lock);
701 printk(KERN_WARNING "lockd: notification for unknown block!\n"); 713 printk(KERN_WARNING "lockd: notification for unknown block!\n");
702} 714}
703 715
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
803 815
804 dprintk("lockd: GRANT_MSG RPC callback\n"); 816 dprintk("lockd: GRANT_MSG RPC callback\n");
805 817
806 lock_kernel(); 818 spin_lock(&nlm_blocked_lock);
807 /* if the block is not on a list at this point then it has 819 /* if the block is not on a list at this point then it has
808 * been invalidated. Don't try to requeue it. 820 * been invalidated. Don't try to requeue it.
809 * 821 *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
825 /* Call was successful, now wait for client callback */ 837 /* Call was successful, now wait for client callback */
826 timeout = 60 * HZ; 838 timeout = 60 * HZ;
827 } 839 }
828 nlmsvc_insert_block(block, timeout); 840 nlmsvc_insert_block_locked(block, timeout);
829 svc_wake_up(block->b_daemon); 841 svc_wake_up(block->b_daemon);
830out: 842out:
831 unlock_kernel(); 843 spin_unlock(&nlm_blocked_lock);
832} 844}
833 845
846/*
847 * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an
848 * .rpc_release rpc_call_op
849 */
834static void nlmsvc_grant_release(void *data) 850static void nlmsvc_grant_release(void *data)
835{ 851{
836 struct nlm_rqst *call = data; 852 struct nlm_rqst *call = data;
837
838 lock_kernel();
839 nlmsvc_release_block(call->a_block); 853 nlmsvc_release_block(call->a_block);
840 unlock_kernel();
841} 854}
842 855
843static const struct rpc_call_ops nlmsvc_grant_ops = { 856static const struct rpc_call_ops nlmsvc_grant_ops = {
@@ -922,3 +935,32 @@ nlmsvc_retry_blocked(void)
922 935
923 return timeout; 936 return timeout;
924} 937}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..d27aab11f324 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
@@ -81,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
81 return 0; 80 return 0;
82 81
83no_locks: 82no_locks:
84 nlm_release_host(host); 83 nlmsvc_release_host(host);
85 if (error) 84 if (error)
86 return error; 85 return error;
87 return nlm_lck_denied_nolocks; 86 return nlm_lck_denied_nolocks;
@@ -123,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
123 dprintk("lockd: TEST status %d vers %d\n", 122 dprintk("lockd: TEST status %d vers %d\n",
124 ntohl(resp->status), rqstp->rq_vers); 123 ntohl(resp->status), rqstp->rq_vers);
125 124
126 nlm_release_host(host); 125 nlmsvc_release_host(host);
127 nlm_release_file(file); 126 nlm_release_file(file);
128 return rc; 127 return rc;
129} 128}
@@ -165,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
165 else 164 else
166 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
167 166
168 nlm_release_host(host); 167 nlmsvc_release_host(host);
169 nlm_release_file(file); 168 nlm_release_file(file);
170 return rc; 169 return rc;
171} 170}
@@ -195,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
195 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
196 195
197 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
198 nlm_release_host(host); 197 nlmsvc_release_host(host);
199 nlm_release_file(file); 198 nlm_release_file(file);
200 return rpc_success; 199 return rpc_success;
201} 200}
@@ -228,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
228 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
229 228
230 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
231 nlm_release_host(host); 230 nlmsvc_release_host(host);
232 nlm_release_file(file); 231 nlm_release_file(file);
233 return rpc_success; 232 return rpc_success;
234} 233}
@@ -258,11 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
258 -task->tk_status); 257 -task->tk_status);
259} 258}
260 259
260void nlmsvc_release_call(struct nlm_rqst *call)
261{
262 if (!atomic_dec_and_test(&call->a_count))
263 return;
264 nlmsvc_release_host(call->a_host);
265 kfree(call);
266}
267
261static void nlmsvc_callback_release(void *data) 268static void nlmsvc_callback_release(void *data)
262{ 269{
263 lock_kernel(); 270 nlmsvc_release_call(data);
264 nlm_release_call(data);
265 unlock_kernel();
266} 271}
267 272
268static const struct rpc_call_ops nlmsvc_callback_ops = { 273static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -294,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
294 299
295 stat = func(rqstp, argp, &call->a_res); 300 stat = func(rqstp, argp, &call->a_res);
296 if (stat != 0) { 301 if (stat != 0) {
297 nlm_release_call(call); 302 nlmsvc_release_call(call);
298 return stat; 303 return stat;
299 } 304 }
300 305
@@ -369,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
369 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 374 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
370 375
371 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 376 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
372 nlm_release_host(host); 377 nlmsvc_release_host(host);
373 nlm_release_file(file); 378 nlm_release_file(file);
374 return rpc_success; 379 return rpc_success;
375} 380}
@@ -402,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
402 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 407 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
403 408
404 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 409 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
405 nlm_release_host(host); 410 nlmsvc_release_host(host);
406 nlm_release_file(file); 411 nlm_release_file(file);
407 return rpc_success; 412 return rpc_success;
408} 413}
@@ -434,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
434 return rpc_success; 439 return rpc_success;
435 440
436 nlmsvc_free_host_resources(host); 441 nlmsvc_free_host_resources(host);
437 nlm_release_host(host); 442 nlmsvc_release_host(host);
438 return rpc_success; 443 return rpc_success;
439} 444}
440 445
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
170 170
171again: 171again:
172 file->f_locks = 0; 172 file->f_locks = 0;
173 lock_flocks(); /* protects i_flock list */
173 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 174 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
174 if (fl->fl_lmops != &nlmsvc_lock_operations) 175 if (fl->fl_lmops != &nlmsvc_lock_operations)
175 continue; 176 continue;
@@ -181,6 +182,7 @@ again:
181 if (match(lockhost, host)) { 182 if (match(lockhost, host)) {
182 struct file_lock lock = *fl; 183 struct file_lock lock = *fl;
183 184
185 unlock_flocks();
184 lock.fl_type = F_UNLCK; 186 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 187 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 188 lock.fl_end = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
192 goto again; 194 goto again;
193 } 195 }
194 } 196 }
197 unlock_flocks();
195 198
196 return 0; 199 return 0;
197} 200}
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
226 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 229 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
227 return 1; 230 return 1;
228 231
232 lock_flocks();
229 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 233 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
230 if (fl->fl_lmops == &nlmsvc_lock_operations) 234 if (fl->fl_lmops == &nlmsvc_lock_operations) {
235 unlock_flocks();
231 return 1; 236 return 1;
237 }
232 } 238 }
239 unlock_flocks();
233 file->f_locks = 0; 240 file->f_locks = 0;
234 return 0; 241 return 0;
235} 242}
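The svcsubs.c hunks above show the price of trading the BKL for a real lock: the i_flock walk is now bracketed by lock_flocks()/unlock_flocks(), the lock is dropped before the blocking unlock operation, and the walk restarts from scratch ("goto again") because the list may have mutated while it was unlocked. A self-contained userspace sketch of that drop-and-rescan idiom, with a pthread mutex standing in for the spinlock and hypothetical names throughout:

#include <pthread.h>

struct item { struct item *next; int needs_work; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *items;

static void blocking_work(struct item *it)
{
	it->needs_work = 0;	/* stand-in for work that may sleep */
}

static void process_all(void)
{
restart:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = items; it; it = it->next) {
		if (!it->needs_work)
			continue;
		pthread_mutex_unlock(&list_lock);	/* never block under the lock */
		blocking_work(it);
		goto restart;	/* list may have changed meanwhile: rescan */
	}
	pthread_mutex_unlock(&list_lock);
}

Rescanning is O(n) per dropped lock, but it is the simplest correct answer when the traversal cannot hold the lock across a blocking operation.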
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cbb..964666c68a86 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
149} 149}
150 150
151/* 151/*
152 * Encode a lock as part of an NLM call
153 */
154static __be32 *
155nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
156{
157 struct file_lock *fl = &lock->fl;
158 __s32 start, len;
159
160 if (!(p = xdr_encode_string(p, lock->caller))
161 || !(p = nlm_encode_fh(p, &lock->fh))
162 || !(p = nlm_encode_oh(p, &lock->oh)))
163 return NULL;
164
165 if (fl->fl_start > NLM_OFFSET_MAX
166 || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
167 return NULL;
168
169 start = loff_t_to_s32(fl->fl_start);
170 if (fl->fl_end == OFFSET_MAX)
171 len = 0;
172 else
173 len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
174
175 *p++ = htonl(lock->svid);
176 *p++ = htonl(start);
177 *p++ = htonl(len);
178
179 return p;
180}
181
182/*
183 * Encode result of a TEST/TEST_MSG call 152 * Encode result of a TEST/TEST_MSG call
184 */ 153 */
185static __be32 * 154static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
372{ 341{
373 return xdr_ressize_check(rqstp, p); 342 return xdr_ressize_check(rqstp, p);
374} 343}
375
376/*
377 * Now, the client side XDR functions
378 */
379#ifdef NLMCLNT_SUPPORT_SHARES
380static int
381nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
382{
383 return 0;
384}
385#endif
386
387static int
388nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
389{
390 struct nlm_lock *lock = &argp->lock;
391
392 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
393 return -EIO;
394 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
395 if (!(p = nlm_encode_lock(p, lock)))
396 return -EIO;
397 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
398 return 0;
399}
400
401static int
402nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
403{
404 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
405 return -EIO;
406 resp->status = *p++;
407 if (resp->status == nlm_lck_denied) {
408 struct file_lock *fl = &resp->lock.fl;
409 u32 excl;
410 s32 start, len, end;
411
412 memset(&resp->lock, 0, sizeof(resp->lock));
413 locks_init_lock(fl);
414 excl = ntohl(*p++);
415 resp->lock.svid = ntohl(*p++);
416 fl->fl_pid = (pid_t)resp->lock.svid;
417 if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
418 return -EIO;
419
420 fl->fl_flags = FL_POSIX;
421 fl->fl_type = excl? F_WRLCK : F_RDLCK;
422 start = ntohl(*p++);
423 len = ntohl(*p++);
424 end = start + len - 1;
425
426 fl->fl_start = s32_to_loff_t(start);
427 if (len == 0 || end < 0)
428 fl->fl_end = OFFSET_MAX;
429 else
430 fl->fl_end = s32_to_loff_t(end);
431 }
432 return 0;
433}
434
435
436static int
437nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
438{
439 struct nlm_lock *lock = &argp->lock;
440
441 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
442 return -EIO;
443 *p++ = argp->block? xdr_one : xdr_zero;
444 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
445 if (!(p = nlm_encode_lock(p, lock)))
446 return -EIO;
447 *p++ = argp->reclaim? xdr_one : xdr_zero;
448 *p++ = htonl(argp->state);
449 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
450 return 0;
451}
452
453static int
454nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
455{
456 struct nlm_lock *lock = &argp->lock;
457
458 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
459 return -EIO;
460 *p++ = argp->block? xdr_one : xdr_zero;
461 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
462 if (!(p = nlm_encode_lock(p, lock)))
463 return -EIO;
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
465 return 0;
466}
467
468static int
469nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
470{
471 struct nlm_lock *lock = &argp->lock;
472
473 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
474 return -EIO;
475 if (!(p = nlm_encode_lock(p, lock)))
476 return -EIO;
477 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
478 return 0;
479}
480
481static int
482nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
483{
484 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
485 return -EIO;
486 *p++ = resp->status;
487 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
488 return 0;
489}
490
491static int
492nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{
494 if (!(p = nlm_encode_testres(p, resp)))
495 return -EIO;
496 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
497 return 0;
498}
499
500static int
501nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
502{
503 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
504 return -EIO;
505 resp->status = *p++;
506 return 0;
507}
508
509#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
510# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
511#endif
512
513/*
514 * Buffer requirements for NLM
515 */
516#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
521#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
522#define NLM_holder_sz 4+NLM_owner_sz
523
524#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
525#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
526#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz
527#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz
528
529#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz
530#define NLM_res_sz NLM_cookie_sz+1
531#define NLM_norep_sz 0
532
533/*
534 * For NLM, a void procedure really returns nothing
535 */
536#define nlmclt_decode_norep NULL
537
538#define PROC(proc, argtype, restype) \
539[NLMPROC_##proc] = { \
540 .p_proc = NLMPROC_##proc, \
541 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
542 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
543 .p_arglen = NLM_##argtype##_sz, \
544 .p_replen = NLM_##restype##_sz, \
545 .p_statidx = NLMPROC_##proc, \
546 .p_name = #proc, \
547 }
548
549static struct rpc_procinfo nlm_procedures[] = {
550 PROC(TEST, testargs, testres),
551 PROC(LOCK, lockargs, res),
552 PROC(CANCEL, cancargs, res),
553 PROC(UNLOCK, unlockargs, res),
554 PROC(GRANTED, testargs, res),
555 PROC(TEST_MSG, testargs, norep),
556 PROC(LOCK_MSG, lockargs, norep),
557 PROC(CANCEL_MSG, cancargs, norep),
558 PROC(UNLOCK_MSG, unlockargs, norep),
559 PROC(GRANTED_MSG, testargs, norep),
560 PROC(TEST_RES, testres, norep),
561 PROC(LOCK_RES, res, norep),
562 PROC(CANCEL_RES, res, norep),
563 PROC(UNLOCK_RES, res, norep),
564 PROC(GRANTED_RES, res, norep),
565#ifdef NLMCLNT_SUPPORT_SHARES
566 PROC(SHARE, shareargs, shareres),
567 PROC(UNSHARE, shareargs, shareres),
568 PROC(NM_LOCK, lockargs, res),
569 PROC(FREE_ALL, notify, void),
570#endif
571};
572
573static struct rpc_version nlm_version1 = {
574 .number = 1,
575 .nrprocs = 16,
576 .procs = nlm_procedures,
577};
578
579static struct rpc_version nlm_version3 = {
580 .number = 3,
581 .nrprocs = 24,
582 .procs = nlm_procedures,
583};
584
585static struct rpc_version * nlm_versions[] = {
586 [1] = &nlm_version1,
587 [3] = &nlm_version3,
588#ifdef CONFIG_LOCKD_V4
589 [4] = &nlm_version4,
590#endif
591};
592
593static struct rpc_stat nlm_stats;
594
595struct rpc_program nlm_program = {
596 .name = "lockd",
597 .number = NLM_PROGRAM,
598 .nrvers = ARRAY_SIZE(nlm_versions),
599 .version = nlm_versions,
600 .stats = &nlm_stats,
601};
602
603#ifdef RPC_DEBUG
604const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
605{
606 /*
607 * We can get away with a static buffer because we're only
608 * called with BKL held.
609 */
610 static char buf[2*NLM_MAXCOOKIELEN+1];
611 unsigned int i, len = sizeof(buf);
612 char *p = buf;
613
614 len--; /* allow for trailing \0 */
615 if (len < 3)
616 return "???";
617 for (i = 0 ; i < cookie->len ; i++) {
618 if (len < 2) {
619 strcpy(p-3, "...");
620 break;
621 }
622 sprintf(p, "%02x", cookie->data[i]);
623 p += 2;
624 len -= 2;
625 }
626 *p = '\0';
627
628 return buf;
629}
630#endif
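The client-side encoders removed above (presumably superseded by the reworked client XDR routines elsewhere in this series; the server-side decoders stay) also happen to document the NLM v1/v3 wire format for a byte range: a 32-bit offset plus a 32-bit length, where length 0 means "to end of file" and fl_end == OFFSET_MAX is the in-kernel marker for that case. A standalone sketch of just that range conversion, mirroring the deleted nlm_encode_lock() (the real code additionally rejects ranges above NLM_OFFSET_MAX, omitted here):

#include <stdint.h>

#define OFFSET_MAX INT64_MAX	/* "lock to end of file" sentinel, as in the kernel */

/* Convert an inclusive [fl_start, fl_end] range to NLM's (offset, length). */
static void range_to_nlm(int64_t fl_start, int64_t fl_end,
			 uint32_t *offset, uint32_t *length)
{
	*offset = (uint32_t)fl_start;
	if (fl_end == OFFSET_MAX)
		*length = 0;	/* 0 == whole rest of the file */
	else
		*length = (uint32_t)(fl_end - fl_start + 1);
}

nlm4_encode_lock() in xdr4.c, deleted below, was the 64-bit twin of this, emitting start and length with xdr_encode_hyper().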
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145d..dfa4789cd460 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
93 return p + XDR_QUADLEN(f->size); 93 return p + XDR_QUADLEN(f->size);
94} 94}
95 95
96static __be32 *
97nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
98{
99 *p++ = htonl(f->size);
100 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
101 memcpy(p, f->data, f->size);
102 return p + XDR_QUADLEN(f->size);
103}
104
105/* 96/*
106 * Encode and decode owner handle 97 * Encode and decode owner handle
107 */ 98 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112} 103}
113 104
114static __be32 * 105static __be32 *
115nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
116{
117 return xdr_encode_netobj(p, oh);
118}
119
120static __be32 *
121nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) 106nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
122{ 107{
123 struct file_lock *fl = &lock->fl; 108 struct file_lock *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
150} 135}
151 136
152/* 137/*
153 * Encode a lock as part of an NLM call
154 */
155static __be32 *
156nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
157{
158 struct file_lock *fl = &lock->fl;
159 __s64 start, len;
160
161 if (!(p = xdr_encode_string(p, lock->caller))
162 || !(p = nlm4_encode_fh(p, &lock->fh))
163 || !(p = nlm4_encode_oh(p, &lock->oh)))
164 return NULL;
165
166 if (fl->fl_start > NLM4_OFFSET_MAX
167 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
168 return NULL;
169
170 *p++ = htonl(lock->svid);
171
172 start = loff_t_to_s64(fl->fl_start);
173 if (fl->fl_end == OFFSET_MAX)
174 len = 0;
175 else
176 len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
177
178 p = xdr_encode_hyper(p, start);
179 p = xdr_encode_hyper(p, len);
180
181 return p;
182}
183
184/*
185 * Encode result of a TEST/TEST_MSG call 138 * Encode result of a TEST/TEST_MSG call
186 */ 139 */
187static __be32 * 140static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
379{ 332{
380 return xdr_ressize_check(rqstp, p); 333 return xdr_ressize_check(rqstp, p);
381} 334}
382
383/*
384 * Now, the client side XDR functions
385 */
386#ifdef NLMCLNT_SUPPORT_SHARES
387static int
388nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
389{
390 return 0;
391}
392#endif
393
394static int
395nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
396{
397 struct nlm_lock *lock = &argp->lock;
398
399 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
400 return -EIO;
401 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
402 if (!(p = nlm4_encode_lock(p, lock)))
403 return -EIO;
404 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
405 return 0;
406}
407
408static int
409nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
410{
411 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
412 return -EIO;
413 resp->status = *p++;
414 if (resp->status == nlm_lck_denied) {
415 struct file_lock *fl = &resp->lock.fl;
416 u32 excl;
417 __u64 start, len;
418 __s64 end;
419
420 memset(&resp->lock, 0, sizeof(resp->lock));
421 locks_init_lock(fl);
422 excl = ntohl(*p++);
423 resp->lock.svid = ntohl(*p++);
424 fl->fl_pid = (pid_t)resp->lock.svid;
425 if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
426 return -EIO;
427
428 fl->fl_flags = FL_POSIX;
429 fl->fl_type = excl? F_WRLCK : F_RDLCK;
430 p = xdr_decode_hyper(p, &start);
431 p = xdr_decode_hyper(p, &len);
432 end = start + len - 1;
433
434 fl->fl_start = s64_to_loff_t(start);
435 if (len == 0 || end < 0)
436 fl->fl_end = OFFSET_MAX;
437 else
438 fl->fl_end = s64_to_loff_t(end);
439 }
440 return 0;
441}
442
443
444static int
445nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
446{
447 struct nlm_lock *lock = &argp->lock;
448
449 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
450 return -EIO;
451 *p++ = argp->block? xdr_one : xdr_zero;
452 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
453 if (!(p = nlm4_encode_lock(p, lock)))
454 return -EIO;
455 *p++ = argp->reclaim? xdr_one : xdr_zero;
456 *p++ = htonl(argp->state);
457 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
458 return 0;
459}
460
461static int
462nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
463{
464 struct nlm_lock *lock = &argp->lock;
465
466 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
467 return -EIO;
468 *p++ = argp->block? xdr_one : xdr_zero;
469 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
470 if (!(p = nlm4_encode_lock(p, lock)))
471 return -EIO;
472 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
473 return 0;
474}
475
476static int
477nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
478{
479 struct nlm_lock *lock = &argp->lock;
480
481 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
482 return -EIO;
483 if (!(p = nlm4_encode_lock(p, lock)))
484 return -EIO;
485 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
486 return 0;
487}
488
489static int
490nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
491{
492 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
493 return -EIO;
494 *p++ = resp->status;
495 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
496 return 0;
497}
498
499static int
500nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
501{
502 if (!(p = nlm4_encode_testres(p, resp)))
503 return -EIO;
504 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
505 return 0;
506}
507
508static int
509nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510{
511 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
512 return -EIO;
513 resp->status = *p++;
514 return 0;
515}
516
517#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
518# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
519#endif
520
521#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
522# error "NLM host name cannot be larger than NLM's maximum string length!"
523#endif
524
525/*
526 * Buffer requirements for NLM
527 */
528#define NLM4_void_sz 0
529#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
530#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
531#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
532#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
533#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
534#define NLM4_holder_sz 6+NLM4_owner_sz
535
536#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
537#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
538#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz
539#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz
540
541#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz
542#define NLM4_res_sz NLM4_cookie_sz+1
543#define NLM4_norep_sz 0
544
545/*
546 * For NLM, a void procedure really returns nothing
547 */
548#define nlm4clt_decode_norep NULL
549
550#define PROC(proc, argtype, restype) \
551[NLMPROC_##proc] = { \
552 .p_proc = NLMPROC_##proc, \
553 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
554 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
555 .p_arglen = NLM4_##argtype##_sz, \
556 .p_replen = NLM4_##restype##_sz, \
557 .p_statidx = NLMPROC_##proc, \
558 .p_name = #proc, \
559 }
560
561static struct rpc_procinfo nlm4_procedures[] = {
562 PROC(TEST, testargs, testres),
563 PROC(LOCK, lockargs, res),
564 PROC(CANCEL, cancargs, res),
565 PROC(UNLOCK, unlockargs, res),
566 PROC(GRANTED, testargs, res),
567 PROC(TEST_MSG, testargs, norep),
568 PROC(LOCK_MSG, lockargs, norep),
569 PROC(CANCEL_MSG, cancargs, norep),
570 PROC(UNLOCK_MSG, unlockargs, norep),
571 PROC(GRANTED_MSG, testargs, norep),
572 PROC(TEST_RES, testres, norep),
573 PROC(LOCK_RES, res, norep),
574 PROC(CANCEL_RES, res, norep),
575 PROC(UNLOCK_RES, res, norep),
576 PROC(GRANTED_RES, res, norep),
577#ifdef NLMCLNT_SUPPORT_SHARES
578 PROC(SHARE, shareargs, shareres),
579 PROC(UNSHARE, shareargs, shareres),
580 PROC(NM_LOCK, lockargs, res),
581 PROC(FREE_ALL, notify, void),
582#endif
583};
584
585struct rpc_version nlm_version4 = {
586 .number = 4,
587 .nrprocs = 24,
588 .procs = nlm4_procedures,
589};
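The NLM4_*_sz macros kept above size RPC buffers in 32-bit XDR words, not bytes: XDR_QUADLEN() rounds a byte count up to whole words, and every variable-length opaque costs one extra word for its on-wire length field. A quick standalone check of that arithmetic, assuming the kernel's definition of the macro:

#include <assert.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* bytes -> 32-bit words, rounded up */

int main(void)
{
	/* a 5-byte opaque occupies 1 length word + 2 padded data words */
	assert(1 + XDR_QUADLEN(5) == 3);
	/* NLM4 offsets are 64-bit "hypers": two words each on the wire */
	assert(XDR_QUADLEN(8) == 2);
	return 0;
}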
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc048..0f3998291f78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
122#include <linux/module.h> 122#include <linux/module.h>
123#include <linux/security.h> 123#include <linux/security.h>
124#include <linux/slab.h> 124#include <linux/slab.h>
125#include <linux/smp_lock.h>
126#include <linux/syscalls.h> 125#include <linux/syscalls.h>
127#include <linux/time.h> 126#include <linux/time.h>
128#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
@@ -142,14 +141,32 @@ int lease_break_time = 45;
142 141
143static LIST_HEAD(file_lock_list); 142static LIST_HEAD(file_lock_list);
144static LIST_HEAD(blocked_list); 143static LIST_HEAD(blocked_list);
144static DEFINE_SPINLOCK(file_lock_lock);
145
146/*
147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */
150void lock_flocks(void)
151{
152 spin_lock(&file_lock_lock);
153}
154EXPORT_SYMBOL_GPL(lock_flocks);
155
156void unlock_flocks(void)
157{
158 spin_unlock(&file_lock_lock);
159}
160EXPORT_SYMBOL_GPL(unlock_flocks);
145 161
146static struct kmem_cache *filelock_cache __read_mostly; 162static struct kmem_cache *filelock_cache __read_mostly;
147 163
148/* Allocate an empty lock structure. */ 164/* Allocate an empty lock structure. */
149static struct file_lock *locks_alloc_lock(void) 165struct file_lock *locks_alloc_lock(void)
150{ 166{
151 return kmem_cache_alloc(filelock_cache, GFP_KERNEL); 167 return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
152} 168}
169EXPORT_SYMBOL_GPL(locks_alloc_lock);
153 170
154void locks_release_private(struct file_lock *fl) 171void locks_release_private(struct file_lock *fl)
155{ 172{
@@ -168,7 +185,7 @@ void locks_release_private(struct file_lock *fl)
168EXPORT_SYMBOL_GPL(locks_release_private); 185EXPORT_SYMBOL_GPL(locks_release_private);
169 186
170/* Free a lock which is not in use. */ 187/* Free a lock which is not in use. */
171static void locks_free_lock(struct file_lock *fl) 188void locks_free_lock(struct file_lock *fl)
172{ 189{
173 BUG_ON(waitqueue_active(&fl->fl_wait)); 190 BUG_ON(waitqueue_active(&fl->fl_wait));
174 BUG_ON(!list_empty(&fl->fl_block)); 191 BUG_ON(!list_empty(&fl->fl_block));
@@ -177,6 +194,7 @@ static void locks_free_lock(struct file_lock *fl)
177 locks_release_private(fl); 194 locks_release_private(fl);
178 kmem_cache_free(filelock_cache, fl); 195 kmem_cache_free(filelock_cache, fl);
179} 196}
197EXPORT_SYMBOL(locks_free_lock);
180 198
181void locks_init_lock(struct file_lock *fl) 199void locks_init_lock(struct file_lock *fl)
182{ 200{
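Alongside the new file_lock_lock, the hunks above un-static locks_alloc_lock() and locks_free_lock() and export them, so lock managers and filesystems built as modules can allocate struct file_lock themselves. A sketch of what a caller might look like, hypothetical module code with error handling trimmed:

#include <linux/fs.h>

static int example_take_whole_file_lock(struct file *filp)
{
	struct file_lock *fl = locks_alloc_lock();	/* now exported */

	if (!fl)
		return -ENOMEM;
	locks_init_lock(fl);
	fl->fl_file  = filp;
	fl->fl_flags = FL_FLOCK;
	fl->fl_type  = F_RDLCK;
	fl->fl_start = 0;
	fl->fl_end   = OFFSET_MAX;
	/* ... hand fl to the VFS here ... */
	locks_free_lock(fl);
	return 0;
}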
@@ -216,11 +234,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
216 fl->fl_ops->fl_copy_lock(new, fl); 234 fl->fl_ops->fl_copy_lock(new, fl);
217 new->fl_ops = fl->fl_ops; 235 new->fl_ops = fl->fl_ops;
218 } 236 }
219 if (fl->fl_lmops) { 237 if (fl->fl_lmops)
220 if (fl->fl_lmops->fl_copy_lock)
221 fl->fl_lmops->fl_copy_lock(new, fl);
222 new->fl_lmops = fl->fl_lmops; 238 new->fl_lmops = fl->fl_lmops;
223 }
224} 239}
225 240
226/* 241/*
@@ -429,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
429 fl->fl_file->f_owner.signum = 0; 444 fl->fl_file->f_owner.signum = 0;
430} 445}
431 446
432static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
433{
434 return fl->fl_file == try->fl_file;
435}
436
437static const struct lock_manager_operations lease_manager_ops = { 447static const struct lock_manager_operations lease_manager_ops = {
438 .fl_break = lease_break_callback, 448 .fl_break = lease_break_callback,
439 .fl_release_private = lease_release_private_callback, 449 .fl_release_private = lease_release_private_callback,
440 .fl_mylease = lease_mylease_callback,
441 .fl_change = lease_modify, 450 .fl_change = lease_modify,
442}; 451};
443 452
@@ -511,9 +520,9 @@ static void __locks_delete_block(struct file_lock *waiter)
511 */ 520 */
512static void locks_delete_block(struct file_lock *waiter) 521static void locks_delete_block(struct file_lock *waiter)
513{ 522{
514 lock_kernel(); 523 lock_flocks();
515 __locks_delete_block(waiter); 524 __locks_delete_block(waiter);
516 unlock_kernel(); 525 unlock_flocks();
517} 526}
518 527
519/* Insert waiter into blocker's block list. 528/* Insert waiter into blocker's block list.
@@ -644,7 +653,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
644{ 653{
645 struct file_lock *cfl; 654 struct file_lock *cfl;
646 655
647 lock_kernel(); 656 lock_flocks();
648 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { 657 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
649 if (!IS_POSIX(cfl)) 658 if (!IS_POSIX(cfl))
650 continue; 659 continue;
@@ -657,7 +666,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
657 fl->fl_pid = pid_vnr(cfl->fl_nspid); 666 fl->fl_pid = pid_vnr(cfl->fl_nspid);
658 } else 667 } else
659 fl->fl_type = F_UNLCK; 668 fl->fl_type = F_UNLCK;
660 unlock_kernel(); 669 unlock_flocks();
661 return; 670 return;
662} 671}
663EXPORT_SYMBOL(posix_test_lock); 672EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +739,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
730 int error = 0; 739 int error = 0;
731 int found = 0; 740 int found = 0;
732 741
733 lock_kernel(); 742 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
734 if (request->fl_flags & FL_ACCESS)
735 goto find_conflict;
736
737 if (request->fl_type != F_UNLCK) {
738 error = -ENOMEM;
739 new_fl = locks_alloc_lock(); 743 new_fl = locks_alloc_lock();
740 if (new_fl == NULL) 744 if (!new_fl)
741 goto out; 745 return -ENOMEM;
742 error = 0;
743 } 746 }
744 747
748 lock_flocks();
749 if (request->fl_flags & FL_ACCESS)
750 goto find_conflict;
751
745 for_each_lock(inode, before) { 752 for_each_lock(inode, before) {
746 struct file_lock *fl = *before; 753 struct file_lock *fl = *before;
747 if (IS_POSIX(fl)) 754 if (IS_POSIX(fl))
@@ -767,8 +774,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
767 * If a higher-priority process was blocked on the old file lock, 774 * If a higher-priority process was blocked on the old file lock,
768 * give it the opportunity to lock the file. 775 * give it the opportunity to lock the file.
769 */ 776 */
770 if (found) 777 if (found) {
778 unlock_flocks();
771 cond_resched(); 779 cond_resched();
780 lock_flocks();
781 }
772 782
773find_conflict: 783find_conflict:
774 for_each_lock(inode, before) { 784 for_each_lock(inode, before) {
@@ -794,7 +804,7 @@ find_conflict:
794 error = 0; 804 error = 0;
795 805
796out: 806out:
797 unlock_kernel(); 807 unlock_flocks();
798 if (new_fl) 808 if (new_fl)
799 locks_free_lock(new_fl); 809 locks_free_lock(new_fl);
800 return error; 810 return error;
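This flock_lock_file() hunk is the canonical consequence of swapping the (sleepable) BKL for a spinlock: the GFP_KERNEL allocation inside locks_alloc_lock() may sleep, so it must move in front of lock_flocks(), and cond_resched() now needs an explicit unlock/relock bracket. The same allocate-outside-the-lock rule in a self-contained userspace sketch, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static int add_node(void)
{
	/* Allocate before taking the lock: malloc (like GFP_KERNEL in
	 * the kernel) may block, and blocking while holding a spinlock
	 * is forbidden. */
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	pthread_mutex_lock(&lock);
	n->next = head;		/* critical section: list update only */
	head = n;
	pthread_mutex_unlock(&lock);
	return 0;
}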
@@ -823,7 +833,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
823 new_fl2 = locks_alloc_lock(); 833 new_fl2 = locks_alloc_lock();
824 } 834 }
825 835
826 lock_kernel(); 836 lock_flocks();
827 if (request->fl_type != F_UNLCK) { 837 if (request->fl_type != F_UNLCK) {
828 for_each_lock(inode, before) { 838 for_each_lock(inode, before) {
829 fl = *before; 839 fl = *before;
@@ -991,7 +1001,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
991 locks_wake_up_blocks(left); 1001 locks_wake_up_blocks(left);
992 } 1002 }
993 out: 1003 out:
994 unlock_kernel(); 1004 unlock_flocks();
995 /* 1005 /*
996 * Free any unused locks. 1006 * Free any unused locks.
997 */ 1007 */
@@ -1066,14 +1076,14 @@ int locks_mandatory_locked(struct inode *inode)
1066 /* 1076 /*
1067 * Search the lock list for this inode for any POSIX locks. 1077 * Search the lock list for this inode for any POSIX locks.
1068 */ 1078 */
1069 lock_kernel(); 1079 lock_flocks();
1070 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1080 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1071 if (!IS_POSIX(fl)) 1081 if (!IS_POSIX(fl))
1072 continue; 1082 continue;
1073 if (fl->fl_owner != owner) 1083 if (fl->fl_owner != owner)
1074 break; 1084 break;
1075 } 1085 }
1076 unlock_kernel(); 1086 unlock_flocks();
1077 return fl ? -EAGAIN : 0; 1087 return fl ? -EAGAIN : 0;
1078} 1088}
1079 1089
@@ -1186,7 +1196,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1186 1196
1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); 1197 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1188 1198
1189 lock_kernel(); 1199 lock_flocks();
1190 1200
1191 time_out_leases(inode); 1201 time_out_leases(inode);
1192 1202
@@ -1247,8 +1257,10 @@ restart:
1247 break_time++; 1257 break_time++;
1248 } 1258 }
1249 locks_insert_block(flock, new_fl); 1259 locks_insert_block(flock, new_fl);
1260 unlock_flocks();
1250 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1261 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1251 !new_fl->fl_next, break_time); 1262 !new_fl->fl_next, break_time);
1263 lock_flocks();
1252 __locks_delete_block(new_fl); 1264 __locks_delete_block(new_fl);
1253 if (error >= 0) { 1265 if (error >= 0) {
1254 if (error == 0) 1266 if (error == 0)
@@ -1263,7 +1275,7 @@ restart:
1263 } 1275 }
1264 1276
1265out: 1277out:
1266 unlock_kernel(); 1278 unlock_flocks();
1267 if (!IS_ERR(new_fl)) 1279 if (!IS_ERR(new_fl))
1268 locks_free_lock(new_fl); 1280 locks_free_lock(new_fl);
1269 return error; 1281 return error;
@@ -1319,7 +1331,7 @@ int fcntl_getlease(struct file *filp)
1319 struct file_lock *fl; 1331 struct file_lock *fl;
1320 int type = F_UNLCK; 1332 int type = F_UNLCK;
1321 1333
1322 lock_kernel(); 1334 lock_flocks();
1323 time_out_leases(filp->f_path.dentry->d_inode); 1335 time_out_leases(filp->f_path.dentry->d_inode);
1324 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); 1336 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
1325 fl = fl->fl_next) { 1337 fl = fl->fl_next) {
@@ -1328,7 +1340,7 @@ int fcntl_getlease(struct file *filp)
1328 break; 1340 break;
1329 } 1341 }
1330 } 1342 }
1331 unlock_kernel(); 1343 unlock_flocks();
1332 return type; 1344 return type;
1333} 1345}
1334 1346
@@ -1341,41 +1353,37 @@ int fcntl_getlease(struct file *filp)
1341 * The (input) flp->fl_lmops->fl_break function is required 1353 * The (input) flp->fl_lmops->fl_break function is required
1342 * by break_lease(). 1354 * by break_lease().
1343 * 1355 *
1344 * Called with kernel lock held. 1356 * Called with file_lock_lock held.
1345 */ 1357 */
1346int generic_setlease(struct file *filp, long arg, struct file_lock **flp) 1358int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1347{ 1359{
1348 struct file_lock *fl, **before, **my_before = NULL, *lease; 1360 struct file_lock *fl, **before, **my_before = NULL, *lease;
1349 struct file_lock *new_fl = NULL;
1350 struct dentry *dentry = filp->f_path.dentry; 1361 struct dentry *dentry = filp->f_path.dentry;
1351 struct inode *inode = dentry->d_inode; 1362 struct inode *inode = dentry->d_inode;
1352 int error, rdlease_count = 0, wrlease_count = 0; 1363 int error, rdlease_count = 0, wrlease_count = 0;
1353 1364
1365 lease = *flp;
1366
1367 error = -EACCES;
1354 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) 1368 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
1355 return -EACCES; 1369 goto out;
1370 error = -EINVAL;
1356 if (!S_ISREG(inode->i_mode)) 1371 if (!S_ISREG(inode->i_mode))
1357 return -EINVAL; 1372 goto out;
1358 error = security_file_lock(filp, arg); 1373 error = security_file_lock(filp, arg);
1359 if (error) 1374 if (error)
1360 return error; 1375 goto out;
1361 1376
1362 time_out_leases(inode); 1377 time_out_leases(inode);
1363 1378
1364 BUG_ON(!(*flp)->fl_lmops->fl_break); 1379 BUG_ON(!(*flp)->fl_lmops->fl_break);
1365 1380
1366 lease = *flp;
1367
1368 if (arg != F_UNLCK) { 1381 if (arg != F_UNLCK) {
1369 error = -ENOMEM;
1370 new_fl = locks_alloc_lock();
1371 if (new_fl == NULL)
1372 goto out;
1373
1374 error = -EAGAIN; 1382 error = -EAGAIN;
1375 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1383 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1376 goto out; 1384 goto out;
1377 if ((arg == F_WRLCK) 1385 if ((arg == F_WRLCK)
1378 && ((atomic_read(&dentry->d_count) > 1) 1386 && ((dentry->d_count > 1)
1379 || (atomic_read(&inode->i_count) > 1))) 1387 || (atomic_read(&inode->i_count) > 1)))
1380 goto out; 1388 goto out;
1381 } 1389 }
@@ -1391,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1391 for (before = &inode->i_flock; 1399 for (before = &inode->i_flock;
1392 ((fl = *before) != NULL) && IS_LEASE(fl); 1400 ((fl = *before) != NULL) && IS_LEASE(fl);
1393 before = &fl->fl_next) { 1401 before = &fl->fl_next) {
1394 if (lease->fl_lmops->fl_mylease(fl, lease)) 1402 if (fl->fl_file == filp)
1395 my_before = before; 1403 my_before = before;
1396 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1404 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
1397 /* 1405 /*
@@ -1410,12 +1418,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1410 goto out; 1418 goto out;
1411 1419
1412 if (my_before != NULL) { 1420 if (my_before != NULL) {
1413 *flp = *my_before;
1414 error = lease->fl_lmops->fl_change(my_before, arg); 1421 error = lease->fl_lmops->fl_change(my_before, arg);
1422 if (!error)
1423 *flp = *my_before;
1415 goto out; 1424 goto out;
1416 } 1425 }
1417 1426
1418 error = 0;
1419 if (arg == F_UNLCK) 1427 if (arg == F_UNLCK)
1420 goto out; 1428 goto out;
1421 1429
@@ -1423,20 +1431,23 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1423 if (!leases_enable) 1431 if (!leases_enable)
1424 goto out; 1432 goto out;
1425 1433
1426 locks_copy_lock(new_fl, lease); 1434 locks_insert_lock(before, lease);
1427 locks_insert_lock(before, new_fl);
1428
1429 *flp = new_fl;
1430 return 0; 1435 return 0;
1431 1436
1432out: 1437out:
1433 if (new_fl != NULL)
1434 locks_free_lock(new_fl);
1435 return error; 1438 return error;
1436} 1439}
1437EXPORT_SYMBOL(generic_setlease); 1440EXPORT_SYMBOL(generic_setlease);
1438 1441
1439 /** 1442static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1443{
1444 if (filp->f_op && filp->f_op->setlease)
1445 return filp->f_op->setlease(filp, arg, lease);
1446 else
1447 return generic_setlease(filp, arg, lease);
1448}
1449
1450/**
1440 * vfs_setlease - sets a lease on an open file 1451 * vfs_setlease - sets a lease on an open file
1441 * @filp: file pointer 1452 * @filp: file pointer
1442 * @arg: type of lease to obtain 1453 * @arg: type of lease to obtain
@@ -1467,17 +1478,67 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1467{ 1478{
1468 int error; 1479 int error;
1469 1480
1470 lock_kernel(); 1481 lock_flocks();
1471 if (filp->f_op && filp->f_op->setlease) 1482 error = __vfs_setlease(filp, arg, lease);
1472 error = filp->f_op->setlease(filp, arg, lease); 1483 unlock_flocks();
1473 else
1474 error = generic_setlease(filp, arg, lease);
1475 unlock_kernel();
1476 1484
1477 return error; 1485 return error;
1478} 1486}
1479EXPORT_SYMBOL_GPL(vfs_setlease); 1487EXPORT_SYMBOL_GPL(vfs_setlease);
1480 1488
1489static int do_fcntl_delete_lease(struct file *filp)
1490{
1491 struct file_lock fl, *flp = &fl;
1492
1493 lease_init(filp, F_UNLCK, flp);
1494
1495 return vfs_setlease(filp, F_UNLCK, &flp);
1496}
1497
1498static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1499{
1500 struct file_lock *fl, *ret;
1501 struct fasync_struct *new;
1502 int error;
1503
1504 fl = lease_alloc(filp, arg);
1505 if (IS_ERR(fl))
1506 return PTR_ERR(fl);
1507
1508 new = fasync_alloc();
1509 if (!new) {
1510 locks_free_lock(fl);
1511 return -ENOMEM;
1512 }
1513 ret = fl;
1514 lock_flocks();
1515 error = __vfs_setlease(filp, arg, &ret);
1516 if (error) {
1517 unlock_flocks();
1518 locks_free_lock(fl);
1519 goto out_free_fasync;
1520 }
1521 if (ret != fl)
1522 locks_free_lock(fl);
1523
1524 /*
1525 * fasync_insert_entry() returns the old entry if any.
1526 * If there was no old entry, then it used 'new' and
1527 * inserted it into the fasync list. Clear new so that
1528 * we don't release it here.
1529 */
1530 if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
1531 new = NULL;
1532
1533 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1534 unlock_flocks();
1535
1536out_free_fasync:
1537 if (new)
1538 fasync_free(new);
1539 return error;
1540}
1541
1481/** 1542/**
1482 * fcntl_setlease - sets a lease on an open file 1543 * fcntl_setlease - sets a lease on an open file
1483 * @fd: open file descriptor 1544 * @fd: open file descriptor
@@ -1490,34 +1551,9 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
1490 */ 1551 */
1491int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1552int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1492{ 1553{
1493 struct file_lock fl, *flp = &fl; 1554 if (arg == F_UNLCK)
1494 struct inode *inode = filp->f_path.dentry->d_inode; 1555 return do_fcntl_delete_lease(filp);
1495 int error; 1556 return do_fcntl_add_lease(fd, filp, arg);
1496
1497 locks_init_lock(&fl);
1498 error = lease_init(filp, arg, &fl);
1499 if (error)
1500 return error;
1501
1502 lock_kernel();
1503
1504 error = vfs_setlease(filp, arg, &flp);
1505 if (error || arg == F_UNLCK)
1506 goto out_unlock;
1507
1508 error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
1509 if (error < 0) {
1510 /* remove lease just inserted by setlease */
1511 flp->fl_type = F_UNLCK | F_INPROGRESS;
1512 flp->fl_break_time = jiffies - 10;
1513 time_out_leases(inode);
1514 goto out_unlock;
1515 }
1516
1517 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1518out_unlock:
1519 unlock_kernel();
1520 return error;
1521} 1557}
1522 1558
1523/** 1559/**
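do_fcntl_add_lease() above applies the same discipline to F_SETLEASE: both the lease and the fasync entry are allocated before lock_flocks(), fasync_insert_entry() under the lock either consumes the preallocation or returns the existing entry, and an unconsumed preallocation is freed after unlocking. A standalone sketch of that optimistic-preallocation shape (hypothetical table and names):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry { int key; struct entry *next; } *table;

/* Insert key if absent; returns the existing entry if already present,
 * NULL if 'new' was consumed. Must be called with the lock held. */
static struct entry *insert_entry(int key, struct entry *new)
{
	for (struct entry *e = table; e; e = e->next)
		if (e->key == key)
			return e;
	new->key = key;
	new->next = table;
	table = new;
	return NULL;
}

static int add_key(int key)
{
	struct entry *new = malloc(sizeof(*new));	/* preallocate outside the lock */

	if (!new)
		return -1;
	pthread_mutex_lock(&lock);
	struct entry *old = insert_entry(key, new);
	pthread_mutex_unlock(&lock);
	if (old)		/* preallocation not consumed: give it back */
		free(new);
	return 0;
}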
@@ -2020,7 +2056,7 @@ void locks_remove_flock(struct file *filp)
2020 fl.fl_ops->fl_release_private(&fl); 2056 fl.fl_ops->fl_release_private(&fl);
2021 } 2057 }
2022 2058
2023 lock_kernel(); 2059 lock_flocks();
2024 before = &inode->i_flock; 2060 before = &inode->i_flock;
2025 2061
2026 while ((fl = *before) != NULL) { 2062 while ((fl = *before) != NULL) {
@@ -2038,7 +2074,7 @@ void locks_remove_flock(struct file *filp)
2038 } 2074 }
2039 before = &fl->fl_next; 2075 before = &fl->fl_next;
2040 } 2076 }
2041 unlock_kernel(); 2077 unlock_flocks();
2042} 2078}
2043 2079
2044/** 2080/**
@@ -2053,12 +2089,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
2053{ 2089{
2054 int status = 0; 2090 int status = 0;
2055 2091
2056 lock_kernel(); 2092 lock_flocks();
2057 if (waiter->fl_next) 2093 if (waiter->fl_next)
2058 __locks_delete_block(waiter); 2094 __locks_delete_block(waiter);
2059 else 2095 else
2060 status = -ENOENT; 2096 status = -ENOENT;
2061 unlock_kernel(); 2097 unlock_flocks();
2062 return status; 2098 return status;
2063} 2099}
2064 2100
@@ -2085,7 +2121,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
2085#include <linux/seq_file.h> 2121#include <linux/seq_file.h>
2086 2122
2087static void lock_get_status(struct seq_file *f, struct file_lock *fl, 2123static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2088 int id, char *pfx) 2124 loff_t id, char *pfx)
2089{ 2125{
2090 struct inode *inode = NULL; 2126 struct inode *inode = NULL;
2091 unsigned int fl_pid; 2127 unsigned int fl_pid;
@@ -2098,7 +2134,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2098 if (fl->fl_file != NULL) 2134 if (fl->fl_file != NULL)
2099 inode = fl->fl_file->f_path.dentry->d_inode; 2135 inode = fl->fl_file->f_path.dentry->d_inode;
2100 2136
2101 seq_printf(f, "%d:%s ", id, pfx); 2137 seq_printf(f, "%lld:%s ", id, pfx);
2102 if (IS_POSIX(fl)) { 2138 if (IS_POSIX(fl)) {
2103 seq_printf(f, "%6s %s ", 2139 seq_printf(f, "%6s %s ",
2104 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2140 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2197,33 @@ static int locks_show(struct seq_file *f, void *v)
2161 2197
2162 fl = list_entry(v, struct file_lock, fl_link); 2198 fl = list_entry(v, struct file_lock, fl_link);
2163 2199
2164 lock_get_status(f, fl, (long)f->private, ""); 2200 lock_get_status(f, fl, *((loff_t *)f->private), "");
2165 2201
2166 list_for_each_entry(bfl, &fl->fl_block, fl_block) 2202 list_for_each_entry(bfl, &fl->fl_block, fl_block)
2167 lock_get_status(f, bfl, (long)f->private, " ->"); 2203 lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
2168 2204
2169 f->private++;
2170 return 0; 2205 return 0;
2171} 2206}
2172 2207
2173static void *locks_start(struct seq_file *f, loff_t *pos) 2208static void *locks_start(struct seq_file *f, loff_t *pos)
2174{ 2209{
2175 lock_kernel(); 2210 loff_t *p = f->private;
2176 f->private = (void *)1; 2211
2212 lock_flocks();
2213 *p = (*pos + 1);
2177 return seq_list_start(&file_lock_list, *pos); 2214 return seq_list_start(&file_lock_list, *pos);
2178} 2215}
2179 2216
2180static void *locks_next(struct seq_file *f, void *v, loff_t *pos) 2217static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2181{ 2218{
2219 loff_t *p = f->private;
2220 ++*p;
2182 return seq_list_next(v, &file_lock_list, pos); 2221 return seq_list_next(v, &file_lock_list, pos);
2183} 2222}
2184 2223
2185static void locks_stop(struct seq_file *f, void *v) 2224static void locks_stop(struct seq_file *f, void *v)
2186{ 2225{
2187 unlock_kernel(); 2226 unlock_flocks();
2188} 2227}
2189 2228
2190static const struct seq_operations locks_seq_operations = { 2229static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2235,14 @@ static const struct seq_operations locks_seq_operations = {
2196 2235
2197static int locks_open(struct inode *inode, struct file *filp) 2236static int locks_open(struct inode *inode, struct file *filp)
2198{ 2237{
2199 return seq_open(filp, &locks_seq_operations); 2238 return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
2200} 2239}
2201 2240
2202static const struct file_operations proc_locks_operations = { 2241static const struct file_operations proc_locks_operations = {
2203 .open = locks_open, 2242 .open = locks_open,
2204 .read = seq_read, 2243 .read = seq_read,
2205 .llseek = seq_lseek, 2244 .llseek = seq_lseek,
2206 .release = seq_release, 2245 .release = seq_release_private,
2207}; 2246};
2208 2247
2209static int __init proc_locks_init(void) 2248static int __init proc_locks_init(void)
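The /proc/locks iterator used to smuggle its position counter through f->private as a cast integer; the hunk above gives it a real per-open loff_t via seq_open_private(), which allocates a zeroed buffer of the requested size and which seq_release_private() later frees. The minimal pairing, sketched for a hypothetical proc file (iterator callbacks omitted):

#include <linux/fs.h>
#include <linux/seq_file.h>

static const struct seq_operations example_seq_ops;	/* start/next/stop/show elided */

static int example_open(struct inode *inode, struct file *filp)
{
	/* allocates sizeof(loff_t) zeroed bytes as seq_file->private */
	return seq_open_private(filp, &example_seq_ops, sizeof(loff_t));
}

static const struct file_operations example_fops = {
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,	/* frees the private buffer */
};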
@@ -2231,7 +2270,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2231{ 2270{
2232 struct file_lock *fl; 2271 struct file_lock *fl;
2233 int result = 1; 2272 int result = 1;
2234 lock_kernel(); 2273 lock_flocks();
2235 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2274 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2236 if (IS_POSIX(fl)) { 2275 if (IS_POSIX(fl)) {
2237 if (fl->fl_type == F_RDLCK) 2276 if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2287,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2248 result = 0; 2287 result = 0;
2249 break; 2288 break;
2250 } 2289 }
2251 unlock_kernel(); 2290 unlock_flocks();
2252 return result; 2291 return result;
2253} 2292}
2254 2293
@@ -2271,7 +2310,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2271{ 2310{
2272 struct file_lock *fl; 2311 struct file_lock *fl;
2273 int result = 1; 2312 int result = 1;
2274 lock_kernel(); 2313 lock_flocks();
2275 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2314 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2276 if (IS_POSIX(fl)) { 2315 if (IS_POSIX(fl)) {
2277 if ((fl->fl_end < start) || (fl->fl_start > (start + len))) 2316 if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2325,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2286 result = 0; 2325 result = 0;
2287 break; 2326 break;
2288 } 2327 }
2289 unlock_kernel(); 2328 unlock_flocks();
2290 return result; 2329 return result;
2291} 2330}
2292 2331
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a3040..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
298 return sync_request(page, bdev, WRITE); 298 return sync_request(page, bdev, WRITE);
299} 299}
300 300
301static void bdev_put_device(struct super_block *sb) 301static void bdev_put_device(struct logfs_super *s)
302{ 302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
304} 304}
305 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs) 306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,20 +320,24 @@ static const struct logfs_device_ops bd_devops = {
320 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
321}; 321};
322 322
323int logfs_get_sb_bdev(struct file_system_type *type, int flags, 323int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
324 const char *devname, struct vfsmount *mnt) 324 const char *devname)
325{ 325{
326 struct block_device *bdev; 326 struct block_device *bdev;
327 327
328 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); 328 bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
329 type);
329 if (IS_ERR(bdev)) 330 if (IS_ERR(bdev))
330 return PTR_ERR(bdev); 331 return PTR_ERR(bdev);
331 332
332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { 333 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 int mtdnr = MINOR(bdev->bd_dev); 334 int mtdnr = MINOR(bdev->bd_dev);
334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 335 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
335 return logfs_get_sb_mtd(type, flags, mtdnr, mnt); 336 return logfs_get_sb_mtd(p, mtdnr);
336 } 337 }
337 338
338 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); 339 p->s_bdev = bdev;
340 p->s_mtd = NULL;
341 p->s_devops = &bd_devops;
342 return 0;
339} 343}
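open_bdev_exclusive()/close_bdev_exclusive() are replaced above by blkdev_get_by_path()/blkdev_put(): exclusivity is now requested explicitly with FMODE_EXCL, and the holder cookie (logfs passes its file_system_type) identifies the claim. A sketch of the open/use/close pairing for a hypothetical caller:

#include <linux/fs.h>
#include <linux/err.h>

static int example_claim_device(const char *path, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);	/* holder identifies the exclusive claim */
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* ... use the device ... */

	/* the mode must match the get, including FMODE_EXCL */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}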
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4b..7466e9dcc8c5 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
231} 231}
232 232
233static void mtd_put_device(struct super_block *sb) 233static void mtd_put_device(struct logfs_super *s)
234{ 234{
235 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(s->s_mtd);
236} 236}
237 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs) 238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
265 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
266}; 266};
267 267
268int logfs_get_sb_mtd(struct file_system_type *type, int flags, 268int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
269 int mtdnr, struct vfsmount *mnt)
270{ 269{
271 struct mtd_info *mtd; 270 struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
272 const struct logfs_device_ops *devops = &mtd_devops;
273
274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd)) 271 if (IS_ERR(mtd))
276 return PTR_ERR(mtd); 272 return PTR_ERR(mtd);
277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 273
274 s->s_bdev = NULL;
275 s->s_mtd = mtd;
276 s->s_devops = &mtd_devops;
277 return 0;
278} 278}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9777eb5b5522..f9ddf0c388c8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
555 return __logfs_create(dir, dentry, inode, target, destlen); 555 return __logfs_create(dir, dentry, inode, target, destlen);
556} 556}
557 557
558static int logfs_permission(struct inode *inode, int mask) 558static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
559{ 559{
560 return generic_permission(inode, mask, NULL); 560 if (flags & IPERM_FLAG_RCU)
561 return -ECHILD;
562 return generic_permission(inode, mask, flags, NULL);
561} 563}
562 564
563static int logfs_link(struct dentry *old_dentry, struct inode *dir, 565static int logfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -569,7 +571,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
569 return -EMLINK; 571 return -EMLINK;
570 572
571 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 573 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
572 atomic_inc(&inode->i_count); 574 ihold(inode);
573 inode->i_nlink++; 575 inode->i_nlink++;
574 mark_inode_dirty_sync(inode); 576 mark_inode_dirty_sync(inode);
575 577
@@ -827,4 +829,5 @@ const struct file_operations logfs_dir_fops = {
827 .unlocked_ioctl = logfs_ioctl, 829 .unlocked_ioctl = logfs_ioctl,
828 .readdir = logfs_readdir, 830 .readdir = logfs_readdir,
829 .read = generic_read_dir, 831 .read = generic_read_dir,
832 .llseek = default_llseek,
830}; 833};
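logfs_permission() above is the minimal response to the new RCU path walk: a ->permission() callback invoked with IPERM_FLAG_RCU may not sleep or take references, and returning -ECHILD asks the VFS to retry the lookup in ref-walk mode, where the flag is clear. The caller side of that protocol, sketched for illustration only (this is not the actual VFS code):

#include <linux/fs.h>

static int example_check_access(struct inode *inode, int mask)
{
	/* try the lockless (RCU-walk) variant first */
	int err = inode->i_op->permission(inode, mask, IPERM_FLAG_RCU);

	if (err == -ECHILD)	/* callback opted out: retry in ref-walk mode */
		err = inode->i_op->permission(inode, mask, 0);
	return err;
}

The same hunk also switches atomic_inc(&inode->i_count) to ihold(), the accessor introduced for taking an extra inode reference.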
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098f..03b8c240aeda 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
141 return __logfs_iget(sb, ino); 141 return __logfs_iget(sb, ino);
142} 142}
143 143
144static void logfs_i_callback(struct rcu_head *head)
145{
146 struct inode *inode = container_of(head, struct inode, i_rcu);
147 INIT_LIST_HEAD(&inode->i_dentry);
148 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
149}
150
144static void __logfs_destroy_inode(struct inode *inode) 151static void __logfs_destroy_inode(struct inode *inode)
145{ 152{
146 struct logfs_inode *li = logfs_inode(inode); 153 struct logfs_inode *li = logfs_inode(inode);
147 154
148 BUG_ON(li->li_block); 155 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list); 156 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li); 157 call_rcu(&inode->i_rcu, logfs_i_callback);
151} 158}
152 159
153static void logfs_destroy_inode(struct inode *inode) 160static void logfs_destroy_inode(struct inode *inode)
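Freeing the inode through call_rcu() rather than kmem_cache_free() defers the free past an RCU grace period, so lockless (RCU-walk) lookups that may still be dereferencing the inode never touch freed memory; i_dentry is reinitialized in the callback because i_rcu shares its storage in a union. The same shape, sketched for a hypothetical filesystem (the minix hunk further down gets the identical treatment):

#include <linux/fs.h>
#include <linux/slab.h>

static struct kmem_cache *example_inode_cache;

static void example_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlays i_dentry */
	kmem_cache_free(example_inode_cache, inode);
}

static void example_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, example_i_callback);	/* free after grace period */
}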
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e135..9da29706f91c 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
828 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
829 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
830 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); 831 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
832 BUG_ON(err); /* mempool should prevent this */ 832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1); 833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */ 834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b8786264d243..57afd4a6fabb 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
136 int (*erase_segment)(struct logfs_area *area); 136 int (*erase_segment)(struct logfs_area *area);
137}; 137};
138 138
139struct logfs_super; /* forward */
139/** 140/**
140 * struct logfs_device_ops - device access operations 141 * struct logfs_device_ops - device access operations
141 * 142 *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
156 int ensure_write); 157 int ensure_write);
157 int (*can_write_buf)(struct super_block *sb, u64 ofs); 158 int (*can_write_buf)(struct super_block *sb, u64 ofs);
158 void (*sync)(struct super_block *sb); 159 void (*sync)(struct super_block *sb);
159 void (*put_device)(struct super_block *sb); 160 void (*put_device)(struct logfs_super *s);
160}; 161};
161 162
162/** 163/**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
471 472
472/* dev_bdev.c */ 473/* dev_bdev.c */
473#ifdef CONFIG_BLOCK 474#ifdef CONFIG_BLOCK
474int logfs_get_sb_bdev(struct file_system_type *type, int flags, 475int logfs_get_sb_bdev(struct logfs_super *s,
475 const char *devname, struct vfsmount *mnt); 476 struct file_system_type *type,
477 const char *devname);
476#else 478#else
477static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, 479static inline int logfs_get_sb_bdev(struct logfs_super *s,
478 const char *devname, struct vfsmount *mnt) 480 struct file_system_type *type,
481 const char *devname)
479{ 482{
480 return -ENODEV; 483 return -ENODEV;
481} 484}
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
483 486
484/* dev_mtd.c */ 487/* dev_mtd.c */
485#ifdef CONFIG_MTD 488#ifdef CONFIG_MTD
486int logfs_get_sb_mtd(struct file_system_type *type, int flags, 489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
487 int mtdnr, struct vfsmount *mnt);
488#else 490#else
489static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, 491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
490 int mtdnr, struct vfsmount *mnt)
491{ 492{
492 return -ENODEV; 493 return -ENODEV;
493} 494}
@@ -619,9 +620,6 @@ void emergency_read_end(struct page *page);
619void logfs_crash_dump(struct super_block *sb); 620void logfs_crash_dump(struct super_block *sb);
620void *memchr_inv(const void *s, int c, size_t n); 621void *memchr_inv(const void *s, int c, size_t n);
621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); 622int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
622int logfs_get_sb_device(struct file_system_type *type, int flags,
623 struct mtd_info *mtd, struct block_device *bdev,
624 const struct logfs_device_ops *devops, struct vfsmount *mnt);
625int logfs_check_ds(struct logfs_disk_super *ds); 623int logfs_check_ds(struct logfs_disk_super *ds);
626int logfs_write_sb(struct super_block *sb); 624int logfs_write_sb(struct super_block *sb);
627 625
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e188..ee99a9f5dfd3 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
1994 1994
1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */ 1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */
1996 err = logfs_write_buf(master_inode, page, 0); 1996 err = logfs_write_buf(master_inode, page, 0);
1997 if (err)
1998 move_page_to_inode(inode, page);
1999
1997 logfs_put_write_page(page); 2000 logfs_put_write_page(page);
1998 return err; 2001 return err;
1999} 2002}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5336155c5d81..33435e4b14d2 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
325 return 0; 325 return 0;
326} 326}
327 327
328static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) 328static int logfs_get_sb_final(struct super_block *sb)
329{ 329{
330 struct logfs_super *super = logfs_super(sb); 330 struct logfs_super *super = logfs_super(sb);
331 struct inode *rootdir; 331 struct inode *rootdir;
@@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
356 } 356 }
357 357
358 log_super("LogFS: Finished mounting\n"); 358 log_super("LogFS: Finished mounting\n");
359 simple_set_mnt(mnt, sb);
360 return 0; 359 return 0;
361 360
362fail: 361fail:
@@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
529 logfs_cleanup_rw(sb); 528 logfs_cleanup_rw(sb);
530 if (super->s_erase_page) 529 if (super->s_erase_page)
531 __free_page(super->s_erase_page); 530 __free_page(super->s_erase_page);
532 super->s_devops->put_device(sb); 531 super->s_devops->put_device(super);
533 logfs_mempool_destroy(super->s_btree_pool); 532 logfs_mempool_destroy(super->s_btree_pool);
534 logfs_mempool_destroy(super->s_alias_pool); 533 logfs_mempool_destroy(super->s_alias_pool);
535 kfree(super); 534 kfree(super);
536 log_super("LogFS: Finished unmounting\n"); 535 log_super("LogFS: Finished unmounting\n");
537} 536}
538 537
539int logfs_get_sb_device(struct file_system_type *type, int flags, 538static struct dentry *logfs_get_sb_device(struct logfs_super *super,
540 struct mtd_info *mtd, struct block_device *bdev, 539 struct file_system_type *type, int flags)
541 const struct logfs_device_ops *devops, struct vfsmount *mnt)
542{ 540{
543 struct logfs_super *super;
544 struct super_block *sb; 541 struct super_block *sb;
545 int err = -ENOMEM; 542 int err = -ENOMEM;
546 static int mount_count; 543 static int mount_count;
547 544
548 log_super("LogFS: Start mount %x\n", mount_count++); 545 log_super("LogFS: Start mount %x\n", mount_count++);
549 super = kzalloc(sizeof(*super), GFP_KERNEL);
550 if (!super)
551 goto err0;
552 546
553 super->s_mtd = mtd;
554 super->s_bdev = bdev;
555 err = -EINVAL; 547 err = -EINVAL;
556 sb = sget(type, logfs_sb_test, logfs_sb_set, super); 548 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
557 if (IS_ERR(sb)) 549 if (IS_ERR(sb)) {
558 goto err0; 550 super->s_devops->put_device(super);
551 kfree(super);
552 return ERR_CAST(sb);
553 }
559 554
560 if (sb->s_root) { 555 if (sb->s_root) {
561 /* Device is already in use */ 556 /* Device is already in use */
562 err = 0; 557 super->s_devops->put_device(super);
563 simple_set_mnt(mnt, sb); 558 kfree(super);
564 goto err0; 559 return dget(sb->s_root);
565 } 560 }
566 561
567 super->s_devops = devops;
568
569 /* 562 /*
570 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache 563 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
571 * only covers 16TB and the upper 8TB are used for indirect blocks. 564 * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
581 goto err1; 574 goto err1;
582 575
583 sb->s_flags |= MS_ACTIVE; 576 sb->s_flags |= MS_ACTIVE;
584 err = logfs_get_sb_final(sb, mnt); 577 err = logfs_get_sb_final(sb);
585 if (err) 578 if (err) {
586 deactivate_locked_super(sb); 579 deactivate_locked_super(sb);
587 return err; 580 return ERR_PTR(err);
581 }
582 return dget(sb->s_root);
588 583
589err1: 584err1:
590 /* no ->s_root, no ->put_super() */ 585 /* no ->s_root, no ->put_super() */
@@ -592,37 +587,45 @@ err1:
592 iput(super->s_segfile_inode); 587 iput(super->s_segfile_inode);
593 iput(super->s_mapping_inode); 588 iput(super->s_mapping_inode);
594 deactivate_locked_super(sb); 589 deactivate_locked_super(sb);
595 return err; 590 return ERR_PTR(err);
596err0:
597 kfree(super);
598 //devops->put_device(sb);
599 return err;
600} 591}
601 592
602static int logfs_get_sb(struct file_system_type *type, int flags, 593static struct dentry *logfs_mount(struct file_system_type *type, int flags,
603 const char *devname, void *data, struct vfsmount *mnt) 594 const char *devname, void *data)
604{ 595{
605 ulong mtdnr; 596 ulong mtdnr;
597 struct logfs_super *super;
598 int err;
606 599
607 if (!devname) 600 super = kzalloc(sizeof(*super), GFP_KERNEL);
608 return logfs_get_sb_bdev(type, flags, devname, mnt); 601 if (!super)
609 if (strncmp(devname, "mtd", 3)) 602 return ERR_PTR(-ENOMEM);
610 return logfs_get_sb_bdev(type, flags, devname, mnt);
611 603
612 { 604 if (!devname)
605 err = logfs_get_sb_bdev(super, type, devname);
606 else if (strncmp(devname, "mtd", 3))
607 err = logfs_get_sb_bdev(super, type, devname);
608 else {
613 char *garbage; 609 char *garbage;
614 mtdnr = simple_strtoul(devname+3, &garbage, 0); 610 mtdnr = simple_strtoul(devname+3, &garbage, 0);
615 if (*garbage) 611 if (*garbage)
616 return -EINVAL; 612 err = -EINVAL;
613 else
614 err = logfs_get_sb_mtd(super, mtdnr);
615 }
616
617 if (err) {
618 kfree(super);
619 return ERR_PTR(err);
617 } 620 }
618 621
619 return logfs_get_sb_mtd(type, flags, mtdnr, mnt); 622 return logfs_get_sb_device(super, type, flags);
620} 623}
621 624
622static struct file_system_type logfs_fs_type = { 625static struct file_system_type logfs_fs_type = {
623 .owner = THIS_MODULE, 626 .owner = THIS_MODULE,
624 .name = "logfs", 627 .name = "logfs",
625 .get_sb = logfs_get_sb, 628 .mount = logfs_mount,
626 .kill_sb = logfs_kill_sb, 629 .kill_sb = logfs_kill_sb,
627 .fs_flags = FS_REQUIRES_DEV, 630 .fs_flags = FS_REQUIRES_DEV,
628 631
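
The logfs hunks above follow the tree-wide conversion from ->get_sb() to ->mount(): the filesystem no longer fills in a vfsmount, it returns the root dentry of the mounted superblock (or an ERR_PTR). logfs has to hand-roll the sget() path because it also supports MTD devices, but a plain block-device filesystem reduces to a one-line wrapper. A minimal sketch of that common case, where "examplefs" and examplefs_fill_super are hypothetical names, not part of this patch:

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	/* mount_bdev() opens the device and calls back into fill_super */
	return mount_bdev(fs_type, flags, dev_name, data,
			  examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
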
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237b..a25444ab2baf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
76EXPORT_SYMBOL(mb_cache_entry_find_next); 76EXPORT_SYMBOL(mb_cache_entry_find_next);
77#endif 77#endif
78 78
79struct mb_cache {
80 struct list_head c_cache_list;
81 const char *c_name;
82 atomic_t c_entry_count;
83 int c_max_entries;
84 int c_bucket_bits;
85 struct kmem_cache *c_entry_cache;
86 struct list_head *c_block_hash;
87 struct list_head *c_index_hash;
88};
89
90
91/* 79/*
92 * Global data: list of all mbcache's, lru list, and a spinlock for 80 * Global data: list of all mbcache's, lru list, and a spinlock for
93 * accessing cache data structures on SMP machines. The lru list is 81 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e39d6bf2e8fb..ae0b83f476a6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
68 return &ei->vfs_inode; 68 return &ei->vfs_inode;
69} 69}
70 70
71static void minix_destroy_inode(struct inode *inode) 71static void minix_i_callback(struct rcu_head *head)
72{ 72{
73 struct inode *inode = container_of(head, struct inode, i_rcu);
74 INIT_LIST_HEAD(&inode->i_dentry);
73 kmem_cache_free(minix_inode_cachep, minix_i(inode)); 75 kmem_cache_free(minix_inode_cachep, minix_i(inode));
74} 76}
75 77
78static void minix_destroy_inode(struct inode *inode)
79{
80 call_rcu(&inode->i_rcu, minix_i_callback);
81}
82
76static void init_once(void *foo) 83static void init_once(void *foo)
77{ 84{
78 struct minix_inode_info *ei = (struct minix_inode_info *) foo; 85 struct minix_inode_info *ei = (struct minix_inode_info *) foo;
@@ -614,17 +621,16 @@ void minix_truncate(struct inode * inode)
614 V2_minix_truncate(inode); 621 V2_minix_truncate(inode);
615} 622}
616 623
617static int minix_get_sb(struct file_system_type *fs_type, 624static struct dentry *minix_mount(struct file_system_type *fs_type,
618 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 625 int flags, const char *dev_name, void *data)
619{ 626{
620 return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, 627 return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
621 mnt);
622} 628}
623 629
624static struct file_system_type minix_fs_type = { 630static struct file_system_type minix_fs_type = {
625 .owner = THIS_MODULE, 631 .owner = THIS_MODULE,
626 .name = "minix", 632 .name = "minix",
627 .get_sb = minix_get_sb, 633 .mount = minix_mount,
628 .kill_sb = kill_block_super, 634 .kill_sb = kill_block_super,
629 .fs_flags = FS_REQUIRES_DEV, 635 .fs_flags = FS_REQUIRES_DEV,
630}; 636};
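
Two independent changes land in minix/inode.c above: RCU-delayed inode freeing and the ->mount() conversion (the minix_mount()/mount_bdev() pairing is the same pattern sketched after the logfs diff). The RCU part matters for the rcu-walk code later in this series: a lock-free lookup may still dereference an inode after ->destroy_inode() runs, so the actual slab free is deferred past a grace period. A hedged sketch of the same pattern for a hypothetical filesystem, with "foofs" names as placeholders:

static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	/* i_rcu shares storage with i_dentry, so reinitialize the list */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* free only after all current rcu-walk readers are done */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}
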
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..ce7337ddfdbf 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
23 struct inode * inode = NULL; 23 struct inode * inode = NULL;
24 ino_t ino; 24 ino_t ino;
25 25
26 dentry->d_op = dir->i_sb->s_root->d_op;
27
28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) 26 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
29 return ERR_PTR(-ENAMETOOLONG); 27 return ERR_PTR(-ENAMETOOLONG);
30 28
@@ -101,7 +99,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
101 99
102 inode->i_ctime = CURRENT_TIME_SEC; 100 inode->i_ctime = CURRENT_TIME_SEC;
103 inode_inc_link_count(inode); 101 inode_inc_link_count(inode);
104 atomic_inc(&inode->i_count); 102 ihold(inode);
105 return add_nondir(dentry, inode); 103 return add_nondir(dentry, inode);
106} 104}
107 105
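
The minix/namei.c hunks drop the per-lookup d_op assignment (dentry operations are now set generically rather than by each ->lookup()) and switch from the open-coded atomic_inc(&inode->i_count) to the ihold() helper for taking an extra inode reference. A hedged sketch of the ihold() idiom in a hypothetical link operation, where foofs_add_link is a placeholder:

static int foofs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;

	inode->i_ctime = CURRENT_TIME_SEC;
	inode_inc_link_count(inode);
	ihold(inode);		/* reference held by the new dentry */
	return foofs_add_link(dentry, inode);
}
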
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
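
The mpage change collapses the separate read and write completion handlers into one mpage_end_io() that branches on bio_data_dir(), so mpage_bio_submit() no longer has to pick a callback per direction. Filesystems consuming these helpers are unaffected; for reference, a hedged sketch of typical callers, where foofs_get_block is a hypothetical get_block_t:

static int foofs_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, foofs_get_block);
}

static int foofs_writepage(struct page *page, struct writeback_control *wbc)
{
	return mpage_writepage(page, foofs_get_block, wbc);
}
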
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..7d77f24d32a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
169/* 169/*
170 * This does basic POSIX ACL permission checking 170 * This does basic POSIX ACL permission checking
171 */ 171 */
172static int acl_permission_check(struct inode *inode, int mask, 172static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
173 int (*check_acl)(struct inode *inode, int mask)) 173 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
174{ 174{
175 umode_t mode = inode->i_mode; 175 umode_t mode = inode->i_mode;
176 176
@@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask,
180 mode >>= 6; 180 mode >>= 6;
181 else { 181 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 183 int error = check_acl(inode, mask, flags);
184 if (error != -EAGAIN) 184 if (error != -EAGAIN)
185 return error; 185 return error;
186 } 186 }
@@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask,
198} 198}
199 199
200/** 200/**
201 * generic_permission - check for access rights on a Posix-like filesystem 201 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 204 * @check_acl: optional callback to check for Posix ACLs
205 * @flags: IPERM_FLAG_ flags.
205 * 206 *
206 * Used to check for read/write/execute permissions on a file. 207 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions 208 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which 209 * for filesystem access without changing the "normal" uids which
209 * are used for other things.. 210 * are used for other things.
211 *
212 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
213 * request cannot be satisfied (eg. requires blocking or too much complexity).
214 * It would then be called again in ref-walk mode.
210 */ 215 */
211int generic_permission(struct inode *inode, int mask, 216int generic_permission(struct inode *inode, int mask, unsigned int flags,
212 int (*check_acl)(struct inode *inode, int mask)) 217 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
213{ 218{
214 int ret; 219 int ret;
215 220
216 /* 221 /*
217 * Do the basic POSIX ACL permission checks. 222 * Do the basic POSIX ACL permission checks.
218 */ 223 */
219 ret = acl_permission_check(inode, mask, check_acl); 224 ret = acl_permission_check(inode, mask, flags, check_acl);
220 if (ret != -EACCES) 225 if (ret != -EACCES)
221 return ret; 226 return ret;
222 227
@@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
271 } 276 }
272 277
273 if (inode->i_op->permission) 278 if (inode->i_op->permission)
274 retval = inode->i_op->permission(inode, mask); 279 retval = inode->i_op->permission(inode, mask, 0);
275 else 280 else
276 retval = generic_permission(inode, mask, inode->i_op->check_acl); 281 retval = generic_permission(inode, mask, 0,
282 inode->i_op->check_acl);
277 283
278 if (retval) 284 if (retval)
279 return retval; 285 return retval;
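
These namei.c hunks thread a flags argument through the permission paths so rcu-walk can be signalled: under IPERM_FLAG_RCU the callee runs inside rcu_read_lock() and must not block, returning -ECHILD instead so the walk can retry under references. A hedged sketch of a filesystem ->permission() against the new three-argument prototype, with "foofs" and foofs_check_acl as placeholders:

static int foofs_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;	/* might have to read ACLs from disk */
	return generic_permission(inode, mask, flags, foofs_check_acl);
}
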
@@ -375,6 +381,181 @@ void path_put(struct path *path)
375EXPORT_SYMBOL(path_put); 381EXPORT_SYMBOL(path_put);
376 382
377/** 383/**
384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
385 * @nd: nameidata pathwalk data to drop
386 * Returns: 0 on success, -ECHILD on failure
387 *
388 * Path walking has 2 modes, rcu-walk and ref-walk (see
389 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
390 * to drop out of rcu-walk mode and take normal reference counts on dentries
391 * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
392 * refcounts at the last known good point before rcu-walk got stuck, so
393 * ref-walk may continue from there. If this is not successful (eg. a seqcount
394 * has changed), then failure is returned and path walk restarts from the
395 * beginning in ref-walk mode.
396 *
397 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
398 * ref-walk. Must be called from rcu-walk context.
399 */
400static int nameidata_drop_rcu(struct nameidata *nd)
401{
402 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry;
404
405 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) {
407 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry)
410 goto err_root;
411 }
412 spin_lock(&dentry->d_lock);
413 if (!__d_rcu_to_refcount(dentry, nd->seq))
414 goto err;
415 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) {
418 path_get(&nd->root);
419 spin_unlock(&fs->lock);
420 }
421 mntget(nd->path.mnt);
422
423 rcu_read_unlock();
424 br_read_unlock(vfsmount_lock);
425 nd->flags &= ~LOOKUP_RCU;
426 return 0;
427err:
428 spin_unlock(&dentry->d_lock);
429err_root:
430 if (nd->root.mnt)
431 spin_unlock(&fs->lock);
432 return -ECHILD;
433}
434
435/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
436static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
437{
438 if (nd->flags & LOOKUP_RCU)
439 return nameidata_drop_rcu(nd);
440 return 0;
441}
442
443/**
444 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
445 * @nd: nameidata pathwalk data to drop
446 * @dentry: dentry to drop
447 * Returns: 0 on success, -ECHILD on failure
448 *
449 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
450 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
451 * @nd. Must be called from rcu-walk context.
452 */
453static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
454{
455 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry;
457
458 /*
 459 * It is possible to revalidate the dentry that we started

460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465
466 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) {
468 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry)
471 goto err_root;
472 }
473 spin_lock(&parent->d_lock);
474 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
475 if (!__d_rcu_to_refcount(dentry, nd->seq))
476 goto err;
477 /*
478 * If the sequence check on the child dentry passed, then the child has
479 * not been removed from its parent. This means the parent dentry must
480 * be valid and able to take a reference at this point.
481 */
482 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
483 BUG_ON(!parent->d_count);
484 parent->d_count++;
485 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) {
488 path_get(&nd->root);
489 spin_unlock(&fs->lock);
490 }
491 mntget(nd->path.mnt);
492
493 rcu_read_unlock();
494 br_read_unlock(vfsmount_lock);
495 nd->flags &= ~LOOKUP_RCU;
496 return 0;
497err:
498 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock);
500err_root:
501 if (nd->root.mnt)
502 spin_unlock(&fs->lock);
503 return -ECHILD;
504}
505
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{
509 if (nd->flags & LOOKUP_RCU)
510 return nameidata_dentry_drop_rcu(nd, dentry);
511 return 0;
512}
513
514/**
515 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
516 * @nd: nameidata pathwalk data to drop
517 * Returns: 0 on success, -ECHILD on failure
518 *
519 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
520 * nd->path should be the final element of the lookup, so nd->root is discarded.
521 * Must be called from rcu-walk context.
522 */
523static int nameidata_drop_rcu_last(struct nameidata *nd)
524{
525 struct dentry *dentry = nd->path.dentry;
526
527 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock;
533 BUG_ON(nd->inode != dentry->d_inode);
534 spin_unlock(&dentry->d_lock);
535
536 mntget(nd->path.mnt);
537
538 rcu_read_unlock();
539 br_read_unlock(vfsmount_lock);
540
541 return 0;
542
543err_unlock:
544 spin_unlock(&dentry->d_lock);
545 rcu_read_unlock();
546 br_read_unlock(vfsmount_lock);
547 return -ECHILD;
548}
549
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/**
378 * release_open_intent - free up open intent resources 559 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 560 * @nd: pointer to nameidata
380 */ 561 */
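
All of the drop-out helpers above rely on the same two-step discipline: validate a seqcount-protected snapshot, then upgrade to real references (via __d_rcu_to_refcount() on the dentry, path_get()/mntget() on the mounts) only if nothing changed underneath. The read side of that seqcount idiom, as this patch applies it to fs->seq, looks like the following sketch; it is a plain copy loop, with no references taken:

static void sample_root(struct path *root)
{
	struct fs_struct *fs = current->fs;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&fs->seq);
		*root = fs->root;	/* snapshot only; no path_get() */
	} while (read_seqcount_retry(&fs->seq, seq));
}
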
@@ -386,10 +567,33 @@ void release_open_intent(struct nameidata *nd)
386 fput(nd->intent.open.file); 567 fput(nd->intent.open.file);
387} 568}
388 569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580
581 status = dentry->d_op->d_revalidate(dentry, nd);
582 if (status == -ECHILD) {
583 if (nameidata_dentry_drop_rcu(nd, dentry))
584 return status;
585 status = dentry->d_op->d_revalidate(dentry, nd);
586 }
587
588 return status;
589}
590
389static inline struct dentry * 591static inline struct dentry *
390do_revalidate(struct dentry *dentry, struct nameidata *nd) 592do_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 593{
392 int status = dentry->d_op->d_revalidate(dentry, nd); 594 int status;
595
596 status = d_revalidate(dentry, nd);
393 if (unlikely(status <= 0)) { 597 if (unlikely(status <= 0)) {
394 /* 598 /*
395 * The dentry failed validation. 599 * The dentry failed validation.
@@ -397,19 +601,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
397 * the dentry otherwise d_revalidate is asking us 601 * the dentry otherwise d_revalidate is asking us
398 * to return a fail status. 602 * to return a fail status.
399 */ 603 */
400 if (!status) { 604 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status);
609
610 } else {
611 /* Don't d_invalidate in rcu-walk mode */
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
401 if (!d_invalidate(dentry)) { 614 if (!d_invalidate(dentry)) {
402 dput(dentry); 615 dput(dentry);
403 dentry = NULL; 616 dentry = NULL;
404 } 617 }
405 } else {
406 dput(dentry);
407 dentry = ERR_PTR(status);
408 } 618 }
409 } 619 }
410 return dentry; 620 return dentry;
411} 621}
412 622
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
413/* 634/*
414 * force_reval_path - force revalidation of a dentry 635 * force_reval_path - force revalidation of a dentry
415 * 636 *
@@ -433,17 +654,19 @@ force_reval_path(struct path *path, struct nameidata *nd)
433 654
434 /* 655 /*
435 * only check on filesystems where it's possible for the dentry to 656 * only check on filesystems where it's possible for the dentry to
436 * become stale. It's assumed that if this flag is set then the 657 * become stale.
437 * d_revalidate op will also be defined.
438 */ 658 */
439 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) 659 if (!need_reval_dot(dentry))
440 return 0; 660 return 0;
441 661
442 status = dentry->d_op->d_revalidate(dentry, nd); 662 status = d_revalidate(dentry, nd);
443 if (status > 0) 663 if (status > 0)
444 return 0; 664 return 0;
445 665
446 if (!status) { 666 if (!status) {
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
447 d_invalidate(dentry); 670 d_invalidate(dentry);
448 status = -ESTALE; 671 status = -ESTALE;
449 } 672 }
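
Both do_revalidate() and force_reval_path() now funnel through the d_revalidate() wrapper, which retries in ref-walk whenever the filesystem answers -ECHILD. The counterpart on the filesystem side is an rcu-walk-aware ->d_revalidate(); a hedged sketch, where foofs_is_fresh is a placeholder that may sleep and returns 1 for a valid dentry:

static int foofs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	if (nd && (nd->flags & LOOKUP_RCU))
		return -ECHILD;		/* cannot block in rcu-walk */
	return foofs_is_fresh(dentry);
}
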
@@ -459,26 +682,27 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 682 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 683 * complete permission check.
461 */ 684 */
462static int exec_permission(struct inode *inode) 685static inline int exec_permission(struct inode *inode, unsigned int flags)
463{ 686{
464 int ret; 687 int ret;
465 688
466 if (inode->i_op->permission) { 689 if (inode->i_op->permission) {
467 ret = inode->i_op->permission(inode, MAY_EXEC); 690 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
468 if (!ret) 691 } else {
469 goto ok; 692 ret = acl_permission_check(inode, MAY_EXEC, flags,
470 return ret; 693 inode->i_op->check_acl);
471 } 694 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 695 if (likely(!ret))
473 if (!ret)
474 goto ok; 696 goto ok;
697 if (ret == -ECHILD)
698 return ret;
475 699
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477 goto ok; 701 goto ok;
478 702
479 return ret; 703 return ret;
480ok: 704ok:
481 return security_inode_permission(inode, MAY_EXEC); 705 return security_inode_exec_permission(inode, flags);
482} 706}
483 707
484static __always_inline void set_root(struct nameidata *nd) 708static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +713,23 @@ static __always_inline void set_root(struct nameidata *nd)
489 713
490static int link_path_walk(const char *, struct nameidata *); 714static int link_path_walk(const char *, struct nameidata *);
491 715
716static __always_inline void set_root_rcu(struct nameidata *nd)
717{
718 if (!nd->root.mnt) {
719 struct fs_struct *fs = current->fs;
720 unsigned seq;
721
722 do {
723 seq = read_seqcount_begin(&fs->seq);
724 nd->root = fs->root;
725 } while (read_seqcount_retry(&fs->seq, seq));
726 }
727}
728
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 729static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 730{
731 int ret;
732
494 if (IS_ERR(link)) 733 if (IS_ERR(link))
495 goto fail; 734 goto fail;
496 735
@@ -500,8 +739,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
500 nd->path = nd->root; 739 nd->path = nd->root;
501 path_get(&nd->root); 740 path_get(&nd->root);
502 } 741 }
742 nd->inode = nd->path.dentry->d_inode;
503 743
504 return link_path_walk(link, nd); 744 ret = link_path_walk(link, nd);
745 return ret;
505fail: 746fail:
506 path_put(&nd->path); 747 path_put(&nd->path);
507 return PTR_ERR(link); 748 return PTR_ERR(link);
@@ -514,30 +755,30 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
514 mntput(path->mnt); 755 mntput(path->mnt);
515} 756}
516 757
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 758static inline void path_to_nameidata(const struct path *path,
759 struct nameidata *nd)
518{ 760{
519 dput(nd->path.dentry); 761 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 762 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 763 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 764 mntput(nd->path.mnt);
523 } 765 }
766 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 767 nd->path.dentry = path->dentry;
525} 768}
526 769
527static __always_inline int 770static __always_inline int
528__do_follow_link(struct path *path, struct nameidata *nd, void **p) 771__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
529{ 772{
530 int error; 773 int error;
531 struct dentry *dentry = path->dentry; 774 struct dentry *dentry = link->dentry;
532 775
533 touch_atime(path->mnt, dentry); 776 touch_atime(link->mnt, dentry);
534 nd_set_link(nd, NULL); 777 nd_set_link(nd, NULL);
535 778
536 if (path->mnt != nd->path.mnt) { 779 if (link->mnt == nd->path.mnt)
537 path_to_nameidata(path, nd); 780 mntget(link->mnt);
538 dget(dentry); 781
539 }
540 mntget(path->mnt);
541 nd->last_type = LAST_BIND; 782 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 783 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 784 error = PTR_ERR(*p);
@@ -591,6 +832,20 @@ loop:
591 return err; 832 return err;
592} 833}
593 834
835static int follow_up_rcu(struct path *path)
836{
837 struct vfsmount *parent;
838 struct dentry *mountpoint;
839
840 parent = path->mnt->mnt_parent;
841 if (parent == path->mnt)
842 return 0;
843 mountpoint = path->mnt->mnt_mountpoint;
844 path->dentry = mountpoint;
845 path->mnt = parent;
846 return 1;
847}
848
594int follow_up(struct path *path) 849int follow_up(struct path *path)
595{ 850{
596 struct vfsmount *parent; 851 struct vfsmount *parent;
@@ -612,58 +867,295 @@ int follow_up(struct path *path)
612 return 1; 867 return 1;
613} 868}
614 869
615/* no need for dcache_lock, as serialization is taken care in 870/*
616 * namespace.c 871 * Perform an automount
872 * - return -EISDIR to tell follow_managed() to stop and return the path we
873 * were called with.
617 */ 874 */
618static int __follow_mount(struct path *path) 875static int follow_automount(struct path *path, unsigned flags,
876 bool *need_mntput)
619{ 877{
620 int res = 0; 878 struct vfsmount *mnt;
621 while (d_mountpoint(path->dentry)) { 879 int err;
622 struct vfsmount *mounted = lookup_mnt(path); 880
623 if (!mounted) 881 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
624 break; 882 return -EREMOTE;
883
884 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
885 * and this is the terminal part of the path.
886 */
887 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
888 return -EISDIR; /* we actually want to stop here */
889
890 /* We want to mount if someone is trying to open/create a file of any
891 * type under the mountpoint, wants to traverse through the mountpoint
892 * or wants to open the mounted directory.
893 *
894 * We don't want to mount if someone's just doing a stat and they've
895 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
896 * appended a '/' to the name.
897 */
898 if (!(flags & LOOKUP_FOLLOW) &&
899 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
900 LOOKUP_OPEN | LOOKUP_CREATE)))
901 return -EISDIR;
902
903 current->total_link_count++;
904 if (current->total_link_count >= 40)
905 return -ELOOP;
906
907 mnt = path->dentry->d_op->d_automount(path);
908 if (IS_ERR(mnt)) {
909 /*
910 * The filesystem is allowed to return -EISDIR here to indicate
911 * it doesn't want to automount. For instance, autofs would do
912 * this so that its userspace daemon can mount on this dentry.
913 *
914 * However, we can only permit this if it's a terminal point in
915 * the path being looked up; if it wasn't then the remainder of
916 * the path is inaccessible and we should say so.
917 */
918 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
919 return -EREMOTE;
920 return PTR_ERR(mnt);
921 }
922
923 if (!mnt) /* mount collision */
924 return 0;
925
926 err = finish_automount(mnt, path);
927
928 switch (err) {
929 case -EBUSY:
930 /* Someone else made a mount here whilst we were busy */
931 return 0;
932 case 0:
625 dput(path->dentry); 933 dput(path->dentry);
626 if (res) 934 if (*need_mntput)
627 mntput(path->mnt); 935 mntput(path->mnt);
936 path->mnt = mnt;
937 path->dentry = dget(mnt->mnt_root);
938 *need_mntput = true;
939 return 0;
940 default:
941 return err;
942 }
943
944}
945
946/*
947 * Handle a dentry that is managed in some way.
948 * - Flagged for transit management (autofs)
949 * - Flagged as mountpoint
950 * - Flagged as automount point
951 *
 952 * This may only be called in ref-walk mode.
953 *
954 * Serialization is taken care of in namespace.c
955 */
956static int follow_managed(struct path *path, unsigned flags)
957{
958 unsigned managed;
959 bool need_mntput = false;
960 int ret;
961
962 /* Given that we're not holding a lock here, we retain the value in a
963 * local variable for each dentry as we look at it so that we don't see
964 * the components of that value change under us */
965 while (managed = ACCESS_ONCE(path->dentry->d_flags),
966 managed &= DCACHE_MANAGED_DENTRY,
967 unlikely(managed != 0)) {
968 /* Allow the filesystem to manage the transit without i_mutex
969 * being held. */
970 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry,
974 false, false);
975 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret;
977 }
978
979 /* Transit to a mounted filesystem. */
980 if (managed & DCACHE_MOUNTED) {
981 struct vfsmount *mounted = lookup_mnt(path);
982 if (mounted) {
983 dput(path->dentry);
984 if (need_mntput)
985 mntput(path->mnt);
986 path->mnt = mounted;
987 path->dentry = dget(mounted->mnt_root);
988 need_mntput = true;
989 continue;
990 }
991
992 /* Something is mounted on this dentry in another
993 * namespace and/or whatever was mounted there in this
994 * namespace got unmounted before we managed to get the
995 * vfsmount_lock */
996 }
997
998 /* Handle an automount point */
999 if (managed & DCACHE_NEED_AUTOMOUNT) {
1000 ret = follow_automount(path, flags, &need_mntput);
1001 if (ret < 0)
1002 return ret == -EISDIR ? 0 : ret;
1003 continue;
1004 }
1005
1006 /* We didn't change the current path point */
1007 break;
1008 }
1009 return 0;
1010}
1011
1012int follow_down_one(struct path *path)
1013{
1014 struct vfsmount *mounted;
1015
1016 mounted = lookup_mnt(path);
1017 if (mounted) {
1018 dput(path->dentry);
1019 mntput(path->mnt);
628 path->mnt = mounted; 1020 path->mnt = mounted;
629 path->dentry = dget(mounted->mnt_root); 1021 path->dentry = dget(mounted->mnt_root);
630 res = 1; 1022 return 1;
631 } 1023 }
632 return res; 1024 return 0;
633} 1025}
634 1026
635static void follow_mount(struct path *path) 1027/*
1028 * Skip to top of mountpoint pile in rcu-walk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to
1030 * continue, false to abort.
1031 */
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit)
636{ 1034{
637 while (d_mountpoint(path->dentry)) { 1035 while (d_mountpoint(path->dentry)) {
638 struct vfsmount *mounted = lookup_mnt(path); 1036 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
1038 !reverse_transit &&
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
1040 return false;
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
639 if (!mounted) 1042 if (!mounted)
640 break; 1043 break;
641 dput(path->dentry);
642 mntput(path->mnt);
643 path->mnt = mounted; 1044 path->mnt = mounted;
644 path->dentry = dget(mounted->mnt_root); 1045 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 }
1049
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1051 return reverse_transit;
1052 return true;
1053}
1054
1055static int follow_dotdot_rcu(struct nameidata *nd)
1056{
1057 struct inode *inode = nd->inode;
1058
1059 set_root_rcu(nd);
1060
1061 while (1) {
1062 if (nd->path.dentry == nd->root.dentry &&
1063 nd->path.mnt == nd->root.mnt) {
1064 break;
1065 }
1066 if (nd->path.dentry != nd->path.mnt->mnt_root) {
1067 struct dentry *old = nd->path.dentry;
1068 struct dentry *parent = old->d_parent;
1069 unsigned seq;
1070
1071 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD;
1074 inode = parent->d_inode;
1075 nd->path.dentry = parent;
1076 nd->seq = seq;
1077 break;
1078 }
1079 if (!follow_up_rcu(&nd->path))
1080 break;
1081 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1082 inode = nd->path.dentry->d_inode;
645 } 1083 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode;
1086
1087 return 0;
646} 1088}
647 1089
648/* no need for dcache_lock, as serialization is taken care in 1090/*
649 * namespace.c 1091 * Follow down to the covering mount currently visible to userspace. At each
1092 * point, the filesystem owning that dentry may be queried as to whether the
1093 * caller is permitted to proceed or not.
1094 *
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true).
650 */ 1097 */
651int follow_down(struct path *path) 1098int follow_down(struct path *path, bool mounting_here)
652{ 1099{
653 struct vfsmount *mounted; 1100 unsigned managed;
1101 int ret;
654 1102
655 mounted = lookup_mnt(path); 1103 while (managed = ACCESS_ONCE(path->dentry->d_flags),
656 if (mounted) { 1104 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1105 /* Allow the filesystem to manage the transit without i_mutex
1106 * being held.
1107 *
1108 * We indicate to the filesystem if someone is trying to mount
1109 * something here. This gives autofs the chance to deny anyone
1110 * other than its daemon the right to mount on its
1111 * superstructure.
1112 *
1113 * The filesystem may sleep at this point.
1114 */
1115 if (managed & DCACHE_MANAGE_TRANSIT) {
1116 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false);
1120 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret;
1122 }
1123
1124 /* Transit to a mounted filesystem. */
1125 if (managed & DCACHE_MOUNTED) {
1126 struct vfsmount *mounted = lookup_mnt(path);
1127 if (!mounted)
1128 break;
1129 dput(path->dentry);
1130 mntput(path->mnt);
1131 path->mnt = mounted;
1132 path->dentry = dget(mounted->mnt_root);
1133 continue;
1134 }
1135
1136 /* Don't handle automount points here */
1137 break;
1138 }
1139 return 0;
1140}
1141
1142/*
1143 * Skip to top of mountpoint pile in ref-walk mode for follow_dotdot()
1144 */
1145static void follow_mount(struct path *path)
1146{
1147 while (d_mountpoint(path->dentry)) {
1148 struct vfsmount *mounted = lookup_mnt(path);
1149 if (!mounted)
1150 break;
657 dput(path->dentry); 1151 dput(path->dentry);
658 mntput(path->mnt); 1152 mntput(path->mnt);
659 path->mnt = mounted; 1153 path->mnt = mounted;
660 path->dentry = dget(mounted->mnt_root); 1154 path->dentry = dget(mounted->mnt_root);
661 return 1;
662 } 1155 }
663 return 0;
664} 1156}
665 1157
666static __always_inline void follow_dotdot(struct nameidata *nd) 1158static void follow_dotdot(struct nameidata *nd)
667{ 1159{
668 set_root(nd); 1160 set_root(nd);
669 1161
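
follow_managed() and follow_down() above give filesystems two hooks on DCACHE_MANAGED_DENTRY dentries: ->d_manage() to gate transit (it may sleep, and -EISDIR means "stop here, the path as it stands is the answer") and ->d_automount() to produce a vfsmount on demand. A hedged sketch of a d_manage implementation honouring both the rcu_walk and mounting_here arguments, where foofs_wait_ready is a placeholder:

static int foofs_d_manage(struct dentry *dentry, bool mounting_here,
			  bool rcu_walk)
{
	if (rcu_walk)
		return -ECHILD;		/* we may sleep; retry in ref-walk */
	if (mounting_here)
		return 0;		/* never block a mount on top of us */
	foofs_wait_ready(dentry);	/* may sleep until transit is allowed */
	return 0;
}

static const struct dentry_operations foofs_dentry_ops = {
	.d_manage	= foofs_d_manage,
};
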
@@ -684,6 +1176,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
684 break; 1176 break;
685 } 1177 }
686 follow_mount(&nd->path); 1178 follow_mount(&nd->path);
1179 nd->inode = nd->path.dentry->d_inode;
687} 1180}
688 1181
689/* 1182/*
@@ -721,17 +1214,19 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
721 * It _is_ time-critical. 1214 * It _is_ time-critical.
722 */ 1215 */
723static int do_lookup(struct nameidata *nd, struct qstr *name, 1216static int do_lookup(struct nameidata *nd, struct qstr *name,
724 struct path *path) 1217 struct path *path, struct inode **inode)
725{ 1218{
726 struct vfsmount *mnt = nd->path.mnt; 1219 struct vfsmount *mnt = nd->path.mnt;
727 struct dentry *dentry, *parent; 1220 struct dentry *dentry, *parent = nd->path.dentry;
728 struct inode *dir; 1221 struct inode *dir;
1222 int err;
1223
729 /* 1224 /*
730 * See if the low-level filesystem might want 1225 * See if the low-level filesystem might want
731 * to use its own hash.. 1226 * to use its own hash..
732 */ 1227 */
733 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { 1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
734 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); 1229 err = parent->d_op->d_hash(parent, nd->inode, name);
735 if (err < 0) 1230 if (err < 0)
736 return err; 1231 return err;
737 } 1232 }
@@ -741,21 +1236,52 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
741 * of a false negative due to a concurrent rename, we're going to 1236 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below. 1237 * do the non-racy lookup, below.
743 */ 1238 */
744 dentry = __d_lookup(nd->path.dentry, name); 1239 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq;
1241
1242 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) {
1245 if (nameidata_drop_rcu(nd))
1246 return -ECHILD;
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD;
1252
1253 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1255 goto need_revalidate;
1256done2:
1257 path->mnt = mnt;
1258 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0;
1261 if (nameidata_drop_rcu(nd))
1262 return -ECHILD;
1263 /* fallthru */
1264 }
1265 dentry = __d_lookup(parent, name);
745 if (!dentry) 1266 if (!dentry)
746 goto need_lookup; 1267 goto need_lookup;
747found: 1268found:
748 if (dentry->d_op && dentry->d_op->d_revalidate) 1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
749 goto need_revalidate; 1270 goto need_revalidate;
750done: 1271done:
751 path->mnt = mnt; 1272 path->mnt = mnt;
752 path->dentry = dentry; 1273 path->dentry = dentry;
753 __follow_mount(path); 1274 err = follow_managed(path, nd->flags);
1275 if (unlikely(err < 0)) {
1276 path_put_conditional(path, nd);
1277 return err;
1278 }
1279 *inode = path->dentry->d_inode;
754 return 0; 1280 return 0;
755 1281
756need_lookup: 1282need_lookup:
757 parent = nd->path.dentry;
758 dir = parent->d_inode; 1283 dir = parent->d_inode;
1284 BUG_ON(nd->inode != dir);
759 1285
760 mutex_lock(&dir->i_mutex); 1286 mutex_lock(&dir->i_mutex);
761 /* 1287 /*
@@ -789,6 +1315,8 @@ need_revalidate:
789 goto need_lookup; 1315 goto need_lookup;
790 if (IS_ERR(dentry)) 1316 if (IS_ERR(dentry))
791 goto fail; 1317 goto fail;
1318 if (nd->flags & LOOKUP_RCU)
1319 goto done2;
792 goto done; 1320 goto done;
793 1321
794fail: 1322fail:
@@ -796,17 +1324,6 @@ fail:
796} 1324}
797 1325
798/* 1326/*
799 * This is a temporary kludge to deal with "automount" symlinks; proper
800 * solution is to trigger them on follow_mount(), so that do_lookup()
801 * would DTRT. To be killed before 2.6.34-final.
802 */
803static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
804{
805 return inode && unlikely(inode->i_op->follow_link) &&
806 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
807}
808
809/*
810 * Name resolution. 1327 * Name resolution.
811 * This is the basic name resolution function, turning a pathname into 1328 * This is the basic name resolution function, turning a pathname into
812 * the final dentry. We expect 'base' to be positive and a directory. 1329 * the final dentry. We expect 'base' to be positive and a directory.
@@ -817,7 +1334,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
817static int link_path_walk(const char *name, struct nameidata *nd) 1334static int link_path_walk(const char *name, struct nameidata *nd)
818{ 1335{
819 struct path next; 1336 struct path next;
820 struct inode *inode;
821 int err; 1337 int err;
822 unsigned int lookup_flags = nd->flags; 1338 unsigned int lookup_flags = nd->flags;
823 1339
@@ -826,18 +1342,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
826 if (!*name) 1342 if (!*name)
827 goto return_reval; 1343 goto return_reval;
828 1344
829 inode = nd->path.dentry->d_inode;
830 if (nd->depth) 1345 if (nd->depth)
831 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); 1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
832 1347
833 /* At this point we know we have a real path component. */ 1348 /* At this point we know we have a real path component. */
834 for(;;) { 1349 for(;;) {
1350 struct inode *inode;
835 unsigned long hash; 1351 unsigned long hash;
836 struct qstr this; 1352 struct qstr this;
837 unsigned int c; 1353 unsigned int c;
838 1354
839 nd->flags |= LOOKUP_CONTINUE; 1355 nd->flags |= LOOKUP_CONTINUE;
840 err = exec_permission(inode); 1356 if (nd->flags & LOOKUP_RCU) {
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
841 if (err) 1367 if (err)
842 break; 1368 break;
843 1369
@@ -868,37 +1394,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
868 if (this.name[0] == '.') switch (this.len) { 1394 if (this.name[0] == '.') switch (this.len) {
869 default: 1395 default:
870 break; 1396 break;
871 case 2: 1397 case 2:
872 if (this.name[1] != '.') 1398 if (this.name[1] != '.')
873 break; 1399 break;
874 follow_dotdot(nd); 1400 if (nd->flags & LOOKUP_RCU) {
875 inode = nd->path.dentry->d_inode; 1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
876 /* fallthrough */ 1405 /* fallthrough */
877 case 1: 1406 case 1:
878 continue; 1407 continue;
879 } 1408 }
880 /* This does the actual lookups.. */ 1409 /* This does the actual lookups.. */
881 err = do_lookup(nd, &this, &next); 1410 err = do_lookup(nd, &this, &next, &inode);
882 if (err) 1411 if (err)
883 break; 1412 break;
884
885 err = -ENOENT; 1413 err = -ENOENT;
886 inode = next.dentry->d_inode;
887 if (!inode) 1414 if (!inode)
888 goto out_dput; 1415 goto out_dput;
889 1416
890 if (inode->i_op->follow_link) { 1417 if (inode->i_op->follow_link) {
1418 /* We commonly drop rcu-walk here */
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
891 err = do_follow_link(&next, nd); 1422 err = do_follow_link(&next, nd);
892 if (err) 1423 if (err)
893 goto return_err; 1424 goto return_err;
1425 nd->inode = nd->path.dentry->d_inode;
894 err = -ENOENT; 1426 err = -ENOENT;
895 inode = nd->path.dentry->d_inode; 1427 if (!nd->inode)
896 if (!inode)
897 break; 1428 break;
898 } else 1429 } else {
899 path_to_nameidata(&next, nd); 1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 }
900 err = -ENOTDIR; 1433 err = -ENOTDIR;
901 if (!inode->i_op->lookup) 1434 if (!nd->inode->i_op->lookup)
902 break; 1435 break;
903 continue; 1436 continue;
904 /* here ends the main loop */ 1437 /* here ends the main loop */
@@ -913,32 +1446,40 @@ last_component:
913 if (this.name[0] == '.') switch (this.len) { 1446 if (this.name[0] == '.') switch (this.len) {
914 default: 1447 default:
915 break; 1448 break;
916 case 2: 1449 case 2:
917 if (this.name[1] != '.') 1450 if (this.name[1] != '.')
918 break; 1451 break;
919 follow_dotdot(nd); 1452 if (nd->flags & LOOKUP_RCU) {
920 inode = nd->path.dentry->d_inode; 1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
921 /* fallthrough */ 1457 /* fallthrough */
922 case 1: 1458 case 1:
923 goto return_reval; 1459 goto return_reval;
924 } 1460 }
925 err = do_lookup(nd, &this, &next); 1461 err = do_lookup(nd, &this, &next, &inode);
926 if (err) 1462 if (err)
927 break; 1463 break;
928 inode = next.dentry->d_inode; 1464 if (inode && unlikely(inode->i_op->follow_link) &&
929 if (follow_on_final(inode, lookup_flags)) { 1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
930 err = do_follow_link(&next, nd); 1469 err = do_follow_link(&next, nd);
931 if (err) 1470 if (err)
932 goto return_err; 1471 goto return_err;
933 inode = nd->path.dentry->d_inode; 1472 nd->inode = nd->path.dentry->d_inode;
934 } else 1473 } else {
935 path_to_nameidata(&next, nd); 1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
936 err = -ENOENT; 1477 err = -ENOENT;
937 if (!inode) 1478 if (!nd->inode)
938 break; 1479 break;
939 if (lookup_flags & LOOKUP_DIRECTORY) { 1480 if (lookup_flags & LOOKUP_DIRECTORY) {
940 err = -ENOTDIR; 1481 err = -ENOTDIR;
941 if (!inode->i_op->lookup) 1482 if (!nd->inode->i_op->lookup)
942 break; 1483 break;
943 } 1484 }
944 goto return_base; 1485 goto return_base;
@@ -958,25 +1499,43 @@ return_reval:
958 * We bypassed the ordinary revalidation routines. 1499 * We bypassed the ordinary revalidation routines.
959 * We may need to check the cached dentry for staleness. 1500 * We may need to check the cached dentry for staleness.
960 */ 1501 */
961 if (nd->path.dentry && nd->path.dentry->d_sb && 1502 if (need_reval_dot(nd->path.dentry)) {
962 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
963 err = -ESTALE;
964 /* Note: we do not d_invalidate() */ 1503 /* Note: we do not d_invalidate() */
965 if (!nd->path.dentry->d_op->d_revalidate( 1504 err = d_revalidate(nd->path.dentry, nd);
966 nd->path.dentry, nd)) 1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
967 break; 1508 break;
968 } 1509 }
969return_base: 1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
970 return 0; 1513 return 0;
971out_dput: 1514out_dput:
972 path_put_conditional(&next, nd); 1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
973 break; 1517 break;
974 } 1518 }
975 path_put(&nd->path); 1519 if (!(nd->flags & LOOKUP_RCU))
1520 path_put(&nd->path);
976return_err: 1521return_err:
977 return err; 1522 return err;
978} 1523}
979 1524
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd)
1526{
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
980static int path_walk(const char *name, struct nameidata *nd) 1539static int path_walk(const char *name, struct nameidata *nd)
981{ 1540{
982 struct path save = nd->path; 1541 struct path save = nd->path;
@@ -1002,6 +1561,93 @@ static int path_walk(const char *name, struct nameidata *nd)
1002 return result; 1561 return result;
1003} 1562}
1004 1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{
1579 int retval = 0;
1580 int fput_needed;
1581 struct file *file;
1582
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU;
1585 nd->depth = 0;
1586 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588
1589 if (*name=='/') {
1590 struct fs_struct *fs = current->fs;
1591 unsigned seq;
1592
1593 br_read_lock(vfsmount_lock);
1594 rcu_read_lock();
1595
1596 do {
1597 seq = read_seqcount_begin(&fs->seq);
1598 nd->root = fs->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs;
1605 unsigned seq;
1606
1607 br_read_lock(vfsmount_lock);
1608 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615
1616 } else {
1617 struct dentry *dentry;
1618
1619 file = fget_light(dfd, &fput_needed);
1620 retval = -EBADF;
1621 if (!file)
1622 goto out_fail;
1623
1624 dentry = file->f_path.dentry;
1625
1626 retval = -ENOTDIR;
1627 if (!S_ISDIR(dentry->d_inode->i_mode))
1628 goto fput_fail;
1629
1630 retval = file_permission(file, MAY_EXEC);
1631 if (retval)
1632 goto fput_fail;
1633
1634 nd->path = file->f_path;
1635 if (fput_needed)
1636 nd->file = file;
1637
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock);
1640 rcu_read_lock();
1641 }
1642 nd->inode = nd->path.dentry->d_inode;
1643 return 0;
1644
1645fput_fail:
1646 fput_light(file, fput_needed);
1647out_fail:
1648 return retval;
1649}
1650
1005static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1006{ 1652{
1007 int retval = 0; 1653 int retval = 0;
@@ -1042,6 +1688,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1042 1688
1043 fput_light(file, fput_needed); 1689 fput_light(file, fput_needed);
1044 } 1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1045 return 0; 1692 return 0;
1046 1693
1047fput_fail: 1694fput_fail:
@@ -1054,16 +1701,53 @@ out_fail:
1054static int do_path_lookup(int dfd, const char *name, 1701static int do_path_lookup(int dfd, const char *name,
1055 unsigned int flags, struct nameidata *nd) 1702 unsigned int flags, struct nameidata *nd)
1056{ 1703{
1057 int retval = path_init(dfd, name, flags, nd); 1704 int retval;
1058 if (!retval) 1705
1059 retval = path_walk(name, nd); 1706 /*
1060 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1707 * Path walking is largely split up into 2 different synchronisation
1061 nd->path.dentry->d_inode)) 1708 * schemes, rcu-walk and ref-walk (explained in
1062 audit_inode(name, nd->path.dentry); 1709 * Documentation/filesystems/path-lookup.txt). These share much of the
1710 * path walk code, but some things particularly setup, cleanup, and
1711 * following mounts are sufficiently divergent that functions are
1712 * duplicated. Typically there is a function foo(), and its RCU
1713 * analogue, foo_rcu().
1714 *
1715 * -ECHILD is the error number of choice (just to avoid clashes) that
1716 * is returned if some aspect of an rcu-walk fails. Such an error must
1717 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete).
1719 */
1720 retval = path_init_rcu(dfd, name, flags, nd);
1721 if (unlikely(retval))
1722 return retval;
1723 retval = path_walk_rcu(name, nd);
1724 path_finish_rcu(nd);
1063 if (nd->root.mnt) { 1725 if (nd->root.mnt) {
1064 path_put(&nd->root); 1726 path_put(&nd->root);
1065 nd->root.mnt = NULL; 1727 nd->root.mnt = NULL;
1066 } 1728 }
1729
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
1731 /* slower, locked walk */
1732 if (retval == -ESTALE)
1733 flags |= LOOKUP_REVAL;
1734 retval = path_init(dfd, name, flags, nd);
1735 if (unlikely(retval))
1736 return retval;
1737 retval = path_walk(name, nd);
1738 if (nd->root.mnt) {
1739 path_put(&nd->root);
1740 nd->root.mnt = NULL;
1741 }
1742 }
1743
1744 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry);
1748 }
1749 }
1750
1067 return retval; 1751 return retval;
1068} 1752}
1069 1753
@@ -1106,10 +1790,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1106 path_get(&nd->path); 1790 path_get(&nd->path);
1107 nd->root = nd->path; 1791 nd->root = nd->path;
1108 path_get(&nd->root); 1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1109 1794
1110 retval = path_walk(name, nd); 1795 retval = path_walk(name, nd);
1111 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1112 nd->path.dentry->d_inode)) 1797 nd->inode))
1113 audit_inode(name, nd->path.dentry); 1798 audit_inode(name, nd->path.dentry);
1114 1799
1115 path_put(&nd->root); 1800 path_put(&nd->root);
@@ -1121,18 +1806,20 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1121static struct dentry *__lookup_hash(struct qstr *name, 1806static struct dentry *__lookup_hash(struct qstr *name,
1122 struct dentry *base, struct nameidata *nd) 1807 struct dentry *base, struct nameidata *nd)
1123{ 1808{
1809 struct inode *inode = base->d_inode;
1124 struct dentry *dentry; 1810 struct dentry *dentry;
1125 struct inode *inode;
1126 int err; 1811 int err;
1127 1812
1128 inode = base->d_inode; 1813 err = exec_permission(inode, 0);
1814 if (err)
1815 return ERR_PTR(err);
1129 1816
1130 /* 1817 /*
1131 * See if the low-level filesystem might want 1818 * See if the low-level filesystem might want
1132 * to use its own hash.. 1819 * to use its own hash..
1133 */ 1820 */
1134 if (base->d_op && base->d_op->d_hash) { 1821 if (base->d_flags & DCACHE_OP_HASH) {
1135 err = base->d_op->d_hash(base, name); 1822 err = base->d_op->d_hash(base, inode, name);
1136 dentry = ERR_PTR(err); 1823 dentry = ERR_PTR(err);
1137 if (err < 0) 1824 if (err < 0)
1138 goto out; 1825 goto out;
@@ -1145,7 +1832,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1145 */ 1832 */
1146 dentry = d_lookup(base, name); 1833 dentry = d_lookup(base, name);
1147 1834
1148 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1835 if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
1149 dentry = do_revalidate(dentry, nd); 1836 dentry = do_revalidate(dentry, nd);
1150 1837
1151 if (!dentry) 1838 if (!dentry)
@@ -1161,11 +1848,6 @@ out:
1161 */ 1848 */
1162static struct dentry *lookup_hash(struct nameidata *nd) 1849static struct dentry *lookup_hash(struct nameidata *nd)
1163{ 1850{
1164 int err;
1165
1166 err = exec_permission(nd->path.dentry->d_inode);
1167 if (err)
1168 return ERR_PTR(err);
1169 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1851 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1170} 1852}
1171 1853
@@ -1213,9 +1895,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1213 if (err) 1895 if (err)
1214 return ERR_PTR(err); 1896 return ERR_PTR(err);
1215 1897
1216 err = exec_permission(base->d_inode);
1217 if (err)
1218 return ERR_PTR(err);
1219 return __lookup_hash(&this, base, NULL); 1898 return __lookup_hash(&this, base, NULL);
1220} 1899}
1221 1900
@@ -1454,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
1454 return break_lease(inode, flag); 2133 return break_lease(inode, flag);
1455} 2134}
1456 2135
1457static int handle_truncate(struct path *path) 2136static int handle_truncate(struct file *filp)
1458{ 2137{
2138 struct path *path = &filp->f_path;
1459 struct inode *inode = path->dentry->d_inode; 2139 struct inode *inode = path->dentry->d_inode;
1460 int error = get_write_access(inode); 2140 int error = get_write_access(inode);
1461 if (error) 2141 if (error)
@@ -1469,7 +2149,7 @@ static int handle_truncate(struct path *path)
1469 if (!error) { 2149 if (!error) {
1470 error = do_truncate(path->dentry, 0, 2150 error = do_truncate(path->dentry, 0,
1471 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2151 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1472 NULL); 2152 filp);
1473 } 2153 }
1474 put_write_access(inode); 2154 put_write_access(inode);
1475 return error; 2155 return error;
@@ -1496,6 +2176,7 @@ out_unlock:
1496 mutex_unlock(&dir->d_inode->i_mutex); 2176 mutex_unlock(&dir->d_inode->i_mutex);
1497 dput(nd->path.dentry); 2177 dput(nd->path.dentry);
1498 nd->path.dentry = path->dentry; 2178 nd->path.dentry = path->dentry;
2179
1499 if (error) 2180 if (error)
1500 return error; 2181 return error;
1501 /* Don't check for write permission, don't truncate */ 2182 /* Don't check for write permission, don't truncate */
@@ -1566,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
1566 } 2247 }
1567 if (!IS_ERR(filp)) { 2248 if (!IS_ERR(filp)) {
1568 if (will_truncate) { 2249 if (will_truncate) {
1569 error = handle_truncate(&nd->path); 2250 error = handle_truncate(filp);
1570 if (error) { 2251 if (error) {
1571 fput(filp); 2252 fput(filp);
1572 filp = ERR_PTR(error); 2253 filp = ERR_PTR(error);
@@ -1580,6 +2261,7 @@ static struct file *finish_open(struct nameidata *nd,
1580 */ 2261 */
1581 if (will_truncate) 2262 if (will_truncate)
1582 mnt_drop_write(nd->path.mnt); 2263 mnt_drop_write(nd->path.mnt);
2264 path_put(&nd->path);
1583 return filp; 2265 return filp;
1584 2266
1585exit: 2267exit:
@@ -1589,6 +2271,9 @@ exit:
1589 return ERR_PTR(error); 2271 return ERR_PTR(error);
1590} 2272}
1591 2273
2274/*
2275 * Handle O_CREAT case for do_filp_open
2276 */
1592static struct file *do_last(struct nameidata *nd, struct path *path, 2277static struct file *do_last(struct nameidata *nd, struct path *path,
1593 int open_flag, int acc_mode, 2278 int open_flag, int acc_mode,
1594 int mode, const char *pathname) 2279 int mode, const char *pathname)
@@ -1602,50 +2287,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1602 follow_dotdot(nd); 2287 follow_dotdot(nd);
1603 dir = nd->path.dentry; 2288 dir = nd->path.dentry;
1604 case LAST_DOT: 2289 case LAST_DOT:
1605 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 2290 if (need_reval_dot(dir)) {
1606 if (!dir->d_op->d_revalidate(dir, nd)) { 2291 int status = d_revalidate(nd->path.dentry, nd);
1607 error = -ESTALE; 2292 if (!status)
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
1608 goto exit; 2296 goto exit;
1609 } 2297 }
1610 } 2298 }
1611 /* fallthrough */ 2299 /* fallthrough */
1612 case LAST_ROOT: 2300 case LAST_ROOT:
1613 if (open_flag & O_CREAT) 2301 goto exit;
1614 goto exit;
1615 /* fallthrough */
1616 case LAST_BIND: 2302 case LAST_BIND:
1617 audit_inode(pathname, dir); 2303 audit_inode(pathname, dir);
1618 goto ok; 2304 goto ok;
1619 } 2305 }
1620 2306
1621 /* trailing slashes? */ 2307 /* trailing slashes? */
1622 if (nd->last.name[nd->last.len]) { 2308 if (nd->last.name[nd->last.len])
1623 if (open_flag & O_CREAT) 2309 goto exit;
1624 goto exit;
1625 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1626 }
1627
1628 /* just plain open? */
1629 if (!(open_flag & O_CREAT)) {
1630 error = do_lookup(nd, &nd->last, path);
1631 if (error)
1632 goto exit;
1633 error = -ENOENT;
1634 if (!path->dentry->d_inode)
1635 goto exit_dput;
1636 if (path->dentry->d_inode->i_op->follow_link)
1637 return NULL;
1638 error = -ENOTDIR;
1639 if (nd->flags & LOOKUP_DIRECTORY) {
1640 if (!path->dentry->d_inode->i_op->lookup)
1641 goto exit_dput;
1642 }
1643 path_to_nameidata(path, nd);
1644 audit_inode(pathname, nd->path.dentry);
1645 goto ok;
1646 }
1647 2310
1648 /* OK, it's O_CREAT */
1649 mutex_lock(&dir->d_inode->i_mutex); 2311 mutex_lock(&dir->d_inode->i_mutex);
1650 2312
1651 path->dentry = lookup_hash(nd); 2313 path->dentry = lookup_hash(nd);
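
In the reworked LAST_DOT branch above, a d_revalidate() result is folded into an errno: zero (the dentry is no longer valid) becomes -ESTALE, and negative results pass through unchanged. A minimal sketch of that mapping; the helper name is illustrative, not a kernel function:

#include <errno.h>

/* Fold a ->d_revalidate()-style result into an error code: 0 means
 * "stale", negative values are already errors, positive means valid. */
static int reval_to_errno(int status)
{
	if (status == 0)
		return -ESTALE;
	return status < 0 ? status : 0;
}
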
@@ -1681,6 +2343,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1681 } 2343 }
1682 filp = nameidata_to_filp(nd); 2344 filp = nameidata_to_filp(nd);
1683 mnt_drop_write(nd->path.mnt); 2345 mnt_drop_write(nd->path.mnt);
2346 path_put(&nd->path);
1684 if (!IS_ERR(filp)) { 2347 if (!IS_ERR(filp)) {
1685 error = ima_file_check(filp, acc_mode); 2348 error = ima_file_check(filp, acc_mode);
1686 if (error) { 2349 if (error) {
@@ -1701,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1701 if (open_flag & O_EXCL) 2364 if (open_flag & O_EXCL)
1702 goto exit_dput; 2365 goto exit_dput;
1703 2366
1704 if (__follow_mount(path)) { 2367 error = follow_managed(path, nd->flags);
1705 error = -ELOOP; 2368 if (error < 0)
1706 if (open_flag & O_NOFOLLOW) 2369 goto exit_dput;
1707 goto exit_dput;
1708 }
1709 2370
1710 error = -ENOENT; 2371 error = -ENOENT;
1711 if (!path->dentry->d_inode) 2372 if (!path->dentry->d_inode)
@@ -1715,8 +2376,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1715 return NULL; 2376 return NULL;
1716 2377
1717 path_to_nameidata(path, nd); 2378 path_to_nameidata(path, nd);
2379 nd->inode = path->dentry->d_inode;
1718 error = -EISDIR; 2380 error = -EISDIR;
1719 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2381 if (S_ISDIR(nd->inode->i_mode))
1720 goto exit; 2382 goto exit;
1721ok: 2383ok:
1722 filp = finish_open(nd, open_flag, acc_mode); 2384 filp = finish_open(nd, open_flag, acc_mode);
@@ -1747,11 +2409,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
1747 struct path path; 2409 struct path path;
1748 int count = 0; 2410 int count = 0;
1749 int flag = open_to_namei_flags(open_flag); 2411 int flag = open_to_namei_flags(open_flag);
1750 int force_reval = 0; 2412 int flags;
1751 2413
1752 if (!(open_flag & O_CREAT)) 2414 if (!(open_flag & O_CREAT))
1753 mode = 0; 2415 mode = 0;
1754 2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
1755 /* 2420 /*
1756 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
 1757 * check for O_DSYNC if they need any syncing at all we enforce it's 2422 * check for O_DSYNC if they need any syncing at all we enforce it's
@@ -1773,54 +2438,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
1773 if (open_flag & O_APPEND) 2438 if (open_flag & O_APPEND)
1774 acc_mode |= MAY_APPEND; 2439 acc_mode |= MAY_APPEND;
1775 2440
1776 /* find the parent */ 2441 flags = LOOKUP_OPEN;
1777reval: 2442 if (open_flag & O_CREAT) {
1778 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451
2452 filp = get_empty_filp();
2453 if (!filp)
2454 return ERR_PTR(-ENFILE);
2455
2456 filp->f_flags = open_flag;
2457 nd.intent.open.file = filp;
2458 nd.intent.open.flags = flag;
2459 nd.intent.open.create_mode = mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463
2464 /* !O_CREAT, simple open */
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error))
2467 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
1779 if (error) 2486 if (error)
1780 return ERR_PTR(error); 2487 goto out_filp;
1781 if (force_reval) 2488 error = path_walk_rcu(pathname, &nd);
1782 nd.flags |= LOOKUP_REVAL; 2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
1783 2500
1784 current->total_link_count = 0; 2501 error = path_walk_simple(pathname, &nd);
1785 error = link_path_walk(pathname, &nd);
1786 if (error) {
1787 filp = ERR_PTR(error);
1788 goto out;
1789 } 2502 }
1790 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) 2503 if (unlikely(error))
2504 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
1791 audit_inode(pathname, nd.path.dentry); 2506 audit_inode(pathname, nd.path.dentry);
1792 2507
1793 /* 2508 /*
1794 * We have the parent and last component. 2509 * We have the parent and last component.
1795 */ 2510 */
1796 2511 nd.flags = flags;
1797 error = -ENFILE;
1798 filp = get_empty_filp();
1799 if (filp == NULL)
1800 goto exit_parent;
1801 nd.intent.open.file = filp;
1802 filp->f_flags = open_flag;
1803 nd.intent.open.flags = flag;
1804 nd.intent.open.create_mode = mode;
1805 nd.flags &= ~LOOKUP_PARENT;
1806 nd.flags |= LOOKUP_OPEN;
1807 if (open_flag & O_CREAT) {
1808 nd.flags |= LOOKUP_CREATE;
1809 if (open_flag & O_EXCL)
1810 nd.flags |= LOOKUP_EXCL;
1811 }
1812 if (open_flag & O_DIRECTORY)
1813 nd.flags |= LOOKUP_DIRECTORY;
1814 if (!(open_flag & O_NOFOLLOW))
1815 nd.flags |= LOOKUP_FOLLOW;
1816 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1817 while (unlikely(!filp)) { /* trailing symlink */ 2513 while (unlikely(!filp)) { /* trailing symlink */
1818 struct path holder; 2514 struct path link = path;
1819 struct inode *inode = path.dentry->d_inode; 2515 struct inode *linki = link.dentry->d_inode;
1820 void *cookie; 2516 void *cookie;
1821 error = -ELOOP; 2517 error = -ELOOP;
1822 /* S_ISDIR part is a temporary automount kludge */ 2518 if (!(nd.flags & LOOKUP_FOLLOW))
1823 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
1824 goto exit_dput; 2519 goto exit_dput;
1825 if (count++ == 32) 2520 if (count++ == 32)
1826 goto exit_dput; 2521 goto exit_dput;
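
The creat path above encodes the rcu-walk retry discipline: try the lock-free walk first, fall back to the locked walk on -ECHILD, and force revalidation before retrying on -ESTALE. A stripped-down sketch of that control flow; the walker callbacks and the REVAL bit are stand-ins, not kernel interfaces:

#include <errno.h>

#define WALK_REVAL 0x01	/* stand-in for LOOKUP_REVAL */

static int walk_with_fallback(const char *path, int flags,
			      int (*walk_rcu)(const char *, int),
			      int (*walk_locked)(const char *, int))
{
	int err = walk_rcu(path, flags);

	while (err == -ECHILD || err == -ESTALE) {
		if (err == -ESTALE)
			flags |= WALK_REVAL;	/* distrust cached state */
		err = walk_locked(path, flags);
		if (err != -ESTALE || (flags & WALK_REVAL))
			break;			/* at most one reval retry */
	}
	return err;
}
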
@@ -1836,41 +2531,37 @@ reval:
1836 * just set LAST_BIND. 2531 * just set LAST_BIND.
1837 */ 2532 */
1838 nd.flags |= LOOKUP_PARENT; 2533 nd.flags |= LOOKUP_PARENT;
1839 error = security_inode_follow_link(path.dentry, &nd); 2534 error = security_inode_follow_link(link.dentry, &nd);
1840 if (error) 2535 if (error)
1841 goto exit_dput; 2536 goto exit_dput;
1842 error = __do_follow_link(&path, &nd, &cookie); 2537 error = __do_follow_link(&link, &nd, &cookie);
1843 if (unlikely(error)) { 2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
1844 /* nd.path had been dropped */ 2541 /* nd.path had been dropped */
1845 if (!IS_ERR(cookie) && inode->i_op->put_link) 2542 nd.path = link;
1846 inode->i_op->put_link(path.dentry, &nd, cookie); 2543 goto out_path;
1847 path_put(&path);
1848 release_open_intent(&nd);
1849 filp = ERR_PTR(error);
1850 goto out;
1851 } 2544 }
1852 holder = path;
1853 nd.flags &= ~LOOKUP_PARENT; 2545 nd.flags &= ~LOOKUP_PARENT;
1854 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1855 if (inode->i_op->put_link) 2547 if (linki->i_op->put_link)
1856 inode->i_op->put_link(holder.dentry, &nd, cookie); 2548 linki->i_op->put_link(link.dentry, &nd, cookie);
1857 path_put(&holder); 2549 path_put(&link);
1858 } 2550 }
1859out: 2551out:
1860 if (nd.root.mnt) 2552 if (nd.root.mnt)
1861 path_put(&nd.root); 2553 path_put(&nd.root);
1862 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
1863 force_reval = 1;
1864 goto reval; 2555 goto reval;
1865 }
1866 return filp; 2556 return filp;
1867 2557
1868exit_dput: 2558exit_dput:
1869 path_put_conditional(&path, &nd); 2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp:
1870 if (!IS_ERR(nd.intent.open.file)) 2563 if (!IS_ERR(nd.intent.open.file))
1871 release_open_intent(&nd); 2564 release_open_intent(&nd);
1872exit_parent:
1873 path_put(&nd.path);
1874 filp = ERR_PTR(error); 2565 filp = ERR_PTR(error);
1875 goto out; 2566 goto out;
1876} 2567}
@@ -2131,12 +2822,10 @@ void dentry_unhash(struct dentry *dentry)
2131{ 2822{
2132 dget(dentry); 2823 dget(dentry);
2133 shrink_dcache_parent(dentry); 2824 shrink_dcache_parent(dentry);
2134 spin_lock(&dcache_lock);
2135 spin_lock(&dentry->d_lock); 2825 spin_lock(&dentry->d_lock);
2136 if (atomic_read(&dentry->d_count) == 2) 2826 if (dentry->d_count == 2)
2137 __d_drop(dentry); 2827 __d_drop(dentry);
2138 spin_unlock(&dentry->d_lock); 2828 spin_unlock(&dentry->d_lock);
2139 spin_unlock(&dcache_lock);
2140} 2829}
2141 2830
2142int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2831int vfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -2291,7 +2980,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2291 goto slashes; 2980 goto slashes;
2292 inode = dentry->d_inode; 2981 inode = dentry->d_inode;
2293 if (inode) 2982 if (inode)
2294 atomic_inc(&inode->i_count); 2983 ihold(inode);
2295 error = mnt_want_write(nd.path.mnt); 2984 error = mnt_want_write(nd.path.mnt);
2296 if (error) 2985 if (error)
2297 goto exit2; 2986 goto exit2;
@@ -2885,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
2885}; 3574};
2886 3575
2887EXPORT_SYMBOL(user_path_at); 3576EXPORT_SYMBOL(user_path_at);
3577EXPORT_SYMBOL(follow_down_one);
2888EXPORT_SYMBOL(follow_down); 3578EXPORT_SYMBOL(follow_down);
2889EXPORT_SYMBOL(follow_up); 3579EXPORT_SYMBOL(follow_up);
2890EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3580EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f2..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/smp_lock.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
19#include <linux/acct.h> 18#include <linux/acct.h>
@@ -139,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
139 mnt->mnt_group_id = 0; 138 mnt->mnt_group_id = 0;
140} 139}
141 140
141/*
142 * vfsmount lock must be held for read
143 */
144static inline void mnt_add_count(struct vfsmount *mnt, int n)
145{
146#ifdef CONFIG_SMP
147 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
148#else
149 preempt_disable();
150 mnt->mnt_count += n;
151 preempt_enable();
152#endif
153}
154
155static inline void mnt_set_count(struct vfsmount *mnt, int n)
156{
157#ifdef CONFIG_SMP
158 this_cpu_write(mnt->mnt_pcp->mnt_count, n);
159#else
160 mnt->mnt_count = n;
161#endif
162}
163
164/*
165 * vfsmount lock must be held for read
166 */
167static inline void mnt_inc_count(struct vfsmount *mnt)
168{
169 mnt_add_count(mnt, 1);
170}
171
172/*
173 * vfsmount lock must be held for read
174 */
175static inline void mnt_dec_count(struct vfsmount *mnt)
176{
177 mnt_add_count(mnt, -1);
178}
179
180/*
181 * vfsmount lock must be held for write
182 */
183unsigned int mnt_get_count(struct vfsmount *mnt)
184{
185#ifdef CONFIG_SMP
186 unsigned int count = 0;
187 int cpu;
188
189 for_each_possible_cpu(cpu) {
190 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
191 }
192
193 return count;
194#else
195 return mnt->mnt_count;
196#endif
197}
198
142struct vfsmount *alloc_vfsmnt(const char *name) 199struct vfsmount *alloc_vfsmnt(const char *name)
143{ 200{
144 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
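
mnt_add_count()/mnt_get_count() above form a distributed counter: updates touch only the local CPU's slot under the shared (read) side of the lock, and an exact total takes the exclusive (write) side so no slot can move mid-sum. A userspace model of the same pattern with pthreads; the slot index and rwlock stand in for per-cpu data and vfsmount_lock:

#include <pthread.h>

#define NR_SLOTS 64	/* stand-in for the number of CPUs */

struct pcp_counter {
	pthread_rwlock_t lock;	/* models vfsmount_lock */
	long slot[NR_SLOTS];	/* models the per-cpu mnt_count */
};

/* Cheap path: many adders run concurrently under the read lock, each
 * touching only its own slot (callers pass a unique slot index). */
static void pcp_add(struct pcp_counter *c, int self, long n)
{
	pthread_rwlock_rdlock(&c->lock);
	c->slot[self] += n;
	pthread_rwlock_unlock(&c->lock);
}

/* Exact total: the write lock excludes all adders, so the sum of the
 * slots cannot change while we scan. */
static long pcp_total(struct pcp_counter *c)
{
	long sum = 0;
	int i;

	pthread_rwlock_wrlock(&c->lock);
	for (i = 0; i < NR_SLOTS; i++)
		sum += c->slot[i];
	pthread_rwlock_unlock(&c->lock);
	return sum;
}
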
@@ -155,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
155 goto out_free_id; 212 goto out_free_id;
156 } 213 }
157 214
158 atomic_set(&mnt->mnt_count, 1); 215#ifdef CONFIG_SMP
216 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
217 if (!mnt->mnt_pcp)
218 goto out_free_devname;
219
220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
221#else
222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0;
224#endif
225
159 INIT_LIST_HEAD(&mnt->mnt_hash); 226 INIT_LIST_HEAD(&mnt->mnt_hash);
160 INIT_LIST_HEAD(&mnt->mnt_child); 227 INIT_LIST_HEAD(&mnt->mnt_child);
161 INIT_LIST_HEAD(&mnt->mnt_mounts); 228 INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -167,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
167#ifdef CONFIG_FSNOTIFY 234#ifdef CONFIG_FSNOTIFY
168 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 235 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
169#endif 236#endif
170#ifdef CONFIG_SMP
171 mnt->mnt_writers = alloc_percpu(int);
172 if (!mnt->mnt_writers)
173 goto out_free_devname;
174#else
175 mnt->mnt_writers = 0;
176#endif
177 } 237 }
178 return mnt; 238 return mnt;
179 239
@@ -217,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
217} 277}
218EXPORT_SYMBOL_GPL(__mnt_is_readonly); 278EXPORT_SYMBOL_GPL(__mnt_is_readonly);
219 279
220static inline void inc_mnt_writers(struct vfsmount *mnt) 280static inline void mnt_inc_writers(struct vfsmount *mnt)
221{ 281{
222#ifdef CONFIG_SMP 282#ifdef CONFIG_SMP
223 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; 283 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
224#else 284#else
225 mnt->mnt_writers++; 285 mnt->mnt_writers++;
226#endif 286#endif
227} 287}
228 288
229static inline void dec_mnt_writers(struct vfsmount *mnt) 289static inline void mnt_dec_writers(struct vfsmount *mnt)
230{ 290{
231#ifdef CONFIG_SMP 291#ifdef CONFIG_SMP
232 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; 292 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
233#else 293#else
234 mnt->mnt_writers--; 294 mnt->mnt_writers--;
235#endif 295#endif
236} 296}
237 297
238static unsigned int count_mnt_writers(struct vfsmount *mnt) 298static unsigned int mnt_get_writers(struct vfsmount *mnt)
239{ 299{
240#ifdef CONFIG_SMP 300#ifdef CONFIG_SMP
241 unsigned int count = 0; 301 unsigned int count = 0;
242 int cpu; 302 int cpu;
243 303
244 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
245 count += *per_cpu_ptr(mnt->mnt_writers, cpu); 305 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
246 } 306 }
247 307
248 return count; 308 return count;
@@ -274,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
274 int ret = 0; 334 int ret = 0;
275 335
276 preempt_disable(); 336 preempt_disable();
277 inc_mnt_writers(mnt); 337 mnt_inc_writers(mnt);
278 /* 338 /*
279 * The store to inc_mnt_writers must be visible before we pass 339 * The store to mnt_inc_writers must be visible before we pass
280 * MNT_WRITE_HOLD loop below, so that the slowpath can see our 340 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
281 * incremented count after it has set MNT_WRITE_HOLD. 341 * incremented count after it has set MNT_WRITE_HOLD.
282 */ 342 */
@@ -290,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
290 */ 350 */
291 smp_rmb(); 351 smp_rmb();
292 if (__mnt_is_readonly(mnt)) { 352 if (__mnt_is_readonly(mnt)) {
293 dec_mnt_writers(mnt); 353 mnt_dec_writers(mnt);
294 ret = -EROFS; 354 ret = -EROFS;
295 goto out; 355 goto out;
296 } 356 }
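
The two hunks above keep the mnt_want_write() handshake intact across the rename: a prospective writer publishes its increment, spins while MNT_WRITE_HOLD is set, then re-checks read-only state, so the remount-r/o slowpath sees either the increment or the writer backing out. A C11-atomics sketch of that handshake; the fences approximate smp_mb()/smp_rmb() and all names are illustrative:

#include <stdatomic.h>

static atomic_long writers;	/* models the per-mount writer count */
static atomic_int  write_hold;	/* models MNT_WRITE_HOLD */
static atomic_int  readonly;	/* models MNT_READONLY */

static int want_write(void)
{
	atomic_fetch_add_explicit(&writers, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb() */
	while (atomic_load_explicit(&write_hold, memory_order_relaxed))
		;	/* wait for the r/o transition to decide */
	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	if (atomic_load_explicit(&readonly, memory_order_relaxed)) {
		atomic_fetch_sub_explicit(&writers, 1, memory_order_relaxed);
		return -1;	/* went read-only: back out (-EROFS) */
	}
	return 0;
}
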
@@ -318,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
318 if (__mnt_is_readonly(mnt)) 378 if (__mnt_is_readonly(mnt))
319 return -EROFS; 379 return -EROFS;
320 preempt_disable(); 380 preempt_disable();
321 inc_mnt_writers(mnt); 381 mnt_inc_writers(mnt);
322 preempt_enable(); 382 preempt_enable();
323 return 0; 383 return 0;
324} 384}
@@ -352,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
352void mnt_drop_write(struct vfsmount *mnt) 412void mnt_drop_write(struct vfsmount *mnt)
353{ 413{
354 preempt_disable(); 414 preempt_disable();
355 dec_mnt_writers(mnt); 415 mnt_dec_writers(mnt);
356 preempt_enable(); 416 preempt_enable();
357} 417}
358EXPORT_SYMBOL_GPL(mnt_drop_write); 418EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -385,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
385 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while 445 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
386 * we're counting up here. 446 * we're counting up here.
387 */ 447 */
388 if (count_mnt_writers(mnt) > 0) 448 if (mnt_get_writers(mnt) > 0)
389 ret = -EBUSY; 449 ret = -EBUSY;
390 else 450 else
391 mnt->mnt_flags |= MNT_READONLY; 451 mnt->mnt_flags |= MNT_READONLY;
@@ -419,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
419 kfree(mnt->mnt_devname); 479 kfree(mnt->mnt_devname);
420 mnt_free_id(mnt); 480 mnt_free_id(mnt);
421#ifdef CONFIG_SMP 481#ifdef CONFIG_SMP
422 free_percpu(mnt->mnt_writers); 482 free_percpu(mnt->mnt_pcp);
423#endif 483#endif
424 kmem_cache_free(mnt_cache, mnt); 484 kmem_cache_free(mnt_cache, mnt);
425} 485}
@@ -493,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
493} 553}
494 554
495/* 555/*
556 * Clear dentry's mounted state if it has no remaining mounts.
557 * vfsmount_lock must be held for write.
558 */
559static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
560{
561 unsigned u;
562
563 for (u = 0; u < HASH_SIZE; u++) {
564 struct vfsmount *p;
565
566 list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
567 if (p->mnt_mountpoint == dentry)
568 return;
569 }
570 }
571 spin_lock(&dentry->d_lock);
572 dentry->d_flags &= ~DCACHE_MOUNTED;
573 spin_unlock(&dentry->d_lock);
574}
575
576/*
496 * vfsmount lock must be held for write 577 * vfsmount lock must be held for write
497 */ 578 */
498static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 579static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
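
dentry_reset_mounted() above replaces the old d_mounted counter with a flag that may only be cleared after proving no entry in the mount hash still names this dentry as a mountpoint. A compact, self-contained sketch of that scan-then-clear rule (all types here are illustrative):

#define FLAG_MOUNTED 0x1	/* stand-in for DCACHE_MOUNTED */

struct mount_rec {
	struct mount_rec *next;	/* hash-chain link */
	const void *mountpoint;	/* the dentry this mount sits on */
};

/* Clear the "something is mounted here" hint only when no record in
 * any bucket still points at this dentry; the caller holds the
 * table's write-side lock, as the comment above requires. */
static void reset_mounted(struct mount_rec *const *buckets,
			  unsigned int nbuckets,
			  const void *dentry, unsigned int *d_flags)
{
	unsigned int u;

	for (u = 0; u < nbuckets; u++) {
		const struct mount_rec *p;

		for (p = buckets[u]; p; p = p->next)
			if (p->mountpoint == dentry)
				return;	/* still a mountpoint: keep it */
	}
	*d_flags &= ~FLAG_MOUNTED;
}
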
@@ -503,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
503 mnt->mnt_mountpoint = mnt->mnt_root; 584 mnt->mnt_mountpoint = mnt->mnt_root;
504 list_del_init(&mnt->mnt_child); 585 list_del_init(&mnt->mnt_child);
505 list_del_init(&mnt->mnt_hash); 586 list_del_init(&mnt->mnt_hash);
506 old_path->dentry->d_mounted--; 587 dentry_reset_mounted(old_path->mnt, old_path->dentry);
507} 588}
508 589
509/* 590/*
@@ -514,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
514{ 595{
515 child_mnt->mnt_parent = mntget(mnt); 596 child_mnt->mnt_parent = mntget(mnt);
516 child_mnt->mnt_mountpoint = dget(dentry); 597 child_mnt->mnt_mountpoint = dget(dentry);
517 dentry->d_mounted++; 598 spin_lock(&dentry->d_lock);
599 dentry->d_flags |= DCACHE_MOUNTED;
600 spin_unlock(&dentry->d_lock);
518} 601}
519 602
520/* 603/*
@@ -528,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
528 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
529} 612}
530 613
614static inline void __mnt_make_longterm(struct vfsmount *mnt)
615{
616#ifdef CONFIG_SMP
617 atomic_inc(&mnt->mnt_longterm);
618#endif
619}
620
621/* needs vfsmount lock for write */
622static inline void __mnt_make_shortterm(struct vfsmount *mnt)
623{
624#ifdef CONFIG_SMP
625 atomic_dec(&mnt->mnt_longterm);
626#endif
627}
628
531/* 629/*
532 * vfsmount lock must be held for write 630 * vfsmount lock must be held for write
533 */ 631 */
@@ -541,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
541 BUG_ON(parent == mnt); 639 BUG_ON(parent == mnt);
542 640
543 list_add_tail(&head, &mnt->mnt_list); 641 list_add_tail(&head, &mnt->mnt_list);
544 list_for_each_entry(m, &head, mnt_list) 642 list_for_each_entry(m, &head, mnt_list) {
545 m->mnt_ns = n; 643 m->mnt_ns = n;
644 __mnt_make_longterm(m);
645 }
646
546 list_splice(&head, n->list.prev); 647 list_splice(&head, n->list.prev);
547 648
548 list_add_tail(&mnt->mnt_hash, mount_hashtable + 649 list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -595,7 +696,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
595 goto out_free; 696 goto out_free;
596 } 697 }
597 698
598 mnt->mnt_flags = old->mnt_flags; 699 mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
599 atomic_inc(&sb->s_active); 700 atomic_inc(&sb->s_active);
600 mnt->mnt_sb = sb; 701 mnt->mnt_sb = sb;
601 mnt->mnt_root = dget(root); 702 mnt->mnt_root = dget(root);
@@ -630,9 +731,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
630 return NULL; 731 return NULL;
631} 732}
632 733
633static inline void __mntput(struct vfsmount *mnt) 734static inline void mntfree(struct vfsmount *mnt)
634{ 735{
635 struct super_block *sb = mnt->mnt_sb; 736 struct super_block *sb = mnt->mnt_sb;
737
636 /* 738 /*
637 * This probably indicates that somebody messed 739 * This probably indicates that somebody messed
638 * up a mnt_want/drop_write() pair. If this 740 * up a mnt_want/drop_write() pair. If this
@@ -640,38 +742,69 @@ static inline void __mntput(struct vfsmount *mnt)
640 * to make r/w->r/o transitions. 742 * to make r/w->r/o transitions.
641 */ 743 */
642 /* 744 /*
643 * atomic_dec_and_lock() used to deal with ->mnt_count decrements 745 * The locking used to deal with mnt_count decrement provides barriers,
644 * provides barriers, so count_mnt_writers() below is safe. AV 746 * so mnt_get_writers() below is safe.
645 */ 747 */
646 WARN_ON(count_mnt_writers(mnt)); 748 WARN_ON(mnt_get_writers(mnt));
647 fsnotify_vfsmount_delete(mnt); 749 fsnotify_vfsmount_delete(mnt);
648 dput(mnt->mnt_root); 750 dput(mnt->mnt_root);
649 free_vfsmnt(mnt); 751 free_vfsmnt(mnt);
650 deactivate_super(sb); 752 deactivate_super(sb);
651} 753}
652 754
653void mntput_no_expire(struct vfsmount *mnt) 755static void mntput_no_expire(struct vfsmount *mnt)
654{ 756{
655repeat: 757put_again:
656 if (atomic_add_unless(&mnt->mnt_count, -1, 1)) 758#ifdef CONFIG_SMP
759 br_read_lock(vfsmount_lock);
760 if (likely(atomic_read(&mnt->mnt_longterm))) {
761 mnt_dec_count(mnt);
762 br_read_unlock(vfsmount_lock);
657 return; 763 return;
764 }
765 br_read_unlock(vfsmount_lock);
766
658 br_write_lock(vfsmount_lock); 767 br_write_lock(vfsmount_lock);
659 if (!atomic_dec_and_test(&mnt->mnt_count)) { 768 mnt_dec_count(mnt);
769 if (mnt_get_count(mnt)) {
660 br_write_unlock(vfsmount_lock); 770 br_write_unlock(vfsmount_lock);
661 return; 771 return;
662 } 772 }
663 if (likely(!mnt->mnt_pinned)) { 773#else
664 br_write_unlock(vfsmount_lock); 774 mnt_dec_count(mnt);
665 __mntput(mnt); 775 if (likely(mnt_get_count(mnt)))
666 return; 776 return;
777 br_write_lock(vfsmount_lock);
778#endif
779 if (unlikely(mnt->mnt_pinned)) {
780 mnt_add_count(mnt, mnt->mnt_pinned + 1);
781 mnt->mnt_pinned = 0;
782 br_write_unlock(vfsmount_lock);
783 acct_auto_close_mnt(mnt);
784 goto put_again;
667 } 785 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock); 786 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt); 787 mntfree(mnt);
672 goto repeat;
673} 788}
674EXPORT_SYMBOL(mntput_no_expire); 789
790void mntput(struct vfsmount *mnt)
791{
792 if (mnt) {
793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
794 if (unlikely(mnt->mnt_expiry_mark))
795 mnt->mnt_expiry_mark = 0;
796 mntput_no_expire(mnt);
797 }
798}
799EXPORT_SYMBOL(mntput);
800
801struct vfsmount *mntget(struct vfsmount *mnt)
802{
803 if (mnt)
804 mnt_inc_count(mnt);
805 return mnt;
806}
807EXPORT_SYMBOL(mntget);
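
The mntput_no_expire() rewrite above is the payoff of the per-cpu counters: while a mount is marked long-term (held by a namespace), a put only decrements the local slot under the shared lock, and the exact is-it-zero question is deferred to the rare case where the mount can actually die. A pthread model of the two paths; types, slot indexing, and names are illustrative:

#include <pthread.h>
#include <stdbool.h>

#define NR_SLOTS 64

struct mnt_model {
	pthread_rwlock_t lock;	/* models vfsmount_lock */
	int longterm;		/* models mnt_longterm > 0 */
	long count[NR_SLOTS];	/* models the per-cpu mnt_count */
};

/* Returns true when the caller dropped the final reference and must
 * free the mount, mirroring the fast/slow split above. */
static bool mnt_put(struct mnt_model *m, int self)
{
	long sum = 0;
	int i;

	pthread_rwlock_rdlock(&m->lock);
	if (m->longterm) {		/* fast path: namespace pins it */
		m->count[self]--;
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	pthread_rwlock_unlock(&m->lock);

	pthread_rwlock_wrlock(&m->lock);	/* slow path: exact sum */
	m->count[self]--;
	for (i = 0; i < NR_SLOTS; i++)
		sum += m->count[i];
	pthread_rwlock_unlock(&m->lock);
	return sum == 0;
}
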
675 808
676void mnt_pin(struct vfsmount *mnt) 809void mnt_pin(struct vfsmount *mnt)
677{ 810{
@@ -679,19 +812,17 @@ void mnt_pin(struct vfsmount *mnt)
679 mnt->mnt_pinned++; 812 mnt->mnt_pinned++;
680 br_write_unlock(vfsmount_lock); 813 br_write_unlock(vfsmount_lock);
681} 814}
682
683EXPORT_SYMBOL(mnt_pin); 815EXPORT_SYMBOL(mnt_pin);
684 816
685void mnt_unpin(struct vfsmount *mnt) 817void mnt_unpin(struct vfsmount *mnt)
686{ 818{
687 br_write_lock(vfsmount_lock); 819 br_write_lock(vfsmount_lock);
688 if (mnt->mnt_pinned) { 820 if (mnt->mnt_pinned) {
689 atomic_inc(&mnt->mnt_count); 821 mnt_inc_count(mnt);
690 mnt->mnt_pinned--; 822 mnt->mnt_pinned--;
691 } 823 }
692 br_write_unlock(vfsmount_lock); 824 br_write_unlock(vfsmount_lock);
693} 825}
694
695EXPORT_SYMBOL(mnt_unpin); 826EXPORT_SYMBOL(mnt_unpin);
696 827
697static inline void mangle(struct seq_file *m, const char *s) 828static inline void mangle(struct seq_file *m, const char *s)
@@ -986,12 +1117,13 @@ int may_umount_tree(struct vfsmount *mnt)
986 int minimum_refs = 0; 1117 int minimum_refs = 0;
987 struct vfsmount *p; 1118 struct vfsmount *p;
988 1119
989 br_read_lock(vfsmount_lock); 1120 /* write lock needed for mnt_get_count */
1121 br_write_lock(vfsmount_lock);
990 for (p = mnt; p; p = next_mnt(p, mnt)) { 1122 for (p = mnt; p; p = next_mnt(p, mnt)) {
991 actual_refs += atomic_read(&p->mnt_count); 1123 actual_refs += mnt_get_count(p);
992 minimum_refs += 2; 1124 minimum_refs += 2;
993 } 1125 }
994 br_read_unlock(vfsmount_lock); 1126 br_write_unlock(vfsmount_lock);
995 1127
996 if (actual_refs > minimum_refs) 1128 if (actual_refs > minimum_refs)
997 return 0; 1129 return 0;
@@ -1018,10 +1150,10 @@ int may_umount(struct vfsmount *mnt)
1018{ 1150{
1019 int ret = 1; 1151 int ret = 1;
1020 down_read(&namespace_sem); 1152 down_read(&namespace_sem);
1021 br_read_lock(vfsmount_lock); 1153 br_write_lock(vfsmount_lock);
1022 if (propagate_mount_busy(mnt, 2)) 1154 if (propagate_mount_busy(mnt, 2))
1023 ret = 0; 1155 ret = 0;
1024 br_read_unlock(vfsmount_lock); 1156 br_write_unlock(vfsmount_lock);
1025 up_read(&namespace_sem); 1157 up_read(&namespace_sem);
1026 return ret; 1158 return ret;
1027} 1159}
@@ -1058,26 +1190,29 @@ void release_mounts(struct list_head *head)
1058 */ 1190 */
1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1191void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1060{ 1192{
1193 LIST_HEAD(tmp_list);
1061 struct vfsmount *p; 1194 struct vfsmount *p;
1062 1195
1063 for (p = mnt; p; p = next_mnt(p, mnt)) 1196 for (p = mnt; p; p = next_mnt(p, mnt))
1064 list_move(&p->mnt_hash, kill); 1197 list_move(&p->mnt_hash, &tmp_list);
1065 1198
1066 if (propagate) 1199 if (propagate)
1067 propagate_umount(kill); 1200 propagate_umount(&tmp_list);
1068 1201
1069 list_for_each_entry(p, kill, mnt_hash) { 1202 list_for_each_entry(p, &tmp_list, mnt_hash) {
1070 list_del_init(&p->mnt_expire); 1203 list_del_init(&p->mnt_expire);
1071 list_del_init(&p->mnt_list); 1204 list_del_init(&p->mnt_list);
1072 __touch_mnt_namespace(p->mnt_ns); 1205 __touch_mnt_namespace(p->mnt_ns);
1073 p->mnt_ns = NULL; 1206 p->mnt_ns = NULL;
1207 __mnt_make_shortterm(p);
1074 list_del_init(&p->mnt_child); 1208 list_del_init(&p->mnt_child);
1075 if (p->mnt_parent != p) { 1209 if (p->mnt_parent != p) {
1076 p->mnt_parent->mnt_ghosts++; 1210 p->mnt_parent->mnt_ghosts++;
1077 p->mnt_mountpoint->d_mounted--; 1211 dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
1078 } 1212 }
1079 change_mnt_propagation(p, MS_PRIVATE); 1213 change_mnt_propagation(p, MS_PRIVATE);
1080 } 1214 }
1215 list_splice(&tmp_list, kill);
1081} 1216}
1082 1217
1083static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1218static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1103,8 +1238,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
1103 flags & (MNT_FORCE | MNT_DETACH)) 1238 flags & (MNT_FORCE | MNT_DETACH))
1104 return -EINVAL; 1239 return -EINVAL;
1105 1240
1106 if (atomic_read(&mnt->mnt_count) != 2) 1241 /*
1242 * probably don't strictly need the lock here if we examined
1243 * all race cases, but it's a slowpath.
1244 */
1245 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) {
 1247 br_write_unlock(vfsmount_lock);
1107 return -EBUSY; 1248 return -EBUSY;
1249 }
1250 br_write_unlock(vfsmount_lock);
1108 1251
1109 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1252 if (!xchg(&mnt->mnt_expiry_mark, 1))
1110 return -EAGAIN; 1253 return -EAGAIN;
@@ -1668,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
1668 return err; 1811 return err;
1669 1812
1670 down_write(&namespace_sem); 1813 down_write(&namespace_sem);
1671 while (d_mountpoint(path->dentry) && 1814 err = follow_down(path, true);
1672 follow_down(path)) 1815 if (err < 0)
1673 ; 1816 goto out;
1817
1674 err = -EINVAL; 1818 err = -EINVAL;
1675 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1676 goto out; 1820 goto out;
@@ -1728,6 +1872,8 @@ out:
1728 return err; 1872 return err;
1729} 1873}
1730 1874
1875static int do_add_mount(struct vfsmount *, struct path *, int);
1876
1731/* 1877/*
1732 * create a new mount for userspace and request it to be added into the 1878 * create a new mount for userspace and request it to be added into the
1733 * namespace's tree 1879 * namespace's tree
@@ -1736,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1736 int mnt_flags, char *name, void *data) 1882 int mnt_flags, char *name, void *data)
1737{ 1883{
1738 struct vfsmount *mnt; 1884 struct vfsmount *mnt;
1885 int err;
1739 1886
1740 if (!type) 1887 if (!type)
1741 return -EINVAL; 1888 return -EINVAL;
@@ -1744,21 +1891,51 @@ static int do_new_mount(struct path *path, char *type, int flags,
1744 if (!capable(CAP_SYS_ADMIN)) 1891 if (!capable(CAP_SYS_ADMIN))
1745 return -EPERM; 1892 return -EPERM;
1746 1893
1747 lock_kernel();
1748 mnt = do_kern_mount(type, flags, name, data); 1894 mnt = do_kern_mount(type, flags, name, data);
1749 unlock_kernel();
1750 if (IS_ERR(mnt)) 1895 if (IS_ERR(mnt))
1751 return PTR_ERR(mnt); 1896 return PTR_ERR(mnt);
1752 1897
1753 return do_add_mount(mnt, path, mnt_flags, NULL); 1898 err = do_add_mount(mnt, path, mnt_flags);
1899 if (err)
1900 mntput(mnt);
1901 return err;
1902}
1903
1904int finish_automount(struct vfsmount *m, struct path *path)
1905{
1906 int err;
 1907 /* The new mount record should have at least 2 refs to prevent it from
 1908 * being expired before we get a chance to add it
1909 */
1910 BUG_ON(mnt_get_count(m) < 2);
1911
1912 if (m->mnt_sb == path->mnt->mnt_sb &&
1913 m->mnt_root == path->dentry) {
1914 err = -ELOOP;
1915 goto fail;
1916 }
1917
1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
1919 if (!err)
1920 return 0;
1921fail:
1922 /* remove m from any expiration list it may be on */
1923 if (!list_empty(&m->mnt_expire)) {
1924 down_write(&namespace_sem);
1925 br_write_lock(vfsmount_lock);
1926 list_del_init(&m->mnt_expire);
1927 br_write_unlock(vfsmount_lock);
1928 up_write(&namespace_sem);
1929 }
1930 mntput(m);
1931 mntput(m);
1932 return err;
1754} 1933}
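
finish_automount() above receives a mount carrying two references, one pinning it against expiry and one for the caller, which is why its failure path ends in two mntput() calls. A tiny refcount sketch of that ownership contract; this is an illustrative model, not kernel code:

#include <stdlib.h>

struct obj {
	int refs;
};

static void obj_put(struct obj *o)
{
	if (--o->refs == 0)
		free(o);
}

/* On success the mount tree adopts the references; on failure the
 * helper must drop both the anti-expiry reference and the caller's. */
static int attach_or_drop(struct obj *o, int attached)
{
	if (attached)
		return 0;
	obj_put(o);	/* reference that blocked expiry */
	obj_put(o);	/* reference the caller handed in */
	return -1;
}
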
1755 1934
1756/* 1935/*
1757 * add a mount into a namespace's mount tree 1936 * add a mount into a namespace's mount tree
1758 * - provide the option of adding the new mount to an expiration list
1759 */ 1937 */
1760int do_add_mount(struct vfsmount *newmnt, struct path *path, 1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1761 int mnt_flags, struct list_head *fslist)
1762{ 1939{
1763 int err; 1940 int err;
1764 1941
@@ -1766,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1766 1943
1767 down_write(&namespace_sem); 1944 down_write(&namespace_sem);
1768 /* Something was mounted here while we slept */ 1945 /* Something was mounted here while we slept */
1769 while (d_mountpoint(path->dentry) && 1946 err = follow_down(path, true);
1770 follow_down(path)) 1947 if (err < 0)
1771 ; 1948 goto unlock;
1949
1772 err = -EINVAL; 1950 err = -EINVAL;
1773 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1774 goto unlock; 1952 goto unlock;
@@ -1784,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1784 goto unlock; 1962 goto unlock;
1785 1963
1786 newmnt->mnt_flags = mnt_flags; 1964 newmnt->mnt_flags = mnt_flags;
1787 if ((err = graft_tree(newmnt, path))) 1965 err = graft_tree(newmnt, path);
1788 goto unlock;
1789
1790 if (fslist) /* add to the specified expiration list */
1791 list_add_tail(&newmnt->mnt_expire, fslist);
1792
1793 up_write(&namespace_sem);
1794 return 0;
1795 1966
1796unlock: 1967unlock:
1797 up_write(&namespace_sem); 1968 up_write(&namespace_sem);
1798 mntput(newmnt);
1799 return err; 1969 return err;
1800} 1970}
1801 1971
1802EXPORT_SYMBOL_GPL(do_add_mount); 1972/**
1973 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list.
1975 * @expiry_list: The list to add the mount to.
1976 */
1977void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1978{
1979 down_write(&namespace_sem);
1980 br_write_lock(vfsmount_lock);
1981
1982 list_add_tail(&mnt->mnt_expire, expiry_list);
1983
1984 br_write_unlock(vfsmount_lock);
1985 up_write(&namespace_sem);
1986}
1987EXPORT_SYMBOL(mnt_set_expiry);
1803 1988
1804/* 1989/*
1805 * process a list of expirable mountpoints with the intent of discarding any 1990 * process a list of expirable mountpoints with the intent of discarding any
@@ -2088,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2088 return new_ns; 2273 return new_ns;
2089} 2274}
2090 2275
2276void mnt_make_longterm(struct vfsmount *mnt)
2277{
2278 __mnt_make_longterm(mnt);
2279}
2280
2281void mnt_make_shortterm(struct vfsmount *mnt)
2282{
2283#ifdef CONFIG_SMP
2284 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2285 return;
2286 br_write_lock(vfsmount_lock);
2287 atomic_dec(&mnt->mnt_longterm);
2288 br_write_unlock(vfsmount_lock);
2289#endif
2290}
2291
2091/* 2292/*
2092 * Allocate a new namespace structure and populate it with contents 2293 * Allocate a new namespace structure and populate it with contents
2093 * copied from the namespace of the passed in task structure. 2294 * copied from the namespace of the passed in task structure.
@@ -2125,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2125 q = new_ns->root; 2326 q = new_ns->root;
2126 while (p) { 2327 while (p) {
2127 q->mnt_ns = new_ns; 2328 q->mnt_ns = new_ns;
2329 __mnt_make_longterm(q);
2128 if (fs) { 2330 if (fs) {
2129 if (p == fs->root.mnt) { 2331 if (p == fs->root.mnt) {
2130 rootmnt = p;
2131 fs->root.mnt = mntget(q); 2332 fs->root.mnt = mntget(q);
2333 __mnt_make_longterm(q);
2334 mnt_make_shortterm(p);
2335 rootmnt = p;
2132 } 2336 }
2133 if (p == fs->pwd.mnt) { 2337 if (p == fs->pwd.mnt) {
2134 pwdmnt = p;
2135 fs->pwd.mnt = mntget(q); 2338 fs->pwd.mnt = mntget(q);
2339 __mnt_make_longterm(q);
2340 mnt_make_shortterm(p);
2341 pwdmnt = p;
2136 } 2342 }
2137 } 2343 }
2138 p = next_mnt(p, mnt_ns->root); 2344 p = next_mnt(p, mnt_ns->root);
@@ -2176,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2176 new_ns = alloc_mnt_ns(); 2382 new_ns = alloc_mnt_ns();
2177 if (!IS_ERR(new_ns)) { 2383 if (!IS_ERR(new_ns)) {
2178 mnt->mnt_ns = new_ns; 2384 mnt->mnt_ns = new_ns;
2385 __mnt_make_longterm(mnt);
2179 new_ns->root = mnt; 2386 new_ns->root = mnt;
2180 list_add(&new_ns->list, &new_ns->root->mnt_list); 2387 list_add(&new_ns->list, &new_ns->root->mnt_list);
2181 } 2388 }
@@ -2330,6 +2537,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2330 touch_mnt_namespace(current->nsproxy->mnt_ns); 2537 touch_mnt_namespace(current->nsproxy->mnt_ns);
2331 br_write_unlock(vfsmount_lock); 2538 br_write_unlock(vfsmount_lock);
2332 chroot_fs_refs(&root, &new); 2539 chroot_fs_refs(&root, &new);
2540
2333 error = 0; 2541 error = 0;
2334 path_put(&root_parent); 2542 path_put(&root_parent);
2335 path_put(&parent_path); 2543 path_put(&parent_path);
@@ -2356,6 +2564,7 @@ static void __init init_mount_tree(void)
2356 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2564 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2357 if (IS_ERR(mnt)) 2565 if (IS_ERR(mnt))
2358 panic("Can't create rootfs"); 2566 panic("Can't create rootfs");
2567
2359 ns = create_mnt_ns(mnt); 2568 ns = create_mnt_ns(mnt);
2360 if (IS_ERR(ns)) 2569 if (IS_ERR(ns))
2361 panic("Can't allocate initial namespace"); 2570 panic("Can't allocate initial namespace");
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd58..f6946bb5cb55 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,13 +17,11 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/namei.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/byteorder.h> 22#include <asm/byteorder.h>
22#include <linux/smp_lock.h>
23 23
24#include <linux/ncp_fs.h> 24#include "ncp_fs.h"
25
26#include "ncplib_kernel.h"
27 25
28static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, void *, filldir_t,
29 struct ncp_cache_control *); 27 struct ncp_cache_control *);
@@ -75,11 +73,14 @@ const struct inode_operations ncp_dir_inode_operations =
75 * Dentry operations routines 73 * Dentry operations routines
76 */ 74 */
77static int ncp_lookup_validate(struct dentry *, struct nameidata *); 75static int ncp_lookup_validate(struct dentry *, struct nameidata *);
78static int ncp_hash_dentry(struct dentry *, struct qstr *); 76static int ncp_hash_dentry(const struct dentry *, const struct inode *,
79static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); 77 struct qstr *);
80static int ncp_delete_dentry(struct dentry *); 78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
81 79 const struct dentry *, const struct inode *,
82static const struct dentry_operations ncp_dentry_operations = 80 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *);
82
83const struct dentry_operations ncp_dentry_operations =
83{ 84{
84 .d_revalidate = ncp_lookup_validate, 85 .d_revalidate = ncp_lookup_validate,
85 .d_hash = ncp_hash_dentry, 86 .d_hash = ncp_hash_dentry,
@@ -87,28 +88,49 @@ static const struct dentry_operations ncp_dentry_operations =
87 .d_delete = ncp_delete_dentry, 88 .d_delete = ncp_delete_dentry,
88}; 89};
89 90
90const struct dentry_operations ncp_root_dentry_operations = 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
92
93static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
91{ 94{
92 .d_hash = ncp_hash_dentry, 95#ifdef CONFIG_NCPFS_SMALLDOS
93 .d_compare = ncp_compare_dentry, 96 int ns = ncp_namespace(i);
94 .d_delete = ncp_delete_dentry, 97
95}; 98 if ((ns == NW_NS_DOS)
99#ifdef CONFIG_NCPFS_OS2_NS
100 || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
101#endif /* CONFIG_NCPFS_OS2_NS */
102 )
103 return 0;
104#endif /* CONFIG_NCPFS_SMALLDOS */
105 return 1;
106}
107
108#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS)
96 109
110static inline int ncp_case_sensitive(const struct inode *i)
111{
112#ifdef CONFIG_NCPFS_NFS_NS
113 return ncp_namespace(i) == NW_NS_NFS;
114#else
115 return 0;
116#endif /* CONFIG_NCPFS_NFS_NS */
117}
97 118
98/* 119/*
99 * Note: leave the hash unchanged if the directory 120 * Note: leave the hash unchanged if the directory
100 * is case-sensitive. 121 * is case-sensitive.
101 */ 122 */
102static int 123static int
103ncp_hash_dentry(struct dentry *dentry, struct qstr *this) 124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
125 struct qstr *this)
104{ 126{
105 struct nls_table *t; 127 if (!ncp_case_sensitive(inode)) {
106 unsigned long hash; 128 struct super_block *sb = dentry->d_sb;
107 int i; 129 struct nls_table *t;
130 unsigned long hash;
131 int i;
108 132
109 t = NCP_IO_TABLE(dentry); 133 t = NCP_IO_TABLE(sb);
110
111 if (!ncp_case_sensitive(dentry->d_inode)) {
112 hash = init_name_hash(); 134 hash = init_name_hash();
113 for (i=0; i<this->len ; i++) 135 for (i=0; i<this->len ; i++)
114 hash = partial_name_hash(ncp_tolower(t, this->name[i]), 136 hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -119,15 +141,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
119} 141}
120 142
121static int 143static int
122ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name)
123{ 147{
124 if (a->len != b->len) 148 if (len != name->len)
125 return 1; 149 return 1;
126 150
127 if (ncp_case_sensitive(dentry->d_inode)) 151 if (ncp_case_sensitive(pinode))
128 return strncmp(a->name, b->name, a->len); 152 return strncmp(str, name->name, len);
129 153
130 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); 154 return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
131} 155}
132 156
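
The reworked d_hash/d_compare pair above has to stay mutually consistent on case-preserving but case-insensitive volumes: both must fold case the same way or a cached name would never be found again. A userspace sketch of matched fold-then-hash and fold-compare helpers; the mixing step is a toy, and ncpfs folds through its NLS table rather than tolower():

#include <ctype.h>
#include <stddef.h>

/* Hash the case-folded bytes so "Readme" and "README" collide. */
static unsigned long ci_hash(const unsigned char *name, size_t len)
{
	unsigned long hash = 0;
	size_t i;

	for (i = 0; i < len; i++)
		hash = hash * 31 + tolower(name[i]);	/* toy mix step */
	return hash;
}

/* Compare under the same folding; returns 0 when the names match,
 * following the d_compare convention above. */
static int ci_compare(const unsigned char *a, size_t alen,
		      const unsigned char *b, size_t blen)
{
	size_t i;

	if (alen != blen)
		return 1;
	for (i = 0; i < alen; i++)
		if (tolower(a[i]) != tolower(b[i]))
			return 1;
	return 0;
}
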
133/* 157/*
@@ -136,7 +160,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
136 * Closing files can be safely postponed until iput() - it's done there anyway. 160 * Closing files can be safely postponed until iput() - it's done there anyway.
137 */ 161 */
138static int 162static int
139ncp_delete_dentry(struct dentry * dentry) 163ncp_delete_dentry(const struct dentry * dentry)
140{ 164{
141 struct inode *inode = dentry->d_inode; 165 struct inode *inode = dentry->d_inode;
142 166
@@ -266,7 +290,7 @@ leave_me:;
266 290
267 291
268static int 292static int
269__ncp_lookup_validate(struct dentry *dentry) 293ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
270{ 294{
271 struct ncp_server *server; 295 struct ncp_server *server;
272 struct dentry *parent; 296 struct dentry *parent;
@@ -275,6 +299,12 @@ __ncp_lookup_validate(struct dentry *dentry)
275 int res, val = 0, len; 299 int res, val = 0, len;
276 __u8 __name[NCP_MAXPATHLEN + 1]; 300 __u8 __name[NCP_MAXPATHLEN + 1];
277 301
302 if (dentry == dentry->d_sb->s_root)
303 return 1;
304
305 if (nd->flags & LOOKUP_RCU)
306 return -ECHILD;
307
278 parent = dget_parent(dentry); 308 parent = dget_parent(dentry);
279 dir = parent->d_inode; 309 dir = parent->d_inode;
280 310
@@ -283,9 +313,6 @@ __ncp_lookup_validate(struct dentry *dentry)
283 313
284 server = NCP_SERVER(dir); 314 server = NCP_SERVER(dir);
285 315
286 if (!ncp_conn_valid(server))
287 goto finished;
288
289 /* 316 /*
290 * Inspired by smbfs: 317 * Inspired by smbfs:
291 * The default validation is based on dentry age: 318 * The default validation is based on dentry age:
@@ -304,8 +331,11 @@ __ncp_lookup_validate(struct dentry *dentry)
304 if (ncp_is_server_root(dir)) { 331 if (ncp_is_server_root(dir)) {
305 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 332 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
306 dentry->d_name.len, 1); 333 dentry->d_name.len, 1);
307 if (!res) 334 if (!res) {
308 res = ncp_lookup_volume(server, __name, &(finfo.i)); 335 res = ncp_lookup_volume(server, __name, &(finfo.i));
336 if (!res)
337 ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
338 }
309 } else { 339 } else {
310 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 340 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
311 dentry->d_name.len, !ncp_preserve_case(dir)); 341 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +350,17 @@ __ncp_lookup_validate(struct dentry *dentry)
320 * what we remember, it's not valid any more. 350 * what we remember, it's not valid any more.
321 */ 351 */
322 if (!res) { 352 if (!res) {
323 if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) { 353 struct inode *inode = dentry->d_inode;
354
355 mutex_lock(&inode->i_mutex);
356 if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
324 ncp_new_dentry(dentry); 357 ncp_new_dentry(dentry);
325 val=1; 358 val=1;
326 } else 359 } else
327 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); 360 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
328 361
329 ncp_update_inode2(dentry->d_inode, &finfo); 362 ncp_update_inode2(inode, &finfo);
363 mutex_unlock(&inode->i_mutex);
330 } 364 }
331 365
332finished: 366finished:
@@ -335,16 +369,6 @@ finished:
335 return val; 369 return val;
336} 370}
337 371
338static int
339ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
340{
341 int res;
342 lock_kernel();
343 res = __ncp_lookup_validate(dentry);
344 unlock_kernel();
345 return res;
346}
347
348static struct dentry * 372static struct dentry *
349ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) 373ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
350{ 374{
@@ -364,21 +388,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
364 } 388 }
365 389
366 /* If a pointer is invalid, we search the dentry. */ 390 /* If a pointer is invalid, we search the dentry. */
367 spin_lock(&dcache_lock); 391 spin_lock(&parent->d_lock);
368 next = parent->d_subdirs.next; 392 next = parent->d_subdirs.next;
369 while (next != &parent->d_subdirs) { 393 while (next != &parent->d_subdirs) {
370 dent = list_entry(next, struct dentry, d_u.d_child); 394 dent = list_entry(next, struct dentry, d_u.d_child);
371 if ((unsigned long)dent->d_fsdata == fpos) { 395 if ((unsigned long)dent->d_fsdata == fpos) {
372 if (dent->d_inode) 396 if (dent->d_inode)
373 dget_locked(dent); 397 dget(dent);
374 else 398 else
375 dent = NULL; 399 dent = NULL;
376 spin_unlock(&dcache_lock); 400 spin_unlock(&parent->d_lock);
377 goto out; 401 goto out;
378 } 402 }
379 next = next->next; 403 next = next->next;
380 } 404 }
381 spin_unlock(&dcache_lock); 405 spin_unlock(&parent->d_lock);
382 return NULL; 406 return NULL;
383 407
384out: 408out:
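
The hunk above trades the global dcache_lock for the parent's d_lock, so a scan of one directory's children serializes only against that directory. A pthread sketch of the narrowed critical section; the node type and cookie field are illustrative stand-ins for the dentry and its d_fsdata tag:

#include <pthread.h>
#include <stddef.h>

struct node {
	pthread_mutex_t lock;		/* per-parent, models d_lock */
	struct node *first_child;
	struct node *next_sibling;
	unsigned long cookie;		/* models the d_fsdata fpos tag */
	int live;			/* models "has an inode" */
};

/* Scan one directory's children; unrelated directories stay unlocked,
 * where the old global lock would have stalled every dcache user. */
static struct node *find_child(struct node *parent, unsigned long cookie)
{
	struct node *c, *found = NULL;

	pthread_mutex_lock(&parent->lock);
	for (c = parent->first_child; c; c = c->next_sibling) {
		if (c->cookie == cookie) {
			if (c->live)
				found = c;	/* real code takes a ref here */
			break;
		}
	}
	pthread_mutex_unlock(&parent->lock);
	return found;
}
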
@@ -411,8 +435,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
411 int result, mtime_valid = 0; 435 int result, mtime_valid = 0;
412 time_t mtime = 0; 436 time_t mtime = 0;
413 437
414 lock_kernel();
415
416 ctl.page = NULL; 438 ctl.page = NULL;
417 ctl.cache = NULL; 439 ctl.cache = NULL;
418 440
@@ -421,6 +443,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
421 (int) filp->f_pos); 443 (int) filp->f_pos);
422 444
423 result = -EIO; 445 result = -EIO;
446 /* Do not generate '.' and '..' when server is dead. */
424 if (!ncp_conn_valid(server)) 447 if (!ncp_conn_valid(server))
425 goto out; 448 goto out;
426 449
@@ -532,6 +555,12 @@ read_really:
532 ctl.head.end = ctl.fpos - 1; 555 ctl.head.end = ctl.fpos - 1;
533 ctl.head.eof = ctl.valid; 556 ctl.head.eof = ctl.valid;
534finished: 557finished:
558 if (ctl.page) {
559 kunmap(ctl.page);
560 SetPageUptodate(ctl.page);
561 unlock_page(ctl.page);
562 page_cache_release(ctl.page);
563 }
535 if (page) { 564 if (page) {
536 cache->head = ctl.head; 565 cache->head = ctl.head;
537 kunmap(page); 566 kunmap(page);
@@ -539,23 +568,17 @@ finished:
539 unlock_page(page); 568 unlock_page(page);
540 page_cache_release(page); 569 page_cache_release(page);
541 } 570 }
542 if (ctl.page) {
543 kunmap(ctl.page);
544 SetPageUptodate(ctl.page);
545 unlock_page(ctl.page);
546 page_cache_release(ctl.page);
547 }
548out: 571out:
549 unlock_kernel();
550 return result; 572 return result;
551} 573}
552 574
553static int 575static int
554ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 576ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
555 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry) 577 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
578 int inval_childs)
556{ 579{
557 struct dentry *newdent, *dentry = filp->f_path.dentry; 580 struct dentry *newdent, *dentry = filp->f_path.dentry;
558 struct inode *newino, *inode = dentry->d_inode; 581 struct inode *dir = dentry->d_inode;
559 struct ncp_cache_control ctl = *ctrl; 582 struct ncp_cache_control ctl = *ctrl;
560 struct qstr qname; 583 struct qstr qname;
561 int valid = 0; 584 int valid = 0;
@@ -564,16 +587,16 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
564 __u8 __name[NCP_MAXPATHLEN + 1]; 587 __u8 __name[NCP_MAXPATHLEN + 1];
565 588
566 qname.len = sizeof(__name); 589 qname.len = sizeof(__name);
567 if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len, 590 if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
568 entry->i.entryName, entry->i.nameLen, 591 entry->i.entryName, entry->i.nameLen,
569 !ncp_preserve_entry_case(inode, entry->i.NSCreator))) 592 !ncp_preserve_entry_case(dir, entry->i.NSCreator)))
570 return 1; /* I'm not sure */ 593 return 1; /* I'm not sure */
571 594
572 qname.name = __name; 595 qname.name = __name;
573 qname.hash = full_name_hash(qname.name, qname.len); 596 qname.hash = full_name_hash(qname.name, qname.len);
574 597
575 if (dentry->d_op && dentry->d_op->d_hash) 598 if (dentry->d_op && dentry->d_op->d_hash)
576 if (dentry->d_op->d_hash(dentry, &qname) != 0) 599 if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
577 goto end_advance; 600 goto end_advance;
578 601
579 newdent = d_lookup(dentry, &qname); 602 newdent = d_lookup(dentry, &qname);
@@ -584,22 +607,40 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
584 goto end_advance; 607 goto end_advance;
585 } else { 608 } else {
586 hashed = 1; 609 hashed = 1;
587 memcpy((char *) newdent->d_name.name, qname.name, 610
588 newdent->d_name.len); 611 /* If case sensitivity changed for this volume, all entries below this one
612 should be thrown away. This entry itself is not affected, as its case
613 sensitivity is controlled by its own parent. */
614 if (inval_childs)
615 shrink_dcache_parent(newdent);
616
617 /*
618 * NetWare's OS2 namespace is case preserving yet case
619 * insensitive. So we update dentry's name as received from
620 * server. Parent dir's i_mutex is locked because we're in
621 * readdir.
622 */
623 dentry_update_name_case(newdent, &qname);
589 } 624 }
590 625
591 if (!newdent->d_inode) { 626 if (!newdent->d_inode) {
627 struct inode *inode;
628
592 entry->opened = 0; 629 entry->opened = 0;
593 entry->ino = iunique(inode->i_sb, 2); 630 entry->ino = iunique(dir->i_sb, 2);
594 newino = ncp_iget(inode->i_sb, entry); 631 inode = ncp_iget(dir->i_sb, entry);
595 if (newino) { 632 if (inode) {
596 newdent->d_op = &ncp_dentry_operations; 633 d_instantiate(newdent, inode);
597 d_instantiate(newdent, newino);
598 if (!hashed) 634 if (!hashed)
599 d_rehash(newdent); 635 d_rehash(newdent);
600 } 636 }
601 } else 637 } else {
602 ncp_update_inode2(newdent->d_inode, entry); 638 struct inode *inode = newdent->d_inode;
639
640 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
641 ncp_update_inode2(inode, entry);
642 mutex_unlock(&inode->i_mutex);
643 }
603 644
604 if (newdent->d_inode) { 645 if (newdent->d_inode) {
605 ino = newdent->d_inode->i_ino; 646 ino = newdent->d_inode->i_ino;
@@ -617,7 +658,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
617 ctl.cache = NULL; 658 ctl.cache = NULL;
618 ctl.idx -= NCP_DIRCACHE_SIZE; 659 ctl.idx -= NCP_DIRCACHE_SIZE;
619 ctl.ofs += 1; 660 ctl.ofs += 1;
620 ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); 661 ctl.page = grab_cache_page(&dir->i_data, ctl.ofs);
621 if (ctl.page) 662 if (ctl.page)
 		ctl.cache = kmap(ctl.page);
 	}
@@ -633,7 +674,7 @@ end_advance:
 		if (!ino)
 			ino = find_inode_number(dentry, &qname);
 		if (!ino)
-			ino = iunique(inode->i_sb, 2);
+			ino = iunique(dir->i_sb, 2);
 		ctl.filled = filldir(dirent, qname.name, qname.len,
 				     filp->f_pos, ino, DT_UNKNOWN);
 		if (!ctl.filled)
@@ -660,6 +701,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 		(unsigned long) filp->f_pos);

 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+		int inval_dentry;

 		if (ncp_get_volume_info_with_number(server, i, &info) != 0)
 			return;
@@ -675,8 +717,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 				info.volume_name);
 			continue;
 		}
+		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
 			return;
 	}
 }
@@ -739,7 +782,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
 				break;
 		}
 	} while (more);
@@ -775,17 +818,19 @@ int ncp_conn_logged_in(struct super_block *sb)
 		if (dent) {
 			struct inode* ino = dent->d_inode;
 			if (ino) {
+				ncp_update_known_namespace(server, volNumber, NULL);
 				NCP_FINFO(ino)->volNumber = volNumber;
 				NCP_FINFO(ino)->dirEntNum = dirEntNum;
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
+				result = 0;
 			} else {
 				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
 			DPRINTK("ncpfs: sb->s_root == NULL!\n");
 		}
-	}
-	result = 0;
+	} else
+		result = 0;

 out:
 	return result;
@@ -799,7 +844,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	int error, res, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];

-	lock_kernel();
 	error = -EIO;
 	if (!ncp_conn_valid(server))
 		goto finished;
@@ -813,6 +857,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 				 dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+		if (!res)
+			ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -839,14 +885,12 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	if (inode) {
 		ncp_new_dentry(dentry);
 add_entry:
-		dentry->d_op = &ncp_dentry_operations;
 		d_add(dentry, inode);
 		error = 0;
 	}

 finished:
 	PPRINTK("ncp_lookup: result=%d\n", error);
-	unlock_kernel();
 	return ERR_PTR(error);
 }

@@ -887,11 +931,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name, mode);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +956,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	if (result) {
 		if (result == 0x87)
 			error = -ENAMETOOLONG;
+		else if (result < 0)
+			error = result;
 		DPRINTK("ncp_create: %s/%s failed\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name);
 		goto out;
@@ -935,7 +976,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,

 	error = ncp_instantiate(dir, dentry, &finfo);
 out:
-	unlock_kernel();
 	return error;
 }

@@ -955,11 +995,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("ncp_mkdir: making %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1002,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		goto out;

-	error = -EACCES;
-	if (ncp_open_create_file_or_subdir(server, dir, __name,
+	error = ncp_open_create_file_or_subdir(server, dir, __name,
 				   OC_MODE_CREATE, aDIR,
 				   cpu_to_le16(0xffff),
-				   &finfo) == 0)
-	{
+				   &finfo);
+	if (error == 0) {
 		if (ncp_is_nfs_extras(server, finfo.volume)) {
 			mode |= S_IFDIR;
 			finfo.i.nfs.mode = mode;
@@ -983,9 +1017,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 				goto out;
 		}
 		error = ncp_instantiate(dir, dentry, &finfo);
+	} else if (error > 0) {
+		error = -EACCES;
 	}
 out:
-	unlock_kernel();
 	return error;
 }

@@ -998,11 +1033,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
@@ -1036,11 +1066,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = result < 0 ? result : -EACCES;
 			break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }

@@ -1050,15 +1079,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	struct ncp_server *server;
 	int error;

-	lock_kernel();
 	server = NCP_SERVER(dir);
 	DPRINTK("ncp_unlink: unlinking %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	error = -EIO;
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	/*
 	 * Check whether to close the file ...
 	 */
@@ -1097,12 +1121,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = error < 0 ? error : -EACCES;
 		break;
 	}
-
-out:
-	unlock_kernel();
 	return error;
 }

@@ -1118,11 +1139,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);

@@ -1161,11 +1177,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = error < 0 ? error : -EACCES;
 		break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }

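
A note on the error-handling change that recurs through the dir.c hunks above: a blanket `error = -EACCES;` becomes `error = result < 0 ? result : -EACCES;`. The convention being preserved is that the NCP helpers return a negative errno on transport failure and a positive NetWare status byte when the server refuses the operation. A standalone sketch of that mapping in plain userspace C (only the 0x87 case comes from the patch itself; 0x9c is an arbitrary stand-in for any other status byte):

#include <errno.h>
#include <stdio.h>

/*
 * Illustrative mapping: negative values are already -errno and are
 * passed through; positive NetWare status bytes collapse to -EACCES,
 * except for statuses with a better translation.
 */
static int map_ncp_result(int result)
{
	if (result == 0)
		return 0;			/* success */
	if (result == 0x87)
		return -ENAMETOOLONG;		/* NetWare: name too long */
	return result < 0 ? result : -EACCES;
}

int main(void)
{
	printf("%d\n", map_ncp_result(0));	/* 0 */
	printf("%d\n", map_ncp_result(0x87));	/* -ENAMETOOLONG */
	printf("%d\n", map_ncp_result(-EIO));	/* -EIO preserved */
	printf("%d\n", map_ncp_result(0x9c));	/* generic -EACCES */
	return 0;
}
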
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbdae..0ed65e0c3dfe 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,10 +17,8 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>

-#include <linux/ncp_fs.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"

 static int ncp_fsync(struct file *file, int datasync)
 {
@@ -113,9 +111,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	DPRINTK("ncp_file_read: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
-
 	pos = *ppos;

 	if ((ssize_t) count < 0) {
@@ -192,13 +187,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *

 	DPRINTK("ncp_file_write: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
 	if (file->f_flags & O_APPEND) {
-		pos = inode->i_size;
+		pos = i_size_read(inode);
 	}

 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +257,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *

 	*ppos = pos;

-	if (pos > inode->i_size) {
-		inode->i_size = pos;
+	if (pos > i_size_read(inode)) {
+		mutex_lock(&inode->i_mutex);
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		mutex_unlock(&inode->i_mutex);
 	}
 	DPRINTK("ncp_file_write: exit %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -281,18 +277,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
 	return 0;
 }

-static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
 const struct file_operations ncp_file_operations =
 {
-	.llseek = ncp_remote_llseek,
+	.llseek = generic_file_llseek,
 	.read = ncp_file_read,
 	.write = ncp_file_write,
 	.unlocked_ioctl = ncp_ioctl,
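
The last ncp_file_write() hunk replaces the bare `inode->i_size = pos;` with a locked, double-checked update: re-test the condition after taking i_mutex, because another writer may have extended the file between the unlocked read and the lock acquisition, while keeping the common no-growth case lock-free. A minimal userspace sketch of the same shape, with a plain pthread mutex where the kernel uses i_mutex plus the seqcount-backed i_size_read()/i_size_write() helpers (the unlocked first read is a simplification here; the kernel helpers are what make it safe):

#include <pthread.h>
#include <stdio.h>

static long long file_size;	/* stand-in for inode->i_size */
static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;

static void update_size_after_write(long long pos)
{
	if (pos > file_size) {			/* cheap unlocked check */
		pthread_mutex_lock(&file_lock);
		if (pos > file_size)		/* re-check under the lock */
			file_size = pos;
		pthread_mutex_unlock(&file_lock);
	}
}

int main(void)
{
	update_size_after_write(4096);
	update_size_after_write(1024);		/* no-op: file already larger */
	printf("size=%lld\n", file_size);	/* 4096 */
	return 0;
}
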
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b4de38cf49f5..00a1d1c3d3a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,16 +26,14 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
-
-#include <linux/ncp_fs.h>
+#include <linux/namei.h>

 #include <net/sock.h>

-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
 #include "getopt.h"

 #define NCP_DEFAULT_FILE_MODE 0600
@@ -59,11 +57,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }

-static void ncp_destroy_inode(struct inode *inode)
+static void ncp_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }

+static void ncp_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ncp_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
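
ncp_i_callback() above shows the standard RCU-deferred-free shape: ncp_destroy_inode() queues the callback with call_rcu(), and after a grace period the callback receives a pointer to the embedded rcu_head and recovers the enclosing inode with container_of() before freeing it. A self-contained sketch of that recovery step (simplified types; the kernel's real container_of additionally type-checks the member):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head { void *next; };	/* placeholder for the real struct */

struct inode_like {
	int ino;
	struct rcu_head i_rcu;		/* embedded, as in struct inode */
};

static void i_callback(struct rcu_head *head)
{
	/* recover the enclosing object from the embedded member */
	struct inode_like *inode = container_of(head, struct inode_like, i_rcu);

	printf("freeing inode %d\n", inode->ino);
	/* kmem_cache_free(...) would go here in the kernel */
}

int main(void)
{
	struct inode_like in = { .ino = 42 };

	i_callback(&in.i_rcu);		/* as if invoked after a grace period */
	return 0;
}
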
@@ -139,7 +144,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 		inode->i_mode = nwi->nfs.mode;
 	}

-	inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+	inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;

 	inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
 	inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +163,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
 		inode->i_mode = server->m.dir_mode;
 		/* for directories dataStreamSize seems to be some
 		   Object ID ??? */
-		inode->i_size = NCP_BLOCK_SIZE;
+		i_size_write(inode, NCP_BLOCK_SIZE);
 	} else {
+		u32 size;
+
 		inode->i_mode = server->m.file_mode;
-		inode->i_size = le32_to_cpu(nwi->dataStreamSize);
+		size = le32_to_cpu(nwi->dataStreamSize);
+		i_size_write(inode, size);
 #ifdef CONFIG_NCPFS_EXTRAS
 		if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS))
 		 && (nwi->attributes & aSHARED)) {
 			switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
 			case aHIDDEN:
 				if (server->m.flags & NCP_MOUNT_SYMLINKS) {
-					if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE)
-					 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) {
+					if (/* (size >= NCP_MIN_SYMLINK_SIZE)
+					 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
 						inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
 						NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
 						break;
@@ -208,7 +216,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
 }

 /*
- * Fill in the inode based on the ncp_entry_info structure.
+ * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes.
  */
 static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 {
@@ -254,6 +262,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	if (inode) {
 		atomic_set(&NCP_FINFO(inode)->opened, info->opened);

+		inode->i_mapping->backing_dev_info = sb->s_bdi;
 		inode->i_ino = info->ino;
 		ncp_set_attr(inode, info);
 		if (S_ISREG(inode->i_mode)) {
@@ -299,12 +308,19 @@ ncp_evict_inode(struct inode *inode)

 static void ncp_stop_tasks(struct ncp_server *server) {
 	struct sock* sk = server->ncp_sock->sk;

+	lock_sock(sk);
 	sk->sk_error_report = server->error_report;
 	sk->sk_data_ready = server->data_ready;
 	sk->sk_write_space = server->write_space;
+	release_sock(sk);
 	del_timer_sync(&server->timeout_tm);
-	flush_scheduled_work();
+
+	flush_work_sync(&server->rcv.tq);
+	if (sk->sk_socket->type == SOCK_STREAM)
+		flush_work_sync(&server->tx.tq);
+	else
+		flush_work_sync(&server->timeout_tq);
 }

 static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
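
The rewritten ncp_stop_tasks() encodes a strict teardown order: first restore the socket callbacks under the socket lock so no new work can be queued, then flush each work item that may still be in flight (flush_work_sync() replacing the coarser flush_scheduled_work()). A userspace sketch of the same two-step shutdown, with a pthread mutex standing in for lock_sock() and a joined thread standing in for the flush (illustrative names only, none of this is ncpfs API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;
static void (*data_ready)(void);	/* the "callback" being detached */

static void drain(void) { puts("draining queued work"); }

static void *worker(void *arg)
{
	(void)arg;
	drain();			/* pretend this was queued earlier */
	return NULL;
}

int main(void)
{
	pthread_t w;

	pthread_create(&w, NULL, worker, NULL);

	/* step 1: detach callbacks under the lock, so no new work arrives */
	pthread_mutex_lock(&sock_lock);
	data_ready = NULL;
	pthread_mutex_unlock(&sock_lock);

	/* step 2: wait for work already in flight, as flush_work_sync() does */
	pthread_join(w, NULL);

	if (!data_ready)
		puts("teardown complete");
	return 0;
}
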
@@ -526,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = NCP_SUPER_MAGIC;
 	sb->s_op = &ncp_sops;
+	sb->s_d_op = &ncp_dentry_operations;
 	sb->s_bdi = &server->bdi;

 	server = NCP_SBP(sb);
@@ -565,10 +582,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 /*	server->conn_status = 0;	*/
 /*	server->root_dentry = NULL;	*/
 /*	server->root_setuped = 0;	*/
+	mutex_init(&server->root_setup_lock);
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 /*	server->sign_wanted = 0;	*/
 /*	server->sign_active = 0;	*/
 #endif
+	init_rwsem(&server->auth_rwsem);
 	server->auth.auth_type = NCP_AUTH_NONE;
 /*	server->auth.object_name_len = 0;	*/
 /*	server->auth.object_name = NULL;	*/
@@ -593,16 +612,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	server->nls_io = load_nls_default();
 #endif /* CONFIG_NCPFS_NLS */

-	server->dentry_ttl = 0;	/* no caching */
+	atomic_set(&server->dentry_ttl, 0);	/* no caching */

 	INIT_LIST_HEAD(&server->tx.requests);
 	mutex_init(&server->rcv.creq_mutex);
 	server->tx.creq = NULL;
 	server->rcv.creq = NULL;
-	server->data_ready = sock->sk->sk_data_ready;
-	server->write_space = sock->sk->sk_write_space;
-	server->error_report = sock->sk->sk_error_report;
-	sock->sk->sk_user_data = server;

 	init_timer(&server->timeout_tm);
 #undef NCP_PACKET_SIZE
@@ -619,6 +634,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (server->rxbuf == NULL)
 		goto out_txbuf;

+	lock_sock(sock->sk);
+	server->data_ready = sock->sk->sk_data_ready;
+	server->write_space = sock->sk->sk_write_space;
+	server->error_report = sock->sk->sk_error_report;
+	sock->sk->sk_user_data = server;
 	sock->sk->sk_data_ready = ncp_tcp_data_ready;
 	sock->sk->sk_error_report = ncp_tcp_error_report;
 	if (sock->type == SOCK_STREAM) {
@@ -634,6 +654,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 		server->timeout_tm.data = (unsigned long)server;
 		server->timeout_tm.function = ncpdgram_timeout_call;
 	}
+	release_sock(sock->sk);

 	ncp_lock_server(server);
 	error = ncp_connect(server);
@@ -658,8 +679,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 			goto out_disconnect;
 		}
 	}
+		ncp_lock_server(server);
 		if (options & 2)
 			server->sign_wanted = 1;
+		ncp_unlock_server(server);
 	}
 	else
 #endif /* CONFIG_NCPFS_PACKET_SIGNING */
@@ -699,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_root = d_alloc_root(root_inode);
 	if (!sb->s_root)
 		goto out_no_root;
-	sb->s_root->d_op = &ncp_root_dentry_operations;
 	return 0;

 out_no_root:
@@ -720,6 +742,9 @@ out_nls:
 	unload_nls(server->nls_io);
 	unload_nls(server->nls_vol);
 #endif
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
 out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
@@ -743,8 +768,6 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);

-	lock_kernel();
-
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -756,6 +779,9 @@ static void ncp_put_super(struct super_block *sb)
 	unload_nls(server->nls_vol);
 	unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);

 	if (server->info_filp)
 		fput(server->info_filp);
@@ -771,8 +797,6 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
-
-	unlock_kernel();
 }

 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +875,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)

 	result = -EIO;

-	lock_kernel();
-
 	server = NCP_SERVER(inode);
-	if ((!server) || !ncp_conn_valid(server))
+	if (!server)	/* How this could happen? */
 		goto out;

 	/* ageing the dentry to force validation */
@@ -981,8 +1003,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
 				      inode, info_mask, &info);
 		if (result != 0) {
-			result = -EACCES;
-
 			if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
 				/* NetWare seems not to allow this. I
 				   do not know why. So, just tell the
@@ -1005,20 +1025,21 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		mark_inode_dirty(inode);

 out:
-	unlock_kernel();
+	if (result > 0)
+		result = -EACCES;
 	return result;
 }

-static int ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ncp_fill_super);
 }

 static struct file_system_type ncp_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ncpfs",
-	.get_sb		= ncp_get_sb,
+	.mount		= ncp_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 84a8cfc4e38e..790e92a9ec63 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,15 +17,12 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>

-#include <linux/ncp_fs.h>
-
 #include <asm/uaccess.h>

-#include "ncplib_kernel.h"
+#include "ncp_fs.h"

 /* maximum limit for ncp_objectname_ioctl */
 #define NCP_OBJECT_NAME_MAX_LEN 4096
@@ -35,16 +32,11 @@
 #define NCP_PACKET_SIZE_INTERNAL 65536

 static int
-ncp_get_fs_info(struct ncp_server * server, struct file *file,
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		struct ncp_fs_info __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info info;

-	if (file_permission(file, MAY_WRITE) != 0
-	 && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;

@@ -65,16 +57,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
 }

 static int
-ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info_v2 info2;

-	if (file_permission(file, MAY_WRITE) != 0
-	 && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;

@@ -136,16 +123,11 @@ struct compat_ncp_privatedata_ioctl
 #define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl)

 static int
-ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct compat_ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct compat_ncp_fs_info_v2 info2;

-	if (file_permission(file, MAY_WRITE) != 0
-	 && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;

@@ -182,11 +164,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	struct nls_table *iocharset;
 	struct nls_table *oldset_io;
 	struct nls_table *oldset_cp;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (server->root_setuped)
-		return -EBUSY;
+	int utf8;
+	int err;

 	if (copy_from_user(&user, arg, sizeof(user)))
 		return -EFAULT;
@@ -206,28 +185,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	user.iocharset[NCP_IOCSNAME_LEN] = 0;
 	if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
 		iocharset = load_nls_default();
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	} else if (!strcmp(user.iocharset, "utf8")) {
 		iocharset = load_nls_default();
-		NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 1;
 	} else {
 		iocharset = load_nls(user.iocharset);
 		if (!iocharset) {
 			unload_nls(codepage);
 			return -EBADRQC;
 		}
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	}

-	oldset_cp = server->nls_vol;
-	server->nls_vol = codepage;
-	oldset_io = server->nls_io;
-	server->nls_io = iocharset;
-
+	mutex_lock(&server->root_setup_lock);
+	if (server->root_setuped) {
+		oldset_cp = codepage;
+		oldset_io = iocharset;
+		err = -EBUSY;
+	} else {
+		if (utf8)
+			NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		else
+			NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		oldset_cp = server->nls_vol;
+		server->nls_vol = codepage;
+		oldset_io = server->nls_io;
+		server->nls_io = iocharset;
+		err = 0;
+	}
+	mutex_unlock(&server->root_setup_lock);
 	unload_nls(oldset_cp);
 	unload_nls(oldset_io);

-	return 0;
+	return err;
 }

 static int
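
The reworked ncp_set_charsets() follows a strict resource-swap discipline: load the replacement NLS tables before taking root_setup_lock, swap (or refuse) while holding it, and unload the displaced tables only after dropping it, so nothing that can sleep or fail happens inside the critical section. A minimal userspace sketch of the pattern, with strdup()/free() standing in for load_nls()/unload_nls() (illustrative names, not ncpfs API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t setup_lock = PTHREAD_MUTEX_INITIALIZER;
static char *current_charset;

static int set_charset(const char *name)
{
	char *newcs = strdup(name);	/* prepare outside the lock */
	char *oldcs;

	if (!newcs)
		return -1;

	pthread_mutex_lock(&setup_lock);
	oldcs = current_charset;	/* swap under the lock */
	current_charset = newcs;
	pthread_mutex_unlock(&setup_lock);

	free(oldcs);			/* free outside the lock */
	return 0;
}

int main(void)
{
	set_charset("cp437");
	set_charset("utf8");
	printf("charset=%s\n", current_charset);
	free(current_charset);
	return 0;
}
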
@@ -237,6 +228,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	int len;

 	memset(&user, 0, sizeof(user));
+	mutex_lock(&server->root_setup_lock);
 	if (server->nls_vol && server->nls_vol->charset) {
 		len = strlen(server->nls_vol->charset);
 		if (len > NCP_IOCSNAME_LEN)
@@ -254,6 +246,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 		strncpy(user.iocharset, server->nls_io->charset, len);
 		user.iocharset[len] = 0;
 	}
+	mutex_unlock(&server->root_setup_lock);

 	if (copy_to_user(arg, &user, sizeof(user)))
 		return -EFAULT;
@@ -261,25 +254,19 @@
 }
 #endif /* CONFIG_NCPFS_NLS */

-static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
 	char* bouncebuffer;
 	void __user *argp = (void __user *)arg;
-	uid_t uid = current_uid();

 	switch (cmd) {
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_NCPREQUEST_32:
 #endif
 	case NCP_IOC_NCPREQUEST:
-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
 #ifdef CONFIG_COMPAT
 		if (cmd == NCP_IOC_NCPREQUEST_32) {
 			struct compat_ncp_ioctl_request request32;
@@ -314,7 +301,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		server->current_size = request.size;
 		memcpy(server->packet, bouncebuffer, request.size);

-		result = ncp_request2(server, request.function, 
+		result = ncp_request2(server, request.function,
 				bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
 		if (result < 0)
 			result = -EIO;
@@ -331,69 +318,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)

 	case NCP_IOC_CONN_LOGGED_IN:

-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
 		if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
 			return -EINVAL;
+		mutex_lock(&server->root_setup_lock);
 		if (server->root_setuped)
-			return -EBUSY;
-		server->root_setuped = 1;
-		return ncp_conn_logged_in(inode->i_sb);
+			result = -EBUSY;
+		else {
+			result = ncp_conn_logged_in(inode->i_sb);
+			if (result == 0)
+				server->root_setuped = 1;
+		}
+		mutex_unlock(&server->root_setup_lock);
+		return result;

 	case NCP_IOC_GET_FS_INFO:
-		return ncp_get_fs_info(server, filp, argp);
+		return ncp_get_fs_info(server, inode, argp);

 	case NCP_IOC_GET_FS_INFO_V2:
-		return ncp_get_fs_info_v2(server, filp, argp);
+		return ncp_get_fs_info_v2(server, inode, argp);

 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GET_FS_INFO_V2_32:
-		return ncp_get_compat_fs_info_v2(server, filp, argp);
+		return ncp_get_compat_fs_info_v2(server, inode, argp);
 #endif
 	/* we have too many combinations of CONFIG_COMPAT,
 	 * CONFIG_64BIT and CONFIG_UID16, so just handle
 	 * any of the possible ioctls */
 	case NCP_IOC_GETMOUNTUID16:
-	case NCP_IOC_GETMOUNTUID32:
-	case NCP_IOC_GETMOUNTUID64:
-		if (file_permission(filp, MAY_READ) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (cmd == NCP_IOC_GETMOUNTUID16) {
-			u16 uid;
+	{
+		u16 uid;
+
 		SET_UID(uid, server->m.mounted_uid);
 		if (put_user(uid, (u16 __user *)argp))
 			return -EFAULT;
-		} else if (cmd == NCP_IOC_GETMOUNTUID32) {
-			if (put_user(server->m.mounted_uid,
-				(u32 __user *)argp))
-				return -EFAULT;
-		} else {
-			if (put_user(server->m.mounted_uid,
-				(u64 __user *)argp))
-				return -EFAULT;
-		}
+		return 0;
 	}
+	case NCP_IOC_GETMOUNTUID32:
+		if (put_user(server->m.mounted_uid,
+			     (u32 __user *)argp))
+			return -EFAULT;
+		return 0;
+	case NCP_IOC_GETMOUNTUID64:
+		if (put_user(server->m.mounted_uid,
+			     (u64 __user *)argp))
+			return -EFAULT;
 		return 0;

 	case NCP_IOC_GETROOT:
 	{
 		struct ncp_setroot_ioctl sr;

-		if (file_permission(filp, MAY_READ) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
+		result = -EACCES;
+		mutex_lock(&server->root_setup_lock);
 		if (server->m.mounted_vol[0]) {
 			struct dentry* dentry = inode->i_sb->s_root;

 			if (dentry) {
 				struct inode* s_inode = dentry->d_inode;

 				if (s_inode) {
 					sr.volNumber = NCP_FINFO(s_inode)->volNumber;
 					sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
 					sr.namespace = server->name_space[sr.volNumber];
+					result = 0;
 				} else
 					DPRINTK("ncpfs: s_root->d_inode==NULL\n");
 			} else
@@ -402,10 +389,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			sr.volNumber = -1;
 			sr.namespace = 0;
 			sr.dirEntNum = 0;
+			result = 0;
 		}
-		if (copy_to_user(argp, &sr, sizeof(sr)))
-			return -EFAULT;
-		return 0;
+		mutex_unlock(&server->root_setup_lock);
+		if (!result && copy_to_user(argp, &sr, sizeof(sr)))
+			result = -EFAULT;
+		return result;
 	}

 	case NCP_IOC_SETROOT:
@@ -416,103 +405,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		__le32 dosde;
 		struct dentry* dentry;

-		if (!capable(CAP_SYS_ADMIN))
-		{
-			return -EACCES;
-		}
-		if (server->root_setuped) return -EBUSY;
 		if (copy_from_user(&sr, argp, sizeof(sr)))
 			return -EFAULT;
-		if (sr.volNumber < 0) {
-			server->m.mounted_vol[0] = 0;
-			vnum = NCP_NUMBER_OF_VOLUMES;
-			de = 0;
-			dosde = 0;
-		} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
-			return -EINVAL;
-		} else if (ncp_mount_subdir(server, sr.volNumber,
-					sr.namespace, sr.dirEntNum,
-					&vnum, &de, &dosde)) {
-			return -ENOENT;
-		}
-
-		dentry = inode->i_sb->s_root;
-		server->root_setuped = 1;
-		if (dentry) {
-			struct inode* s_inode = dentry->d_inode;
-
-			if (s_inode) {
-				NCP_FINFO(s_inode)->volNumber = vnum;
-				NCP_FINFO(s_inode)->dirEntNum = de;
-				NCP_FINFO(s_inode)->DosDirNum = dosde;
+		mutex_lock(&server->root_setup_lock);
+		if (server->root_setuped)
+			result = -EBUSY;
+		else {
+			if (sr.volNumber < 0) {
+				server->m.mounted_vol[0] = 0;
+				vnum = NCP_NUMBER_OF_VOLUMES;
+				de = 0;
+				dosde = 0;
+				result = 0;
+			} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
+				result = -EINVAL;
+			} else if (ncp_mount_subdir(server, sr.volNumber,
+						    sr.namespace, sr.dirEntNum,
+						    &vnum, &de, &dosde)) {
+				result = -ENOENT;
 			} else
-				DPRINTK("ncpfs: s_root->d_inode==NULL\n");
-		} else
-			DPRINTK("ncpfs: s_root==NULL\n");
+				result = 0;
+
+			if (result == 0) {
+				dentry = inode->i_sb->s_root;
+				if (dentry) {
+					struct inode* s_inode = dentry->d_inode;
+
+					if (s_inode) {
+						NCP_FINFO(s_inode)->volNumber = vnum;
+						NCP_FINFO(s_inode)->dirEntNum = de;
+						NCP_FINFO(s_inode)->DosDirNum = dosde;
+						server->root_setuped = 1;
+					} else {
+						DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+						result = -EIO;
+					}
+				} else {
+					DPRINTK("ncpfs: s_root==NULL\n");
+					result = -EIO;
+				}
+			}
+			result = 0;
+		}
+		mutex_unlock(&server->root_setup_lock);

-		return 0;
+		return result;
 	}

 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 	case NCP_IOC_SIGN_INIT:
-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (argp) {
-			if (server->sign_wanted)
-			{
-				struct ncp_sign_init sign;
+	{
+		struct ncp_sign_init sign;

+		if (argp)
 			if (copy_from_user(&sign, argp, sizeof(sign)))
 				return -EFAULT;
-			memcpy(server->sign_root,sign.sign_root,8);
-			memcpy(server->sign_last,sign.sign_last,16);
-			server->sign_active = 1;
+		ncp_lock_server(server);
+		mutex_lock(&server->rcv.creq_mutex);
+		if (argp) {
+			if (server->sign_wanted) {
+				memcpy(server->sign_root,sign.sign_root,8);
+				memcpy(server->sign_last,sign.sign_last,16);
+				server->sign_active = 1;
 			}
 			/* ignore when signatures not wanted */
 		} else {
 			server->sign_active = 0;
 		}
-		return 0;
-
+		mutex_unlock(&server->rcv.creq_mutex);
+		ncp_unlock_server(server);
+		return 0;
+	}
+
 	case NCP_IOC_SIGN_WANTED:
-		if (file_permission(filp, MAY_READ) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (put_user(server->sign_wanted, (int __user *)argp))
-			return -EFAULT;
-		return 0;
+	{
+		int state;
+
+		ncp_lock_server(server);
+		state = server->sign_wanted;
+		ncp_unlock_server(server);
+		if (put_user(state, (int __user *)argp))
+			return -EFAULT;
+		return 0;
+	}

 	case NCP_IOC_SET_SIGN_WANTED:
 	{
 		int newstate;

-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
 		/* get only low 8 bits... */
 		if (get_user(newstate, (unsigned char __user *)argp))
 			return -EFAULT;
+		result = 0;
+		ncp_lock_server(server);
 		if (server->sign_active) {
 			/* cannot turn signatures OFF when active */
-			if (!newstate) return -EINVAL;
+			if (!newstate)
+				result = -EINVAL;
 		} else {
 			server->sign_wanted = newstate != 0;
 		}
-		return 0;
+		ncp_unlock_server(server);
+		return result;
 	}

 #endif /* CONFIG_NCPFS_PACKET_SIGNING */

 #ifdef CONFIG_NCPFS_IOCTL_LOCKING
 	case NCP_IOC_LOCKUNLOCK:
-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
 	{
 		struct ncp_lock_ioctl rqdata;

@@ -541,16 +541,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		{
 			return result;
 		}
-		result = -EIO;
-		if (!ncp_conn_valid(server))
-			goto outrel;
 		result = -EISDIR;
 		if (!S_ISREG(inode->i_mode))
 			goto outrel;
 		if (rqdata.cmd == NCP_LOCK_CLEAR)
 		{
 			result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
 					NCP_FINFO(inode)->file_handle,
 					rqdata.offset,
 					rqdata.length);
 			if (result > 0) result = 0;	/* no such lock */
@@ -573,7 +570,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 					rqdata.timeout);
 			if (result > 0) result = -EAGAIN;
 		}
 outrel:
 		ncp_inode_close(inode);
 		return result;
 	}
@@ -581,60 +578,62 @@ outrel:

 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GETOBJECTNAME_32:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct compat_ncp_objectname_ioctl user;
 		size_t outl;

 		if (copy_from_user(&user, argp, sizeof(user)))
 			return -EFAULT;
+		down_read(&server->auth_rwsem);
 		user.auth_type = server->auth.auth_type;
 		outl = user.object_name_len;
 		user.object_name_len = server->auth.object_name_len;
 		if (outl > user.object_name_len)
 			outl = user.object_name_len;
+		result = 0;
 		if (outl) {
 			if (copy_to_user(compat_ptr(user.object_name),
 					 server->auth.object_name,
-					 outl)) return -EFAULT;
+					 outl))
+				result = -EFAULT;
 		}
-		if (copy_to_user(argp, &user, sizeof(user)))
-			return -EFAULT;
-		return 0;
+		up_read(&server->auth_rwsem);
+		if (!result && copy_to_user(argp, &user, sizeof(user)))
+			result = -EFAULT;
+		return result;
 	}
 #endif

 	case NCP_IOC_GETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_objectname_ioctl user;
 		size_t outl;

 		if (copy_from_user(&user, argp, sizeof(user)))
 			return -EFAULT;
+		down_read(&server->auth_rwsem);
 		user.auth_type = server->auth.auth_type;
 		outl = user.object_name_len;
 		user.object_name_len = server->auth.object_name_len;
 		if (outl > user.object_name_len)
 			outl = user.object_name_len;
+		result = 0;
 		if (outl) {
 			if (copy_to_user(user.object_name,
 					 server->auth.object_name,
-					 outl)) return -EFAULT;
+					 outl))
+				result = -EFAULT;
 		}
-		if (copy_to_user(argp, &user, sizeof(user)))
-			return -EFAULT;
-		return 0;
+		up_read(&server->auth_rwsem);
+		if (!result && copy_to_user(argp, &user, sizeof(user)))
+			result = -EFAULT;
+		return result;
 	}

 #ifdef CONFIG_COMPAT
 	case NCP_IOC_SETOBJECTNAME_32:
 #endif
 	case NCP_IOC_SETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_objectname_ioctl user;
 		void* newname;
@@ -666,9 +665,7 @@ outrel:
 		} else {
 			newname = NULL;
 		}
-		/* enter critical section */
-		/* maybe that kfree can sleep so do that this way */
-		/* it is at least more SMP friendly (in future...) */
+		down_write(&server->auth_rwsem);
 		oldname = server->auth.object_name;
 		oldnamelen = server->auth.object_name_len;
 		oldprivate = server->priv.data;
@@ -678,7 +675,7 @@ outrel:
 		server->auth.object_name = newname;
 		server->priv.len = 0;
 		server->priv.data = NULL;
-		/* leave critical section */
+		up_write(&server->auth_rwsem);
 		kfree(oldprivate);
 		kfree(oldname);
 		return 0;
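
The GETOBJECTNAME/SETOBJECTNAME hunks above replace the old "enter/leave critical section" comments with a real reader-writer semaphore: readers copy the stored name under down_read(), the writer swaps the pointers under down_write(), and the displaced buffers are freed only after the lock is released. A compact userspace rendering of the same protocol, with pthread_rwlock_t in place of the kernel rw_semaphore (illustrative names only):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_rwlock_t auth_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static char *object_name;

static void get_object_name(char *out, size_t outlen)
{
	pthread_rwlock_rdlock(&auth_rwsem);	/* many readers allowed */
	snprintf(out, outlen, "%s", object_name ? object_name : "");
	pthread_rwlock_unlock(&auth_rwsem);
}

static void set_object_name(const char *name)
{
	char *newname = strdup(name);	/* may sleep/fail: do it unlocked */
	char *oldname;

	pthread_rwlock_wrlock(&auth_rwsem);
	oldname = object_name;		/* swap under the write lock */
	object_name = newname;
	pthread_rwlock_unlock(&auth_rwsem);

	free(oldname);			/* never free under the lock */
}

int main(void)
{
	char buf[64];

	set_object_name("ADMIN.ACME");
	get_object_name(buf, sizeof(buf));
	printf("object name: %s\n", buf);
	free(object_name);
	return 0;
}
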
@@ -688,8 +685,6 @@ outrel:
 	case NCP_IOC_GETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_GETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_privatedata_ioctl user;
 		size_t outl;
@@ -706,14 +701,20 @@ outrel:
 		if (copy_from_user(&user, argp, sizeof(user)))
 			return -EFAULT;

+		down_read(&server->auth_rwsem);
 		outl = user.len;
 		user.len = server->priv.len;
 		if (outl > user.len) outl = user.len;
+		result = 0;
 		if (outl) {
 			if (copy_to_user(user.data,
 					 server->priv.data,
-					 outl)) return -EFAULT;
+					 outl))
+				result = -EFAULT;
 		}
+		up_read(&server->auth_rwsem);
+		if (result)
+			return result;
 #ifdef CONFIG_COMPAT
 		if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
 			struct compat_ncp_privatedata_ioctl user32;
@@ -733,8 +734,6 @@ outrel:
 	case NCP_IOC_SETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_SETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_privatedata_ioctl user;
 		void* new;
@@ -762,12 +761,12 @@ outrel:
 		} else {
 			new = NULL;
 		}
-		/* enter critical section */
+		down_write(&server->auth_rwsem);
 		old = server->priv.data;
 		oldlen = server->priv.len;
 		server->priv.len = user.len;
 		server->priv.data = new;
-		/* leave critical section */
+		up_write(&server->auth_rwsem);
 		kfree(old);
 		return 0;
 	}
@@ -775,17 +774,13 @@ outrel:
 #ifdef CONFIG_NCPFS_NLS
 	case NCP_IOC_SETCHARSETS:
 		return ncp_set_charsets(server, argp);

 	case NCP_IOC_GETCHARSETS:
 		return ncp_get_charsets(server, argp);

 #endif /* CONFIG_NCPFS_NLS */

 	case NCP_IOC_SETDENTRYTTL:
-		if (file_permission(filp, MAY_WRITE) != 0 &&
-		    uid != server->m.mounted_uid)
-			return -EACCES;
-
 	{
 		u_int32_t user;

@@ -795,13 +790,13 @@ outrel:
 		if (user > 20000)
 			return -EINVAL;
 		user = (user * HZ) / 1000;
-		server->dentry_ttl = user;
+		atomic_set(&server->dentry_ttl, user);
 		return 0;
 	}

 	case NCP_IOC_GETDENTRYTTL:
 	{
-		u_int32_t user = (server->dentry_ttl * 1000) / HZ;
+		u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
 		if (copy_to_user(argp, &user, sizeof(user)))
 			return -EFAULT;
 		return 0;
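
dentry_ttl becomes an atomic_t here because one ioctl writes it while every dentry revalidation reads it; a single atomic load/store removes any need for a lock. The same idea expressed with C11 atomics (HZ is given a placeholder value; the kernel constant is configuration-dependent):

#include <stdatomic.h>
#include <stdio.h>

#define HZ 100				/* placeholder for the kernel constant */

static _Atomic unsigned int dentry_ttl;	/* in "jiffies" */

static void set_ttl_ms(unsigned int ms)
{
	atomic_store(&dentry_ttl, ms * HZ / 1000);
}

static unsigned int get_ttl_ms(void)
{
	return atomic_load(&dentry_ttl) * 1000 / HZ;
}

int main(void)
{
	set_ttl_ms(250);
	printf("ttl=%ums\n", get_ttl_ms());	/* 250 */
	return 0;
}
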
@@ -811,59 +806,103 @@ outrel:
 	return -EINVAL;
 }

-static int ncp_ioctl_need_write(unsigned int cmd)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ncp_server *server = NCP_SERVER(inode);
+	uid_t uid = current_uid();
+	int need_drop_write = 0;
+	long ret;
+
 	switch (cmd) {
-	case NCP_IOC_GET_FS_INFO:
-	case NCP_IOC_GET_FS_INFO_V2:
-	case NCP_IOC_NCPREQUEST:
-	case NCP_IOC_SETDENTRYTTL:
-	case NCP_IOC_SIGN_INIT:
-	case NCP_IOC_LOCKUNLOCK:
-	case NCP_IOC_SET_SIGN_WANTED:
-		return 1;
-	case NCP_IOC_GETOBJECTNAME:
-	case NCP_IOC_SETOBJECTNAME:
-	case NCP_IOC_GETPRIVATEDATA:
-	case NCP_IOC_SETPRIVATEDATA:
 	case NCP_IOC_SETCHARSETS:
-	case NCP_IOC_GETCHARSETS:
 	case NCP_IOC_CONN_LOGGED_IN:
-	case NCP_IOC_GETDENTRYTTL:
-	case NCP_IOC_GETMOUNTUID2:
-	case NCP_IOC_SIGN_WANTED:
-	case NCP_IOC_GETROOT:
 	case NCP_IOC_SETROOT:
-		return 0;
-	default:
-		/* unknown IOCTL command, assume write */
-		return 1;
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EACCES;
+			goto out;
+		}
+		break;
 	}
-}
-
-long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	long ret;
-
-	lock_kernel();
-	if (ncp_ioctl_need_write(cmd)) {
+	if (server->m.mounted_uid != uid) {
+		switch (cmd) {
 		/*
-		 * inside the ioctl(), any failures which
-		 * are because of file_permission() are
-		 * -EACCESS, so it seems consistent to keep
-		 * that here.
+		 * Only mount owner can issue these ioctls. Information
+		 * necessary to authenticate to other NDS servers are
+		 * stored here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt)) {
+		case NCP_IOC_GETOBJECTNAME:
+		case NCP_IOC_SETOBJECTNAME:
+		case NCP_IOC_GETPRIVATEDATA:
+		case NCP_IOC_SETPRIVATEDATA:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GETOBJECTNAME_32:
+		case NCP_IOC_SETOBJECTNAME_32:
+		case NCP_IOC_GETPRIVATEDATA_32:
+		case NCP_IOC_SETPRIVATEDATA_32:
+#endif
 			ret = -EACCES;
 			goto out;
+		/*
+		 * These require write access on the inode if user id
+		 * does not match. Note that they do not write to the
+		 * file... But old code did mnt_want_write, so I keep
+		 * it as is. Of course not for mountpoint owner, as
+		 * that breaks read-only mounts altogether as ncpmount
+		 * needs working NCP_IOC_NCPREQUEST and
+		 * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl,
+		 * signinit, setsignwanted) should be probably restricted
+		 * to owner only, or even more to CAP_SYS_ADMIN).
+		 */
+		case NCP_IOC_GET_FS_INFO:
+		case NCP_IOC_GET_FS_INFO_V2:
+		case NCP_IOC_NCPREQUEST:
+		case NCP_IOC_SETDENTRYTTL:
+		case NCP_IOC_SIGN_INIT:
+		case NCP_IOC_LOCKUNLOCK:
+		case NCP_IOC_SET_SIGN_WANTED:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GET_FS_INFO_V2_32:
+		case NCP_IOC_NCPREQUEST_32:
+#endif
+			ret = mnt_want_write_file(filp);
+			if (ret)
+				goto out;
+			need_drop_write = 1;
+			ret = inode_permission(inode, MAY_WRITE);
+			if (ret)
+				goto outDropWrite;
+			break;
+		/*
+		 * Read access required.
+		 */
+		case NCP_IOC_GETMOUNTUID16:
+		case NCP_IOC_GETMOUNTUID32:
+		case NCP_IOC_GETMOUNTUID64:
+		case NCP_IOC_GETROOT:
+		case NCP_IOC_SIGN_WANTED:
+			ret = inode_permission(inode, MAY_READ);
+			if (ret)
+				goto out;
+			break;
+		/*
+		 * Anybody can read these.
+		 */
+		case NCP_IOC_GETCHARSETS:
+		case NCP_IOC_GETDENTRYTTL:
+		default:
+		/* Three codes below are protected by CAP_SYS_ADMIN above. */
+		case NCP_IOC_SETCHARSETS:
+		case NCP_IOC_CONN_LOGGED_IN:
+		case NCP_IOC_SETROOT:
+			break;
 		}
 	}
-	ret = __ncp_ioctl(filp, cmd, arg);
-	if (ncp_ioctl_need_write(cmd))
+	ret = __ncp_ioctl(inode, cmd, arg);
+outDropWrite:
+	if (need_drop_write)
 		mnt_drop_write(filp->f_path.mnt);
-
 out:
-	unlock_kernel();
 	return ret;
 }

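
The rewritten ncp_ioctl() above concentrates all access control in one place: each command is classified up front (CAP_SYS_ADMIN, write access, read access, or no check at all) and only then dispatched to __ncp_ioctl(). A toy classifier showing the shape of that dispatch (the numeric command values and the enum are invented for illustration; the real code switches directly on the NCP_IOC_* constants):

#include <stdio.h>

enum ioctl_class {
	NEEDS_ADMIN,	/* capable(CAP_SYS_ADMIN) */
	NEEDS_WRITE,	/* owner, or MAY_WRITE + mnt_want_write_file() */
	NEEDS_READ,	/* owner, or MAY_READ */
	NEEDS_NOTHING,	/* anybody */
};

static enum ioctl_class classify(unsigned int cmd)
{
	switch (cmd) {
	case 1: return NEEDS_ADMIN;	/* e.g. SETROOT */
	case 2: return NEEDS_WRITE;	/* e.g. NCPREQUEST */
	case 3: return NEEDS_READ;	/* e.g. GETROOT */
	default: return NEEDS_NOTHING;	/* e.g. GETDENTRYTTL */
	}
}

int main(void)
{
	static const char *name[] = { "admin", "write", "read", "nothing" };

	for (unsigned int cmd = 1; cmd <= 4; cmd++)
		printf("cmd %u requires: %s\n", cmd, name[classify(cmd)]);
	return 0;
}
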
@@ -872,10 +911,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	long ret;

-	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
 	ret = ncp_ioctl(file, cmd, arg);
-	unlock_kernel();
 	return ret;
 }
 #endif
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1ee..a7c07b44b100 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
 #include <linux/mman.h>
 #include <linux/string.h>
 #include <linux/fcntl.h>
-#include <linux/ncp_fs.h>

-#include "ncplib_kernel.h"
 #include <asm/uaccess.h>
 #include <asm/system.h>

+#include "ncp_fs.h"
+
 /*
  * Fill in the supplied page for mmap
  * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 000000000000..31831afe1c3b
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
1#include <linux/ncp_fs.h>
2#include "ncp_fs_i.h"
3#include "ncp_fs_sb.h"
4
 5/* defined so that it is easy to change PRINTK into any {*}PRINTK variant */
6#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
7
8#undef NCPFS_PARANOIA
9#ifdef NCPFS_PARANOIA
10#define PPRINTK(format, args...) PRINTK(format , ## args)
11#else
12#define PPRINTK(format, args...)
13#endif
14
15#ifndef DEBUG_NCP
16#define DEBUG_NCP 0
17#endif
18#if DEBUG_NCP > 0
19#define DPRINTK(format, args...) PRINTK(format , ## args)
20#else
21#define DPRINTK(format, args...)
22#endif
23#if DEBUG_NCP > 1
24#define DDPRINTK(format, args...) PRINTK(format , ## args)
25#else
26#define DDPRINTK(format, args...)
27#endif
28
29#define NCP_MAX_RPC_TIMEOUT (6*HZ)
30
31
32struct ncp_entry_info {
33 struct nw_info_struct i;
34 ino_t ino;
35 int opened;
36 int access;
37 unsigned int volume;
38 __u8 file_handle[6];
39};
40
41static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
42{
43 return sb->s_fs_info;
44}
45
46#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb)
47static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
48{
49 return container_of(inode, struct ncp_inode_info, vfs_inode);
50}
51
52/* linux/fs/ncpfs/inode.c */
53int ncp_notify_change(struct dentry *, struct iattr *);
54struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
55void ncp_update_inode(struct inode *, struct ncp_entry_info *);
56void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
57
58/* linux/fs/ncpfs/dir.c */
59extern const struct inode_operations ncp_dir_inode_operations;
60extern const struct file_operations ncp_dir_operations;
61extern const struct dentry_operations ncp_dentry_operations;
62int ncp_conn_logged_in(struct super_block *);
63int ncp_date_dos2unix(__le16 time, __le16 date);
64void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
65
66/* linux/fs/ncpfs/ioctl.c */
67long ncp_ioctl(struct file *, unsigned int, unsigned long);
68long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
69
70/* linux/fs/ncpfs/sock.c */
71int ncp_request2(struct ncp_server *server, int function,
72 void* reply, int max_reply_size);
73static inline int ncp_request(struct ncp_server *server, int function) {
74 return ncp_request2(server, function, server->packet, server->packet_size);
75}
76int ncp_connect(struct ncp_server *server);
77int ncp_disconnect(struct ncp_server *server);
78void ncp_lock_server(struct ncp_server *server);
79void ncp_unlock_server(struct ncp_server *server);
80
81/* linux/fs/ncpfs/symlink.c */
82#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
83extern const struct address_space_operations ncp_symlink_aops;
84int ncp_symlink(struct inode*, struct dentry*, const char*);
85#endif
86
87/* linux/fs/ncpfs/file.c */
88extern const struct inode_operations ncp_file_inode_operations;
89extern const struct file_operations ncp_file_operations;
90int ncp_make_open(struct inode *, int);
91
92/* linux/fs/ncpfs/mmap.c */
93int ncp_mmap(struct file *, struct vm_area_struct *);
94
95/* linux/fs/ncpfs/ncplib_kernel.c */
96int ncp_make_closed(struct inode *);
97
98#include "ncplib_kernel.h"
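
The accessors in this new header tie the ncpfs private state to the generic VFS objects: NCP_SBP() reads it back out of sb->s_fs_info, NCP_SERVER() goes via inode->i_sb, and NCP_FINFO() inverts the embedded-inode layout with container_of(). A short hypothetical routine showing how ncpfs code typically recovers its state from an inode:

/* Hypothetical helper; the accessors and fields are the ones declared
 * in ncp_fs.h, ncp_fs_i.h and ncp_fs_sb.h. */
static int example_private_state(struct inode *inode)
{
	struct ncp_server *server = NCP_SERVER(inode);   /* sb->s_fs_info */
	struct ncp_inode_info *ni = NCP_FINFO(inode);    /* container_of() */

	DPRINTK("vol %u, dirEntNum %u\n",
		ni->volNumber, le32_to_cpu(ni->dirEntNum));
	return ncp_conn_valid(server) ? 0 : -EIO;
}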
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 000000000000..4b0bec477846
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
1/*
2 * ncp_fs_i.h
3 *
4 * Copyright (C) 1995 Volker Lendecke
5 *
6 */
7
8#ifndef _LINUX_NCP_FS_I
9#define _LINUX_NCP_FS_I
10
11/*
12 * This is the ncpfs part of the inode structure. This must contain
13 * all the information we need to work with an inode after creation.
14 */
15struct ncp_inode_info {
16 __le32 dirEntNum;
17 __le32 DosDirNum;
18 __u8 volNumber;
19 __le32 nwattr;
20 struct mutex open_mutex;
21 atomic_t opened;
22 int access;
23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001
25 __u8 file_handle[6];
26 struct inode vfs_inode;
27};
28
29#endif /* _LINUX_NCP_FS_I */
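
struct ncp_inode_info embeds the VFS inode, the usual pattern for per-filesystem inode state: alloc_inode allocates the outer structure and hands the VFS only the embedded member, while container_of() (NCP_FINFO() in ncp_fs.h) maps back. A generic sketch of the allocation side, not the actual ncpfs code:

/* Generic sketch of the embedded-inode pattern; the real ncpfs
 * alloc_inode lives in inode.c and initializes more fields. */
static struct kmem_cache *example_inode_cachep;

static struct inode *example_alloc_inode(struct super_block *sb)
{
	struct ncp_inode_info *ni;

	ni = kmem_cache_alloc(example_inode_cachep, GFP_KERNEL);
	if (!ni)
		return NULL;
	mutex_init(&ni->open_mutex);
	atomic_set(&ni->opened, 0);
	return &ni->vfs_inode;   /* the VFS only ever sees this member */
}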
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 000000000000..4af803f13516
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
1/*
2 * ncp_fs_sb.h
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 *
6 */
7
8#ifndef _NCP_FS_SB
9#define _NCP_FS_SB
10
11#include <linux/types.h>
12#include <linux/ncp_mount.h>
13#include <linux/net.h>
14#include <linux/mutex.h>
15#include <linux/backing-dev.h>
16#include <linux/workqueue.h>
17
18#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */
19
20struct sock;
21
22struct ncp_mount_data_kernel {
23 unsigned long flags; /* NCP_MOUNT_* flags */
24 unsigned int int_flags; /* internal flags */
25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
26 __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */
28 unsigned int ncp_fd; /* The socket to the ncp port */
 29 unsigned int time_out; /* How long should I wait after
 30 sending an NCP request? */
31 unsigned int retry_count; /* And how often should I retry? */
32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
33 __kernel_uid32_t uid;
34 __kernel_gid32_t gid;
35 __kernel_mode_t file_mode;
36 __kernel_mode_t dir_mode;
37 int info_fd;
38};
39
40struct ncp_server {
41
 42 struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
 43 interest to us later, so we store
 44 it completely. */
45
46 __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
47
48 struct file *ncp_filp; /* File pointer to ncp socket */
49 struct socket *ncp_sock;/* ncp socket */
50 struct file *info_filp;
51 struct socket *info_sock;
52
53 u8 sequence;
54 u8 task;
55 u16 connection; /* Remote connection number */
56
57 u8 completion; /* Status message from server */
58 u8 conn_status; /* Bit 4 = 1 ==> Server going down, no
59 requests allowed anymore.
60 Bit 0 = 1 ==> Server is down. */
61
62 int buffer_size; /* Negotiated bufsize */
63
64 int reply_size; /* Size of last reply */
65
66 int packet_size;
67 unsigned char *packet; /* Here we prepare requests and
68 receive replies */
69 unsigned char *txbuf; /* Storage for current request */
70 unsigned char *rxbuf; /* Storage for reply to current request */
71
72 int lock; /* To prevent mismatch in protocols. */
73 struct mutex mutex;
74
75 int current_size; /* for packet preparation */
76 int has_subfunction;
77 int ncp_reply_size;
78
79 int root_setuped;
80 struct mutex root_setup_lock;
81
82 /* info for packet signing */
83 int sign_wanted; /* 1=Server needs signed packets */
84 int sign_active; /* 0=don't do signing, 1=do */
85 char sign_root[8]; /* generated from password and encr. key */
86 char sign_last[16];
87
88 /* Authentication info: NDS or BINDERY, username */
89 struct {
90 int auth_type;
91 size_t object_name_len;
92 void* object_name;
93 int object_type;
94 } auth;
95 /* Password info */
96 struct {
97 size_t len;
98 void* data;
99 } priv;
100 struct rw_semaphore auth_rwsem;
101
102 /* nls info: codepage for volume and charset for I/O */
103 struct nls_table *nls_vol;
104 struct nls_table *nls_io;
105
106 /* maximum age in jiffies */
107 atomic_t dentry_ttl;
108
109 /* miscellaneous */
110 unsigned int flags;
111
 112 spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq in STREAM mode */
113
114 void (*data_ready)(struct sock* sk, int len);
115 void (*error_report)(struct sock* sk);
116 void (*write_space)(struct sock* sk); /* STREAM mode only */
117 struct {
118 struct work_struct tq; /* STREAM/DGRAM: data/error ready */
119 struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
120 struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */
121
122 unsigned int state; /* STREAM only: receiver state */
123 struct {
124 __u32 magic __packed;
125 __u32 len __packed;
126 __u16 type __packed;
127 __u16 p1 __packed;
128 __u16 p2 __packed;
129 __u16 p3 __packed;
130 __u16 type2 __packed;
131 } buf; /* STREAM only: temporary buffer */
132 unsigned char* ptr; /* STREAM only: pointer to data */
133 size_t len; /* STREAM only: length of data to receive */
134 } rcv;
135 struct {
136 struct list_head requests; /* STREAM only: queued requests */
137 struct work_struct tq; /* STREAM only: transmitter ready */
138 struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
139 } tx;
140 struct timer_list timeout_tm; /* DGRAM only: timeout timer */
141 struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */
142 int timeout_last; /* DGRAM only: current timeout length */
143 int timeout_retries; /* DGRAM only: retries left */
144 struct {
145 size_t len;
146 __u8 data[128];
147 } unexpected_packet;
148 struct backing_dev_info bdi;
149};
150
151extern void ncp_tcp_rcv_proc(struct work_struct *work);
152extern void ncp_tcp_tx_proc(struct work_struct *work);
153extern void ncpdgram_rcv_proc(struct work_struct *work);
154extern void ncpdgram_timeout_proc(struct work_struct *work);
155extern void ncpdgram_timeout_call(unsigned long server);
156extern void ncp_tcp_data_ready(struct sock* sk, int len);
157extern void ncp_tcp_write_space(struct sock* sk);
158extern void ncp_tcp_error_report(struct sock* sk);
159
160#define NCP_FLAG_UTF8 1
161
162#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag))
163#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag))
164#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag))
165
166static inline int ncp_conn_valid(struct ncp_server *server)
167{
168 return ((server->conn_status & 0x11) == 0);
169}
170
171static inline void ncp_invalidate_conn(struct ncp_server *server)
172{
173 server->conn_status |= 0x01;
174}
175
176#endif
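
ncp_conn_valid() above treats conn_status as a bit set, matching the comment in struct ncp_server: bit 0 (0x01) means the server is down, bit 4 (0x10) means it is going down and rejecting new requests, and the 0x11 mask tests both at once. Spelled out:

/* conn_status bit semantics, per the struct ncp_server comment above. */
server->conn_status = 0x00;   /* fresh connection                              */
                              /* ncp_conn_valid(): (0x00 & 0x11) == 0, valid   */
server->conn_status = 0x10;   /* server announced it is going down             */
                              /* ncp_conn_valid(): (0x10 & 0x11) != 0, invalid */
ncp_invalidate_conn(server);  /* ORs in 0x01: server now treated as down       */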
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a5970..981a95617fc9 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
11 11
12 12
13 13
14#include "ncplib_kernel.h" 14#include "ncp_fs.h"
15 15
16static inline void assert_server_locked(struct ncp_server *server) 16static inline void assert_server_locked(struct ncp_server *server)
17{ 17{
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]); 107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
108} 108}
109 109
110static inline u8 BVAL(void *data) 110static inline u8 BVAL(const void *data)
111{ 111{
112 return *(u8 *)data; 112 return *(const u8 *)data;
113} 113}
114 114
115static u8 ncp_reply_byte(struct ncp_server *server, int offset) 115static u8 ncp_reply_byte(struct ncp_server *server, int offset)
116{ 116{
117 return *(u8 *)ncp_reply_data(server, offset); 117 return *(const u8 *)ncp_reply_data(server, offset);
118} 118}
119 119
120static inline u16 WVAL_LH(void *data) 120static inline u16 WVAL_LH(const void *data)
121{ 121{
122 return get_unaligned_le16(data); 122 return get_unaligned_le16(data);
123} 123}
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
134 return get_unaligned_be16(ncp_reply_data(server, offset)); 134 return get_unaligned_be16(ncp_reply_data(server, offset));
135} 135}
136 136
137static inline u32 DVAL_LH(void *data) 137static inline u32 DVAL_LH(const void *data)
138{ 138{
139 return get_unaligned_le32(data); 139 return get_unaligned_le32(data);
140} 140}
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
349 return result; 349 return result;
350} 350}
351 351
352void ncp_extract_file_info(void *structure, struct nw_info_struct *target) 352void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
353{ 353{
354 __u8 *name_len; 354 const __u8 *name_len;
355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen); 355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
356 356
357 memcpy(target, structure, info_struct_size); 357 memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
364} 364}
365 365
366#ifdef CONFIG_NCPFS_NFS_NS 366#ifdef CONFIG_NCPFS_NFS_NS
367static inline void ncp_extract_nfs_info(unsigned char *structure, 367static inline void ncp_extract_nfs_info(const unsigned char *structure,
368 struct nw_nfs_info *target) 368 struct nw_nfs_info *target)
369{ 369{
370 target->mode = DVAL_LH(structure); 370 target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
417 * Returns information for a (one-component) name relative to 417 * Returns information for a (one-component) name relative to
418 * the specified directory. 418 * the specified directory.
419 */ 419 */
420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path, 420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
421 struct nw_info_struct *target) 421 struct nw_info_struct *target)
422{ 422{
423 __u8 volnum = NCP_FINFO(dir)->volNumber; 423 __u8 volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
452#ifdef CONFIG_NCPFS_NFS_NS 452#ifdef CONFIG_NCPFS_NFS_NS
453static int 453static int
454ncp_obtain_DOS_dir_base(struct ncp_server *server, 454ncp_obtain_DOS_dir_base(struct ncp_server *server,
455 __u8 volnum, __le32 dirent, 455 __u8 ns, __u8 volnum, __le32 dirent,
456 char *path, /* At most 1 component */ 456 const char *path, /* At most 1 component */
457 __le32 *DOS_dir_base) 457 __le32 *DOS_dir_base)
458{ 458{
459 int result; 459 int result;
460 460
461 ncp_init_request(server); 461 ncp_init_request(server);
462 ncp_add_byte(server, 6); /* subfunction */ 462 ncp_add_byte(server, 6); /* subfunction */
463 ncp_add_byte(server, server->name_space[volnum]); 463 ncp_add_byte(server, ns);
464 ncp_add_byte(server, server->name_space[volnum]); 464 ncp_add_byte(server, ns);
465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ 465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
466 ncp_add_dword(server, RIM_DIRECTORY); 466 ncp_add_dword(server, RIM_DIRECTORY);
467 ncp_add_handle_path(server, volnum, dirent, 1, path); 467 ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ 523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
524} 524}
525 525
526int
527ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
528{
529 int ns = ncp_get_known_namespace(server, volume);
530
531 if (ret_ns)
532 *ret_ns = ns;
533
534 DPRINTK("lookup_vol: namespace[%d] = %d\n",
535 volume, server->name_space[volume]);
536
537 if (server->name_space[volume] == ns)
538 return 0;
539 server->name_space[volume] = ns;
540 return 1;
541}
542
526static int 543static int
527ncp_ObtainSpecificDirBase(struct ncp_server *server, 544ncp_ObtainSpecificDirBase(struct ncp_server *server,
528 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, 545 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
529 char *path, /* At most 1 component */ 546 const char *path, /* At most 1 component */
530 __le32 *dirEntNum, __le32 *DosDirNum) 547 __le32 *dirEntNum, __le32 *DosDirNum)
531{ 548{
532 int result; 549 int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
560{ 577{
561 int dstNS; 578 int dstNS;
562 int result; 579 int result;
563 580
564 dstNS = ncp_get_known_namespace(server, volNumber); 581 ncp_update_known_namespace(server, volNumber, &dstNS);
565 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 582 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber,
566 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) 583 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
567 { 584 {
568 return result; 585 return result;
569 } 586 }
570 server->name_space[volNumber] = dstNS;
571 *volume = volNumber; 587 *volume = volNumber;
572 server->m.mounted_vol[1] = 0; 588 server->m.mounted_vol[1] = 0;
573 server->m.mounted_vol[0] = 'X'; 589 server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
575} 591}
576 592
577int 593int
578ncp_get_volume_root(struct ncp_server *server, const char *volname, 594ncp_get_volume_root(struct ncp_server *server,
579 __u32* volume, __le32* dirent, __le32* dosdirent) 595 const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
580{ 596{
581 int result; 597 int result;
582 __u8 volnum;
583 598
584 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); 599 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
585 600
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
601 return result; 616 return result;
602 } 617 }
603 *dirent = *dosdirent = ncp_reply_dword(server, 4); 618 *dirent = *dosdirent = ncp_reply_dword(server, 4);
604 volnum = ncp_reply_byte(server, 8); 619 *volume = ncp_reply_byte(server, 8);
605 ncp_unlock_server(server); 620 ncp_unlock_server(server);
606 *volume = volnum;
607
608 server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
609
610 DPRINTK("lookup_vol: namespace[%d] = %d\n",
611 volnum, server->name_space[volnum]);
612
613 return 0; 621 return 0;
614} 622}
615 623
616int 624int
617ncp_lookup_volume(struct ncp_server *server, const char *volname, 625ncp_lookup_volume(struct ncp_server *server,
618 struct nw_info_struct *target) 626 const char *volname, struct nw_info_struct *target)
619{ 627{
620 int result; 628 int result;
621 629
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
625 if (result) { 633 if (result) {
626 return result; 634 return result;
627 } 635 }
636 ncp_update_known_namespace(server, target->volNumber, NULL);
628 target->nameLen = strlen(volname); 637 target->nameLen = strlen(volname);
629 memcpy(target->entryName, volname, target->nameLen+1); 638 memcpy(target->entryName, volname, target->nameLen+1);
630 target->attributes = aDIR; 639 target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
676{ 685{
677 int result = 0; 686 int result = 0;
678 687
688 ncp_init_request(server);
679 if (server->name_space[volnum] == NW_NS_NFS) { 689 if (server->name_space[volnum] == NW_NS_NFS) {
680 ncp_init_request(server);
681 ncp_add_byte(server, 25); /* subfunction */ 690 ncp_add_byte(server, 25); /* subfunction */
682 ncp_add_byte(server, server->name_space[volnum]); 691 ncp_add_byte(server, server->name_space[volnum]);
683 ncp_add_byte(server, NW_NS_NFS); 692 ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
690 ncp_add_dword_lh(server, 1); /* nlinks */ 699 ncp_add_dword_lh(server, 1); /* nlinks */
691 ncp_add_dword_lh(server, rdev); 700 ncp_add_dword_lh(server, rdev);
692 result = ncp_request(server, 87); 701 result = ncp_request(server, 87);
693 ncp_unlock_server(server);
694 } 702 }
703 ncp_unlock_server(server);
695 return result; 704 return result;
696} 705}
697#endif 706#endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
700static int 709static int
701ncp_DeleteNSEntry(struct ncp_server *server, 710ncp_DeleteNSEntry(struct ncp_server *server,
702 __u8 have_dir_base, __u8 volnum, __le32 dirent, 711 __u8 have_dir_base, __u8 volnum, __le32 dirent,
703 char* name, __u8 ns, __le16 attr) 712 const char* name, __u8 ns, __le16 attr)
704{ 713{
705 int result; 714 int result;
706 715
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
734 743
735int 744int
736ncp_del_file_or_subdir(struct ncp_server *server, 745ncp_del_file_or_subdir(struct ncp_server *server,
737 struct inode *dir, char *name) 746 struct inode *dir, const char *name)
738{ 747{
739 __u8 volnum = NCP_FINFO(dir)->volNumber; 748 __u8 volnum = NCP_FINFO(dir)->volNumber;
740 __le32 dirent = NCP_FINFO(dir)->dirEntNum; 749 __le32 dirent = NCP_FINFO(dir)->dirEntNum;
750 int name_space;
741 751
752 name_space = server->name_space[volnum];
742#ifdef CONFIG_NCPFS_NFS_NS 753#ifdef CONFIG_NCPFS_NFS_NS
743 if (server->name_space[volnum]==NW_NS_NFS) 754 if (name_space == NW_NS_NFS)
744 { 755 {
745 int result; 756 int result;
746 757
747 result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent); 758 result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
748 if (result) return result; 759 if (result) return result;
749 return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); 760 name = NULL;
761 name_space = NW_NS_DOS;
750 } 762 }
751 else
752#endif /* CONFIG_NCPFS_NFS_NS */ 763#endif /* CONFIG_NCPFS_NFS_NS */
753 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006)); 764 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
754} 765}
755 766
756static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) 767static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
765/* If both dir and name are NULL, then in target there's already a 776/* If both dir and name are NULL, then in target there's already a
766 looked-up entry that wants to be opened. */ 777 looked-up entry that wants to be opened. */
767int ncp_open_create_file_or_subdir(struct ncp_server *server, 778int ncp_open_create_file_or_subdir(struct ncp_server *server,
768 struct inode *dir, char *name, 779 struct inode *dir, const char *name,
769 int open_create_mode, 780 int open_create_mode,
770 __le32 create_attributes, 781 __le32 create_attributes,
771 __le16 desired_acc_rights, 782 __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
890 901
891static int 902static int
892ncp_RenameNSEntry(struct ncp_server *server, 903ncp_RenameNSEntry(struct ncp_server *server,
893 struct inode *old_dir, char *old_name, __le16 old_type, 904 struct inode *old_dir, const char *old_name, __le16 old_type,
894 struct inode *new_dir, char *new_name) 905 struct inode *new_dir, const char *new_name)
895{ 906{
896 int result = -EINVAL; 907 int result = -EINVAL;
897 908
@@ -929,8 +940,8 @@ out:
929} 940}
930 941
931int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 942int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
932 struct inode *old_dir, char *old_name, 943 struct inode *old_dir, const char *old_name,
933 struct inode *new_dir, char *new_name) 944 struct inode *new_dir, const char *new_name)
934{ 945{
935 int result; 946 int result;
936 __le16 old_type = cpu_to_le16(0x06); 947 __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
958ncp_read_kernel(struct ncp_server *server, const char *file_id, 969ncp_read_kernel(struct ncp_server *server, const char *file_id,
959 __u32 offset, __u16 to_read, char *target, int *bytes_read) 970 __u32 offset, __u16 to_read, char *target, int *bytes_read)
960{ 971{
961 char *source; 972 const char *source;
962 int result; 973 int result;
963 974
964 ncp_init_request(server); 975 ncp_init_request(server);
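
Most of this hunk is const-propagation through the reply-buffer accessors; BVAL/WVAL_LH/DVAL_LH wrap get_unaligned_le16/32 because NCP replies are raw byte streams whose multi-byte fields can start at any offset. A minimal sketch of why the wrappers exist:

#include <asm/unaligned.h>

/* A field inside an NCP reply carries no alignment guarantee, so a
 * plain (u32 *) cast would fault on strict-alignment architectures;
 * get_unaligned_le32() reads it safely and converts from little endian. */
static u32 example_read_field(const unsigned char *reply, int offset)
{
	return get_unaligned_le32(reply + offset);
}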
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57dc..09881e6aa5ad 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#endif /* CONFIG_NCPFS_NLS */ 33#endif /* CONFIG_NCPFS_NLS */
34 34
35#include <linux/ncp_fs.h>
36
37#define NCP_MIN_SYMLINK_SIZE 8 35#define NCP_MIN_SYMLINK_SIZE 8
38#define NCP_MAX_SYMLINK_SIZE 512 36#define NCP_MAX_SYMLINK_SIZE 512
39 37
@@ -65,10 +63,11 @@ static inline void ncp_inode_close(struct inode *inode) {
65 atomic_dec(&NCP_FINFO(inode)->opened); 63 atomic_dec(&NCP_FINFO(inode)->opened);
66} 64}
67 65
68void ncp_extract_file_info(void* src, struct nw_info_struct* target); 66void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
69int ncp_obtain_info(struct ncp_server *server, struct inode *, char *, 67int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
70 struct nw_info_struct *target); 68 struct nw_info_struct *target);
71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); 69int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
70int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
72int ncp_get_volume_root(struct ncp_server *server, const char *volname, 71int ncp_get_volume_root(struct ncp_server *server, const char *volname,
73 __u32 *volume, __le32 *dirent, __le32 *dosdirent); 72 __u32 *volume, __le32 *dirent, __le32 *dosdirent);
74int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); 73int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +79,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
80 __u32 mode, __u32 rdev); 79 __u32 mode, __u32 rdev);
81 80
82int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); 81int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
83int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *); 82int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
84int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *, 83int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
85 int, __le32, __le16, struct ncp_entry_info *); 84 int, __le32, __le16, struct ncp_entry_info *);
86 85
87int ncp_initialize_search(struct ncp_server *, struct inode *, 86int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +92,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
93 char** rbuf, size_t* rsize); 92 char** rbuf, size_t* rsize);
94 93
95int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 94int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
96 struct inode *, char *, struct inode *, char *); 95 struct inode *, const char *, struct inode *, const char *);
97 96
98 97
99int 98int
@@ -134,7 +133,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
134 const unsigned char *, unsigned int, int); 133 const unsigned char *, unsigned int, int);
135 134
136#define NCP_ESC ':' 135#define NCP_ESC ':'
137#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) 136#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io)
138#define ncp_tolower(t, c) nls_tolower(t, c) 137#define ncp_tolower(t, c) nls_tolower(t, c)
139#define ncp_toupper(t, c) nls_toupper(t, c) 138#define ncp_toupper(t, c) nls_toupper(t, c)
140#define ncp_strnicmp(t, s1, s2, len) \ 139#define ncp_strnicmp(t, s1, s2, len) \
@@ -149,15 +148,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
149int ncp__vol2io(unsigned char *, unsigned int *, 148int ncp__vol2io(unsigned char *, unsigned int *,
150 const unsigned char *, unsigned int, int); 149 const unsigned char *, unsigned int, int);
151 150
152#define NCP_IO_TABLE(dentry) NULL 151#define NCP_IO_TABLE(sb) NULL
153#define ncp_tolower(t, c) tolower(c) 152#define ncp_tolower(t, c) tolower(c)
154#define ncp_toupper(t, c) toupper(c) 153#define ncp_toupper(t, c) toupper(c)
155#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) 154#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U)
156#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) 155#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U)
157 156
158 157
159static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, 158static inline int ncp_strnicmp(const struct nls_table *t,
160 const unsigned char *s2, int len) 159 const unsigned char *s1, const unsigned char *s2, int len)
161{ 160{
162 while (len--) { 161 while (len--) {
163 if (tolower(*s1++) != tolower(*s2++)) 162 if (tolower(*s1++) != tolower(*s2++))
@@ -170,13 +169,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
170#endif /* CONFIG_NCPFS_NLS */ 169#endif /* CONFIG_NCPFS_NLS */
171 170
172#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) 171#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time)
173#define NCP_MAX_AGE(server) ((server)->dentry_ttl) 172#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl)
174#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) 173#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
175 174
176static inline void 175static inline void
177ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) 176ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
178{ 177{
179 dentry->d_time = jiffies - server->dentry_ttl; 178 dentry->d_time = jiffies - NCP_MAX_AGE(server);
180} 179}
181 180
182static inline void 181static inline void
@@ -192,7 +191,7 @@ ncp_renew_dentries(struct dentry *parent)
192 struct list_head *next; 191 struct list_head *next;
193 struct dentry *dentry; 192 struct dentry *dentry;
194 193
195 spin_lock(&dcache_lock); 194 spin_lock(&parent->d_lock);
196 next = parent->d_subdirs.next; 195 next = parent->d_subdirs.next;
197 while (next != &parent->d_subdirs) { 196 while (next != &parent->d_subdirs) {
198 dentry = list_entry(next, struct dentry, d_u.d_child); 197 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -204,7 +203,7 @@ ncp_renew_dentries(struct dentry *parent)
204 203
205 next = next->next; 204 next = next->next;
206 } 205 }
207 spin_unlock(&dcache_lock); 206 spin_unlock(&parent->d_lock);
208} 207}
209 208
210static inline void 209static inline void
@@ -214,7 +213,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
214 struct list_head *next; 213 struct list_head *next;
215 struct dentry *dentry; 214 struct dentry *dentry;
216 215
217 spin_lock(&dcache_lock); 216 spin_lock(&parent->d_lock);
218 next = parent->d_subdirs.next; 217 next = parent->d_subdirs.next;
219 while (next != &parent->d_subdirs) { 218 while (next != &parent->d_subdirs) {
220 dentry = list_entry(next, struct dentry, d_u.d_child); 219 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -222,7 +221,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
222 ncp_age_dentry(server, dentry); 221 ncp_age_dentry(server, dentry);
223 next = next->next; 222 next = next->next;
224 } 223 }
225 spin_unlock(&dcache_lock); 224 spin_unlock(&parent->d_lock);
226} 225}
227 226
228struct ncp_cache_head { 227struct ncp_cache_head {
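
The dentry aging above is plain jiffies arithmetic: d_time records when an entry was last validated, NCP_GET_AGE() is the elapsed time, NCP_TEST_AGE() compares it against the server TTL (now read atomically, since it can change at runtime via NCP_IOC_SETDENTRYTTL), and ncp_age_dentry() backdates d_time by one full TTL so the entry fails its next test. Restated in open code:

/* Open-coded restatement of the TTL macros above; 'server' and
 * 'dentry' are the usual ncpfs objects. */
unsigned long age = jiffies - dentry->d_time;            /* NCP_GET_AGE  */
unsigned long ttl = atomic_read(&server->dentry_ttl);    /* NCP_MAX_AGE  */
int still_fresh  = age < ttl;                            /* NCP_TEST_AGE */

dentry->d_time = jiffies - ttl;  /* ncp_age_dentry: force revalidation */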
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6cf..08907599dcd2 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,25 +11,26 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/ncp.h> 12#include <linux/ncp.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include "ncp_fs.h"
14#include "ncpsign_kernel.h" 15#include "ncpsign_kernel.h"
15 16
16/* i386: 32-bit, little endian, handles mis-alignment */ 17/* i386: 32-bit, little endian, handles mis-alignment */
17#ifdef __i386__ 18#ifdef __i386__
18#define GET_LE32(p) (*(int *)(p)) 19#define GET_LE32(p) (*(const int *)(p))
19#define PUT_LE32(p,v) { *(int *)(p)=v; } 20#define PUT_LE32(p,v) { *(int *)(p)=v; }
20#else 21#else
21/* from include/ncplib.h */ 22/* from include/ncplib.h */
22#define BVAL(buf,pos) (((__u8 *)(buf))[pos]) 23#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) 24#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
24#define BSET(buf,pos,val) (BVAL(buf,pos) = (val)) 25#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
25 26
26static inline __u16 27static inline __u16
27WVAL_LH(__u8 * buf, int pos) 28WVAL_LH(const __u8 * buf, int pos)
28{ 29{
29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; 30 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
30} 31}
31static inline __u32 32static inline __u32
32DVAL_LH(__u8 * buf, int pos) 33DVAL_LH(const __u8 * buf, int pos)
33{ 34{
34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; 35 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
35} 36}
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381cc..d9a1438bb1f6 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
8#ifndef _NCPSIGN_KERNEL_H 8#ifndef _NCPSIGN_KERNEL_H
9#define _NCPSIGN_KERNEL_H 9#define _NCPSIGN_KERNEL_H
10 10
11#include <linux/ncp_fs.h>
12
13#ifdef CONFIG_NCPFS_PACKET_SIGNING 11#ifdef CONFIG_NCPFS_PACKET_SIGNING
14void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); 12void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
15int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); 13int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6e..3a1587222c8a 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
28#include <linux/poll.h> 28#include <linux/poll.h>
29#include <linux/file.h> 29#include <linux/file.h>
30 30
31#include <linux/ncp_fs.h> 31#include "ncp_fs.h"
32 32
33#include "ncpsign_kernel.h" 33#include "ncpsign_kernel.h"
34 34
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
746 return -EIO; 746 return -EIO;
747 } 747 }
748 if (!ncp_conn_valid(server)) { 748 if (!ncp_conn_valid(server)) {
749 printk(KERN_ERR "ncpfs: Connection invalid!\n");
750 return -EIO; 749 return -EIO;
751 } 750 }
752 { 751 {
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b337..661f861d80c6 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
25 25
26#include <linux/errno.h> 26#include <linux/errno.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h>
29#include <linux/time.h> 28#include <linux/time.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
33#include "ncplib_kernel.h" 32#include "ncp_fs.h"
34
35 33
36/* these magic numbers must appear in the symlink file -- this makes it a bit 34/* these magic numbers must appear in the symlink file -- this makes it a bit
37 more resilient against the magic attributes being set on random files. */ 35 more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -76,13 +76,17 @@ config NFS_V4
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select PNFS_FILE_LAYOUT
80 help 81 help
81 This option enables support for minor version 1 of the NFSv4 protocol 82 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 83 (RFC 5661) in the kernel's NFS client.
83 84
84 If unsure, say N. 85 If unsure, say N.
85 86
87config PNFS_FILE_LAYOUT
88 tristate
89
86config ROOT_NFS 90config ROOT_NFS
87 bool "Root file system on NFS" 91 bool "Root file system on NFS"
88 depends on NFS_FS=y && IP_PNP 92 depends on NFS_FS=y && IP_PNP
@@ -117,3 +121,14 @@ config NFS_USE_KERNEL_DNS
117 select DNS_RESOLVER 121 select DNS_RESOLVER
118 select KEYS 122 select KEYS
119 default y 123 default y
124
125config NFS_USE_NEW_IDMAPPER
126 bool "Use the new idmapper upcall routine"
127 depends on NFS_V4 && KEYS
128 help
129 Say Y here if you want NFS to use the new idmapper upcall functions.
130 You will need /sbin/request-key (usually provided by the keyutils
131 package). For details, read
132 <file:Documentation/filesystems/nfs/idmapper.txt>.
133
134 If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o
18nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
19nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..e3d294269058 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/ip.h> 10#include <linux/ip.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/smp_lock.h>
13#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
14#include <linux/sunrpc/svcsock.h> 13#include <linux/sunrpc/svcsock.h>
15#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -17,9 +16,7 @@
17#include <linux/freezer.h> 16#include <linux/freezer.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h> 18#include <linux/sunrpc/svcauth_gss.h>
20#if defined(CONFIG_NFS_V4_1)
21#include <linux/sunrpc/bc_xprt.h> 19#include <linux/sunrpc/bc_xprt.h>
22#endif
23 20
24#include <net/inet_sock.h> 21#include <net/inet_sock.h>
25 22
@@ -109,7 +106,7 @@ nfs4_callback_up(struct svc_serv *serv)
109{ 106{
110 int ret; 107 int ret;
111 108
112 ret = svc_create_xprt(serv, "tcp", PF_INET, 109 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 110 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
114 if (ret <= 0) 111 if (ret <= 0)
115 goto out_err; 112 goto out_err;
@@ -117,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv)
117 dprintk("NFS: Callback listener port = %u (af %u)\n", 114 dprintk("NFS: Callback listener port = %u (af %u)\n",
118 nfs_callback_tcpport, PF_INET); 115 nfs_callback_tcpport, PF_INET);
119 116
120 ret = svc_create_xprt(serv, "tcp", PF_INET6, 117 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 118 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
122 if (ret > 0) { 119 if (ret > 0) {
123 nfs_callback_tcpport6 = ret; 120 nfs_callback_tcpport6 = ret;
@@ -178,30 +175,38 @@ nfs41_callback_svc(void *vrqstp)
178struct svc_rqst * 175struct svc_rqst *
179nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
180{ 177{
181 struct svc_xprt *bc_xprt; 178 struct svc_rqst *rqstp;
182 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); 179 int ret;
183 180
184 dprintk("--> %s\n", __func__); 181 /*
185 /* Create a svc_sock for the service */ 182 * Create an svc_sock for the back channel service that shares the
186 bc_xprt = svc_sock_create(serv, xprt->prot); 183 * fore channel connection.
187 if (!bc_xprt) 184 * Returns the input port (0) and sets the svc_serv bc_xprt on success
185 */
186 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
187 SVC_SOCK_ANONYMOUS);
188 if (ret < 0) {
189 rqstp = ERR_PTR(ret);
188 goto out; 190 goto out;
191 }
189 192
190 /* 193 /*
191 * Save the svc_serv in the transport so that it can 194 * Save the svc_serv in the transport so that it can
192 * be referenced when the session backchannel is initialized 195 * be referenced when the session backchannel is initialized
193 */ 196 */
194 serv->bc_xprt = bc_xprt;
195 xprt->bc_serv = serv; 197 xprt->bc_serv = serv;
196 198
197 INIT_LIST_HEAD(&serv->sv_cb_list); 199 INIT_LIST_HEAD(&serv->sv_cb_list);
198 spin_lock_init(&serv->sv_cb_lock); 200 spin_lock_init(&serv->sv_cb_lock);
199 init_waitqueue_head(&serv->sv_cb_waitq); 201 init_waitqueue_head(&serv->sv_cb_waitq);
200 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 202 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
201 if (IS_ERR(rqstp)) 203 if (IS_ERR(rqstp)) {
202 svc_sock_destroy(bc_xprt); 204 svc_xprt_put(serv->sv_bc_xprt);
205 serv->sv_bc_xprt = NULL;
206 }
203out: 207out:
204 dprintk("--> %s return %p\n", __func__, rqstp); 208 dprintk("--> %s return %ld\n", __func__,
209 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
205 return rqstp; 210 return rqstp;
206} 211}
207 212
@@ -323,58 +328,58 @@ void nfs_callback_down(int minorversion)
323 mutex_unlock(&nfs_callback_mutex); 328 mutex_unlock(&nfs_callback_mutex);
324} 329}
325 330
326static int check_gss_callback_principal(struct nfs_client *clp, 331/* Boolean check of RPC_AUTH_GSS principal */
327 struct svc_rqst *rqstp) 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
328{ 334{
329 struct rpc_clnt *r = clp->cl_rpcclient; 335 struct rpc_clnt *r = clp->cl_rpcclient;
330 char *p = svc_gss_principal(rqstp); 336 char *p = svc_gss_principal(rqstp);
331 337
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
339 return 1;
340
341 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
342 if (clp->cl_minorversion != 0)
343 return 0;
332 /* 344 /*
333 * It might just be a normal user principal, in which case 345 * It might just be a normal user principal, in which case
334 * userspace won't bother to tell us the name at all. 346 * userspace won't bother to tell us the name at all.
335 */ 347 */
336 if (p == NULL) 348 if (p == NULL)
337 return SVC_DENIED; 349 return 0;
338 350
339 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 351 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
340 352
341 if (memcmp(p, "nfs@", 4) != 0) 353 if (memcmp(p, "nfs@", 4) != 0)
342 return SVC_DENIED; 354 return 0;
343 p += 4; 355 p += 4;
344 if (strcmp(p, r->cl_server) != 0) 356 if (strcmp(p, r->cl_server) != 0)
345 return SVC_DENIED; 357 return 0;
346 return SVC_OK; 358 return 1;
347} 359}
348 360
361/*
362 * pg_authenticate method for nfsv4 callback threads.
363 *
364 * The authflavor has been negotiated, so an incorrect flavor is a server
365 * bug. Drop packets with incorrect authflavor.
366 *
 367 * All other checking is done after NFS decoding, where the nfs_client can be
368 * found in nfs4_callback_compound
369 */
349static int nfs_callback_authenticate(struct svc_rqst *rqstp) 370static int nfs_callback_authenticate(struct svc_rqst *rqstp)
350{ 371{
351 struct nfs_client *clp;
352 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
353 int ret = SVC_OK;
354
355 /* Don't talk to strangers */
356 clp = nfs_find_client(svc_addr(rqstp), 4);
357 if (clp == NULL)
358 return SVC_DROP;
359
360 dprintk("%s: %s NFSv4 callback!\n", __func__,
361 svc_print_addr(rqstp, buf, sizeof(buf)));
362
363 switch (rqstp->rq_authop->flavour) { 372 switch (rqstp->rq_authop->flavour) {
364 case RPC_AUTH_NULL: 373 case RPC_AUTH_NULL:
365 if (rqstp->rq_proc != CB_NULL) 374 if (rqstp->rq_proc != CB_NULL)
366 ret = SVC_DENIED; 375 return SVC_DROP;
367 break; 376 break;
368 case RPC_AUTH_UNIX: 377 case RPC_AUTH_GSS:
369 break; 378 /* No RPC_AUTH_GSS support yet in NFSv4.1 */
370 case RPC_AUTH_GSS: 379 if (svc_is_backchannel(rqstp))
371 ret = check_gss_callback_principal(clp, rqstp); 380 return SVC_DROP;
372 break;
373 default:
374 ret = SVC_DENIED;
375 } 381 }
376 nfs_put_client(clp); 382 return SVC_OK;
377 return ret;
378} 383}
379 384
380/* 385/*
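
check_gss_callback_principal() now returns a plain boolean instead of an SVC_* verdict, and accepts a callback only when the GSS principal is a host-based name of the form "nfs@serverhostname" matching the client's server (a missing principal, normal for user principals, is simply rejected). The string check in isolation:

/* Isolated sketch of the "nfs@serverhostname" check above;
 * cl_server corresponds to clp->cl_rpcclient->cl_server. */
static int principal_matches(const char *p, const char *cl_server)
{
	if (p == NULL)                   /* normal user principal: no name given */
		return 0;
	if (memcmp(p, "nfs@", 4) != 0)   /* expect GSS_C_NT_HOSTBASED_NAME */
		return 0;
	return strcmp(p + 4, cl_server) == 0;
}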
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8dd..46d93ce7311b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#ifndef __LINUX_FS_NFS_CALLBACK_H 8#ifndef __LINUX_FS_NFS_CALLBACK_H
9#define __LINUX_FS_NFS_CALLBACK_H 9#define __LINUX_FS_NFS_CALLBACK_H
10#include <linux/sunrpc/svc.h>
10 11
11#define NFS4_CALLBACK 0x40000000 12#define NFS4_CALLBACK 0x40000000
12#define NFS4_CALLBACK_XDRSIZE 2048 13#define NFS4_CALLBACK_XDRSIZE 2048
@@ -34,10 +35,16 @@ enum nfs4_callback_opnum {
34 OP_CB_ILLEGAL = 10044, 35 OP_CB_ILLEGAL = 10044,
35}; 36};
36 37
38struct cb_process_state {
39 __be32 drc_status;
40 struct nfs_client *clp;
41};
42
37struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
38 unsigned int taglen; 44 unsigned int taglen;
39 const char *tag; 45 const char *tag;
40 unsigned int minorversion; 46 unsigned int minorversion;
47 unsigned int cb_ident; /* v4.0 callback identifier */
41 unsigned nops; 48 unsigned nops;
42}; 49};
43 50
@@ -103,14 +110,23 @@ struct cb_sequenceres {
103 uint32_t csr_target_highestslotid; 110 uint32_t csr_target_highestslotid;
104}; 111};
105 112
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 113extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 114 struct cb_sequenceres *res,
115 struct cb_process_state *cps);
108 116
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, 117extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid); 118 const nfs4_stateid *stateid);
111 119
112#define RCA4_TYPE_MASK_RDATA_DLG 0 120#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1 121#define RCA4_TYPE_MASK_WDATA_DLG 1
122#define RCA4_TYPE_MASK_DIR_DLG 2
123#define RCA4_TYPE_MASK_FILE_LAYOUT 3
124#define RCA4_TYPE_MASK_BLK_LAYOUT 4
125#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
126#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
127#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
128#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
129#define RCA4_TYPE_MASK_ALL 0xf31f
114 130
115struct cb_recallanyargs { 131struct cb_recallanyargs {
116 struct sockaddr *craa_addr; 132 struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
118 uint32_t craa_type_mask; 134 uint32_t craa_type_mask;
119}; 135};
120 136
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 137extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
138 void *dummy,
139 struct cb_process_state *cps);
122 140
123struct cb_recallslotargs { 141struct cb_recallslotargs {
124 struct sockaddr *crsa_addr; 142 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots; 143 uint32_t crsa_target_max_slots;
126}; 144};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, 145extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy); 146 void *dummy,
129 147 struct cb_process_state *cps);
130#endif /* CONFIG_NFS_V4_1 */ 148
149struct cb_layoutrecallargs {
150 struct sockaddr *cbl_addr;
151 uint32_t cbl_recall_type;
152 uint32_t cbl_layout_type;
153 uint32_t cbl_layoutchanged;
154 union {
155 struct {
156 struct nfs_fh cbl_fh;
157 struct pnfs_layout_range cbl_range;
158 nfs4_stateid cbl_stateid;
159 };
160 struct nfs_fsid cbl_fsid;
161 };
162};
131 163
132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 164extern unsigned nfs4_callback_layoutrecall(
133extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 165 struct cb_layoutrecallargs *args,
166 void *dummy, struct cb_process_state *cps);
134 167
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */
171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res,
174 struct cb_process_state *cps);
175extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
176 struct cb_process_state *cps);
135#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 178extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
137extern void nfs_callback_down(int minorversion); 179extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, 180extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid); 181 const nfs4_stateid *stateid);
182extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
140#endif /* CONFIG_NFS_V4 */ 183#endif /* CONFIG_NFS_V4 */
141/* 184/*
142 * nfs41: Callbacks are expected to not cause substantial latency, 185 * nfs41: Callbacks are expected to not cause substantial latency,
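
The new RCA4_TYPE_MASK_* values are bit positions in the CB_RECALL_ANY type mask, and RCA4_TYPE_MASK_ALL is simply every defined position set: bits 0-4, 8-9 and 12-15. A quick check of the constant and of how a handler would test one class:

/* 0x001f (bits 0-4) | 0x0300 (bits 8-9) | 0xf000 (bits 12-15) == 0xf31f,
 * i.e. RCA4_TYPE_MASK_ALL as defined above. */

/* Testing one recall class in a received CB_RECALL_ANY mask: */
if (args->craa_type_mask & (1u << RCA4_TYPE_MASK_FILE_LAYOUT))
	handle_file_layout_recall();   /* hypothetical handler */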
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..89587573fe50 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
12#include "callback.h" 12#include "callback.h"
13#include "delegation.h" 13#include "delegation.h"
14#include "internal.h" 14#include "internal.h"
15#include "pnfs.h"
15 16
16#ifdef NFS_DEBUG 17#ifdef NFS_DEBUG
17#define NFSDBG_FACILITY NFSDBG_CALLBACK 18#define NFSDBG_FACILITY NFSDBG_CALLBACK
18#endif 19#endif
19 20
20__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 21__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
22 struct cb_getattrres *res,
23 struct cb_process_state *cps)
21{ 24{
22 struct nfs_client *clp;
23 struct nfs_delegation *delegation; 25 struct nfs_delegation *delegation;
24 struct nfs_inode *nfsi; 26 struct nfs_inode *nfsi;
25 struct inode *inode; 27 struct inode *inode;
26 28
29 res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
30 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
31 goto out;
32
27 res->bitmap[0] = res->bitmap[1] = 0; 33 res->bitmap[0] = res->bitmap[1] = 0;
28 res->status = htonl(NFS4ERR_BADHANDLE); 34 res->status = htonl(NFS4ERR_BADHANDLE);
29 clp = nfs_find_client(args->addr, 4);
30 if (clp == NULL)
31 goto out;
32 35
33 dprintk("NFS: GETATTR callback request from %s\n", 36 dprintk("NFS: GETATTR callback request from %s\n",
34 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
35 38
36 inode = nfs_delegation_find_inode(clp, &args->fh); 39 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
37 if (inode == NULL) 40 if (inode == NULL)
38 goto out_putclient; 41 goto out;
39 nfsi = NFS_I(inode); 42 nfsi = NFS_I(inode);
40 rcu_read_lock(); 43 rcu_read_lock();
41 delegation = rcu_dereference(nfsi->delegation); 44 delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
55out_iput: 58out_iput:
56 rcu_read_unlock(); 59 rcu_read_unlock();
57 iput(inode); 60 iput(inode);
58out_putclient:
59 nfs_put_client(clp);
60out: 61out:
61 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); 62 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
62 return res->status; 63 return res->status;
63} 64}
64 65
65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 66__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
67 struct cb_process_state *cps)
66{ 68{
67 struct nfs_client *clp;
68 struct inode *inode; 69 struct inode *inode;
69 __be32 res; 70 __be32 res;
70 71
71 res = htonl(NFS4ERR_BADHANDLE); 72 res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
72 clp = nfs_find_client(args->addr, 4); 73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
73 if (clp == NULL)
74 goto out; 74 goto out;
75 75
76 dprintk("NFS: RECALL callback request from %s\n", 76 dprintk("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 78
79 do { 79 res = htonl(NFS4ERR_BADHANDLE);
80 struct nfs_client *prev = clp; 80 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
81 81 if (inode == NULL)
82 inode = nfs_delegation_find_inode(clp, &args->fh); 82 goto out;
83 if (inode != NULL) { 83 /* Set up a helper thread to actually return the delegation */
84 /* Set up a helper thread to actually return the delegation */ 84 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { 85 case 0:
86 case 0: 86 res = 0;
87 res = 0; 87 break;
88 break; 88 case -ENOENT:
89 case -ENOENT: 89 if (res != 0)
90 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
91 res = htonl(NFS4ERR_BAD_STATEID); 91 break;
92 break; 92 default:
93 default: 93 res = htonl(NFS4ERR_RESOURCE);
94 res = htonl(NFS4ERR_RESOURCE); 94 }
95 } 95 iput(inode);
96 iput(inode);
97 }
98 clp = nfs_find_client_next(prev);
99 nfs_put_client(prev);
100 } while (clp != NULL);
101out: 96out:
102 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 97 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
103 return res; 98 return res;
@@ -113,16 +108,149 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
113 108
114#if defined(CONFIG_NFS_V4_1) 109#if defined(CONFIG_NFS_V4_1)
115 110
111static u32 initiate_file_draining(struct nfs_client *clp,
112 struct cb_layoutrecallargs *args)
113{
114 struct pnfs_layout_hdr *lo;
115 struct inode *ino;
116 bool found = false;
117 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
118 LIST_HEAD(free_me_list);
119
120 spin_lock(&clp->cl_lock);
121 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
122 if (nfs_compare_fh(&args->cbl_fh,
123 &NFS_I(lo->plh_inode)->fh))
124 continue;
125 ino = igrab(lo->plh_inode);
126 if (!ino)
127 continue;
128 found = true;
129 /* Without this, layout can be freed as soon
130 * as we release cl_lock.
131 */
132 get_layout_hdr(lo);
133 break;
134 }
135 spin_unlock(&clp->cl_lock);
136 if (!found)
137 return NFS4ERR_NOMATCHING_LAYOUT;
138
139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode))
143 rv = NFS4ERR_DELAY;
144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT;
146 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
147 spin_unlock(&ino->i_lock);
148 pnfs_free_lseg_list(&free_me_list);
149 put_layout_hdr(lo);
150 iput(ino);
151 return rv;
152}
153
154static u32 initiate_bulk_draining(struct nfs_client *clp,
155 struct cb_layoutrecallargs *args)
156{
157 struct pnfs_layout_hdr *lo;
158 struct inode *ino;
159 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
160 struct pnfs_layout_hdr *tmp;
161 LIST_HEAD(recall_list);
162 LIST_HEAD(free_me_list);
163 struct pnfs_layout_range range = {
164 .iomode = IOMODE_ANY,
165 .offset = 0,
166 .length = NFS4_MAX_UINT64,
167 };
168
169 spin_lock(&clp->cl_lock);
170 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
171 if ((args->cbl_recall_type == RETURN_FSID) &&
172 memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
173 &args->cbl_fsid, sizeof(struct nfs_fsid)))
174 continue;
175 if (!igrab(lo->plh_inode))
176 continue;
177 get_layout_hdr(lo);
178 BUG_ON(!list_empty(&lo->plh_bulk_recall));
179 list_add(&lo->plh_bulk_recall, &recall_list);
180 }
181 spin_unlock(&clp->cl_lock);
182 list_for_each_entry_safe(lo, tmp,
183 &recall_list, plh_bulk_recall) {
184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock);
191 put_layout_hdr(lo);
192 iput(ino);
193 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv;
196}
197
198static u32 do_callback_layoutrecall(struct nfs_client *clp,
199 struct cb_layoutrecallargs *args)
200{
201 u32 res = NFS4ERR_DELAY;
202
203 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
204 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
205 goto out;
206 if (args->cbl_recall_type == RETURN_FILE)
207 res = initiate_file_draining(clp, args);
208 else
209 res = initiate_bulk_draining(clp, args);
210 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
211out:
212 dprintk("%s returning %i\n", __func__, res);
213 return res;
214
215}
216
217__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
218 void *dummy, struct cb_process_state *cps)
219{
220 u32 res;
221
222 dprintk("%s: -->\n", __func__);
223
224 if (cps->clp)
225 res = do_callback_layoutrecall(cps->clp, args);
226 else
227 res = NFS4ERR_OP_NOT_IN_SESSION;
228
229 dprintk("%s: exit with status = %d\n", __func__, res);
230 return cpu_to_be32(res);
231}
232
233static void pnfs_recall_all_layouts(struct nfs_client *clp)
234{
235 struct cb_layoutrecallargs args;
236
237 /* Pretend we got a CB_LAYOUTRECALL(ALL) */
238 memset(&args, 0, sizeof(args));
239 args.cbl_recall_type = RETURN_ALL;
240 /* FIXME we ignore errors, what should we do? */
241 do_callback_layoutrecall(clp, &args);
242}
243
116int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
117{ 245{
118 if (delegation == NULL) 246 if (delegation == NULL)
119 return 0; 247 return 0;
120 248
121 /* seqid is 4-bytes long */ 249 if (stateid->stateid.seqid != 0)
122 if (((u32 *) &stateid->data)[0] != 0)
123 return 0; 250 return 0;
124 if (memcmp(&delegation->stateid.data[4], &stateid->data[4], 251 if (memcmp(&delegation->stateid.stateid.other,
125 sizeof(stateid->data)-4)) 252 &stateid->stateid.other,
253 NFS4_STATEID_OTHER_SIZE))
126 return 0; 254 return 0;
127 255
128 return 1; 256 return 1;
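The rewritten check reads naturally once the stateid layout is kept in mind: per RFC 5661 a stateid4 is a 4-byte seqid followed by 12 opaque bytes ("other"), and a v4.1 delegation stateid presented in a callback must carry seqid 0 and match the held delegation's "other" field. A standalone model of the comparison follows; the struct is a local stand-in, not the kernel's nfs4_stateid:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STATEID_OTHER_SIZE 12	/* RFC 5661: opaque other[12] */

struct stateid {
	uint32_t seqid;
	uint8_t other[STATEID_OTHER_SIZE];
};

static int validate_delegation_stateid(const struct stateid *held,
				       const struct stateid *presented)
{
	if (presented->seqid != 0)
		return 0;
	if (memcmp(held->other, presented->other, STATEID_OTHER_SIZE) != 0)
		return 0;
	return 1;
}

int main(void)
{
	struct stateid held = { 7, "delegation!" };	/* 11 chars + NUL = 12 bytes */
	struct stateid ok = { 0, "delegation!" };
	struct stateid bad = { 1, "delegation!" };

	printf("%d %d\n",
	       validate_delegation_stateid(&held, &ok),	/* 1 */
	       validate_delegation_stateid(&held, &bad));	/* 0 */
	return 0;
}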
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
185} 313}
186 314
187/* 315/*
188 * Returns a pointer to a held 'struct nfs_client' that matches the server's
189 * address, major version number, and session ID. It is the caller's
190 * responsibility to release the returned reference.
191 *
192 * Returns NULL if there are no connections with sessions, or if no session
193 * matches the one of interest.
194 */
195 static struct nfs_client *find_client_with_session(
196 const struct sockaddr *addr, u32 nfsversion,
197 struct nfs4_sessionid *sessionid)
198{
199 struct nfs_client *clp;
200
201 clp = nfs_find_client(addr, 4);
202 if (clp == NULL)
203 return NULL;
204
205 do {
206 struct nfs_client *prev = clp;
207
208 if (clp->cl_session != NULL) {
209 if (memcmp(clp->cl_session->sess_id.data,
210 sessionid->data,
211 NFS4_MAX_SESSIONID_LEN) == 0) {
212 /* Returns a held reference to clp */
213 return clp;
214 }
215 }
216 clp = nfs_find_client_next(prev);
217 nfs_put_client(prev);
218 } while (clp != NULL);
219
220 return NULL;
221}
222
223/*
224 * For each referring call triple, check the session's slot table for 316 * For each referring call triple, check the session's slot table for
225 * a match. If the slot is in use and the sequence numbers match, the 317 * a match. If the slot is in use and the sequence numbers match, the
226 * client is still waiting for a response to the original request. 318 * client is still waiting for a response to the original request.
@@ -276,20 +368,28 @@ out:
276} 368}
277 369
278__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, 370__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
279 struct cb_sequenceres *res) 371 struct cb_sequenceres *res,
372 struct cb_process_state *cps)
280{ 373{
281 struct nfs_client *clp; 374 struct nfs_client *clp;
282 int i; 375 int i;
283 __be32 status; 376 __be32 status = htonl(NFS4ERR_BADSESSION);
284 377
285 status = htonl(NFS4ERR_BADSESSION); 378 cps->clp = NULL;
286 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 379
380 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
287 if (clp == NULL) 381 if (clp == NULL)
288 goto out; 382 goto out;
289 383
384 /* state manager is resetting the session */
385 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
386 status = NFS4ERR_DELAY;
387 goto out;
388 }
389
290 status = validate_seqid(&clp->cl_session->bc_slot_table, args); 390 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
291 if (status) 391 if (status)
292 goto out_putclient; 392 goto out;
293 393
294 /* 394 /*
295 * Check for pending referring calls. If a match is found, a 395 * Check for pending referring calls. If a match is found, a
@@ -298,7 +398,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
298 */ 398 */
299 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { 399 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
300 status = htonl(NFS4ERR_DELAY); 400 status = htonl(NFS4ERR_DELAY);
301 goto out_putclient; 401 goto out;
302 } 402 }
303 403
304 memcpy(&res->csr_sessionid, &args->csa_sessionid, 404 memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +407,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
307 res->csr_slotid = args->csa_slotid; 407 res->csr_slotid = args->csa_slotid;
308 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 408 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
309 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 409 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
410 nfs4_cb_take_slot(clp);
310 411
311out_putclient:
312 nfs_put_client(clp);
313out: 412out:
413 cps->clp = clp; /* put in nfs4_callback_compound */
314 for (i = 0; i < args->csa_nrclists; i++) 414 for (i = 0; i < args->csa_nrclists; i++)
315 kfree(args->csa_rclists[i].rcl_refcalls); 415 kfree(args->csa_rclists[i].rcl_refcalls);
316 kfree(args->csa_rclists); 416 kfree(args->csa_rclists);
317 417
318 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) 418 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
319 res->csr_status = 0; 419 cps->drc_status = status;
320 else 420 status = 0;
421 } else
321 res->csr_status = status; 422 res->csr_status = status;
423
322 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, 424 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
323 ntohl(status), ntohl(res->csr_status)); 425 ntohl(status), ntohl(res->csr_status));
324 return status; 426 return status;
325} 427}
326 428
327__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 429static bool
430validate_bitmap_values(unsigned long mask)
431{
432 return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
433}
434
435__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
436 struct cb_process_state *cps)
328{ 437{
329 struct nfs_client *clp;
330 __be32 status; 438 __be32 status;
331 fmode_t flags = 0; 439 fmode_t flags = 0;
332 440
333 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 441 status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
334 clp = nfs_find_client(args->craa_addr, 4); 442 if (!cps->clp) /* set in cb_sequence */
335 if (clp == NULL)
336 goto out; 443 goto out;
337 444
338 dprintk("NFS: RECALL_ANY callback request from %s\n", 445 dprintk("NFS: RECALL_ANY callback request from %s\n",
339 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 446 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
447
448 status = cpu_to_be32(NFS4ERR_INVAL);
449 if (!validate_bitmap_values(args->craa_type_mask))
450 goto out;
340 451
452 status = cpu_to_be32(NFS4_OK);
341 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) 453 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
342 &args->craa_type_mask)) 454 &args->craa_type_mask))
343 flags = FMODE_READ; 455 flags = FMODE_READ;
344 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) 456 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
345 &args->craa_type_mask)) 457 &args->craa_type_mask))
346 flags |= FMODE_WRITE; 458 flags |= FMODE_WRITE;
347 459 if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
460 &args->craa_type_mask))
461 pnfs_recall_all_layouts(cps->clp);
348 if (flags) 462 if (flags)
349 nfs_expire_all_delegation_types(clp, flags); 463 nfs_expire_all_delegation_types(cps->clp, flags);
350 status = htonl(NFS4_OK);
351out: 464out:
352 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 465 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
353 return status; 466 return status;
354} 467}
355 468
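Taken together, validate_bitmap_values() and the flag mapping above implement a simple contract: reject any CB_RECALL_ANY mask containing bits we do not understand, then translate the known bits into delegation and layout recalls. The sketch below models that flow; bit positions follow RFC 5661 (RDATA_DLG = 0, WDATA_DLG = 1, FILE_LAYOUT = 3), and the RCA4_TYPE_MASK_ALL shown is a simplified stand-in covering only these three bits:

#include <stdio.h>

#define RCA4_TYPE_MASK_RDATA_DLG	(1u << 0)
#define RCA4_TYPE_MASK_WDATA_DLG	(1u << 1)
#define RCA4_TYPE_MASK_FILE_LAYOUT	(1u << 3)
#define RCA4_TYPE_MASK_ALL \
	(RCA4_TYPE_MASK_RDATA_DLG | RCA4_TYPE_MASK_WDATA_DLG | \
	 RCA4_TYPE_MASK_FILE_LAYOUT)

#define FMODE_READ	1u
#define FMODE_WRITE	2u

static int recall_any(unsigned int mask)
{
	unsigned int flags = 0;

	if (mask & ~RCA4_TYPE_MASK_ALL)
		return -1;			/* NFS4ERR_INVAL */
	if (mask & RCA4_TYPE_MASK_RDATA_DLG)
		flags |= FMODE_READ;
	if (mask & RCA4_TYPE_MASK_WDATA_DLG)
		flags |= FMODE_WRITE;
	if (mask & RCA4_TYPE_MASK_FILE_LAYOUT)
		puts("recall all layouts");
	if (flags)
		printf("expire delegations, fmode 0x%x\n", flags);
	return 0;				/* NFS4_OK */
}

int main(void)
{
	recall_any(RCA4_TYPE_MASK_WDATA_DLG | RCA4_TYPE_MASK_FILE_LAYOUT);
	printf("unknown bit -> %d\n", recall_any(1u << 30));
	return 0;
}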
356/* Reduce the fore channel's max_slots to the target value */ 469/* Reduce the fore channel's max_slots to the target value */
357__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) 470__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
471 struct cb_process_state *cps)
358{ 472{
359 struct nfs_client *clp;
360 struct nfs4_slot_table *fc_tbl; 473 struct nfs4_slot_table *fc_tbl;
361 __be32 status; 474 __be32 status;
362 475
363 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 476 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
364 clp = nfs_find_client(args->crsa_addr, 4); 477 if (!cps->clp) /* set in cb_sequence */
365 if (clp == NULL)
366 goto out; 478 goto out;
367 479
368 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 480 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
369 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), 481 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
370 args->crsa_target_max_slots); 482 args->crsa_target_max_slots);
371 483
372 fc_tbl = &clp->cl_session->fc_slot_table; 484 fc_tbl = &cps->clp->cl_session->fc_slot_table;
373 485
374 status = htonl(NFS4ERR_BAD_HIGH_SLOT); 486 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
375 if (args->crsa_target_max_slots > fc_tbl->max_slots || 487 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
376 args->crsa_target_max_slots < 1) 488 args->crsa_target_max_slots < 1)
377 goto out_putclient; 489 goto out;
378 490
379 status = htonl(NFS4_OK); 491 status = htonl(NFS4_OK);
380 if (args->crsa_target_max_slots == fc_tbl->max_slots) 492 if (args->crsa_target_max_slots == fc_tbl->max_slots)
381 goto out_putclient; 493 goto out;
382 494
383 fc_tbl->target_max_slots = args->crsa_target_max_slots; 495 fc_tbl->target_max_slots = args->crsa_target_max_slots;
384 nfs41_handle_recall_slot(clp); 496 nfs41_handle_recall_slot(cps->clp);
385out_putclient:
386 nfs_put_client(clp); /* balance nfs_find_client */
387out: 497out:
388 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 498 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
389 return status; 499 return status;
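The bounds logic in nfs4_callback_recallslot() reduces to a small pure function: the requested target must lie in [1, max_slots], a request equal to the current table size is a no-op success, and anything else records the new target for the state manager to act on. A standalone model, with integer statuses standing in for the NFS4ERR_* codes:

#include <stdio.h>

enum { OK, BAD_HIGH_SLOT };

static int recall_slot(unsigned int *target_max, unsigned int max_slots,
		       unsigned int requested)
{
	if (requested > max_slots || requested < 1)
		return BAD_HIGH_SLOT;
	if (requested != max_slots)
		*target_max = requested;	/* the state manager shrinks later */
	return OK;
}

int main(void)
{
	unsigned int target = 0;

	printf("%d\n", recall_slot(&target, 16, 8));	/* OK, target = 8 */
	printf("%d\n", recall_slot(&target, 16, 0));	/* BAD_HIGH_SLOT */
	printf("target = %u\n", target);
	return 0;
}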
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0edf..14e0f9371d14 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h>
13#include "nfs4_fs.h" 14#include "nfs4_fs.h"
14#include "callback.h" 15#include "callback.h"
16#include "internal.h"
15 17
16#define CB_OP_TAGLEN_MAXSZ (512) 18#define CB_OP_TAGLEN_MAXSZ (512)
17#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 19#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
22#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 24#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
23 25
24#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
26 4 + 1 + 3) 29 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
33/* Internal error code */ 36/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050 37#define NFS4ERR_RESOURCE_HDR 11050
35 38
36typedef __be32 (*callback_process_op_t)(void *, void *); 39typedef __be32 (*callback_process_op_t)(void *, void *,
40 struct cb_process_state *);
37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 41typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 42typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
39 43
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
160 hdr->minorversion = ntohl(*p++); 164 hdr->minorversion = ntohl(*p++);
161 /* Check minor version is zero or one. */ 165 /* Check minor version is zero or one. */
162 if (hdr->minorversion <= 1) { 166 if (hdr->minorversion <= 1) {
163 p++; /* skip callback_ident */ 167 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
164 } else { 168 } else {
165 printk(KERN_WARNING "%s: NFSv4 server callback with " 169 printk(KERN_WARNING "%s: NFSv4 server callback with "
166 "illegal minor version %u!\n", 170 "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
220 224
221#if defined(CONFIG_NFS_V4_1) 225#if defined(CONFIG_NFS_V4_1)
222 226
227static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
228 struct xdr_stream *xdr,
229 struct cb_layoutrecallargs *args)
230{
231 __be32 *p;
232 __be32 status = 0;
233 uint32_t iomode;
234
235 args->cbl_addr = svc_addr(rqstp);
236 p = read_buf(xdr, 4 * sizeof(uint32_t));
237 if (unlikely(p == NULL)) {
238 status = htonl(NFS4ERR_BADXDR);
239 goto out;
240 }
241
242 args->cbl_layout_type = ntohl(*p++);
 243 /* Despite the spec's xdr, iomode really belongs in the FILE switch,
 244 * as it is unusable and ignored with the other types.
245 */
246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++);
248 args->cbl_recall_type = ntohl(*p++);
249
250 if (args->cbl_recall_type == RETURN_FILE) {
251 args->cbl_range.iomode = iomode;
252 status = decode_fh(xdr, &args->cbl_fh);
253 if (unlikely(status != 0))
254 goto out;
255
256 p = read_buf(xdr, 2 * sizeof(uint64_t));
257 if (unlikely(p == NULL)) {
258 status = htonl(NFS4ERR_BADXDR);
259 goto out;
260 }
261 p = xdr_decode_hyper(p, &args->cbl_range.offset);
262 p = xdr_decode_hyper(p, &args->cbl_range.length);
263 status = decode_stateid(xdr, &args->cbl_stateid);
264 if (unlikely(status != 0))
265 goto out;
266 } else if (args->cbl_recall_type == RETURN_FSID) {
267 p = read_buf(xdr, 2 * sizeof(uint64_t));
268 if (unlikely(p == NULL)) {
269 status = htonl(NFS4ERR_BADXDR);
270 goto out;
271 }
272 p = xdr_decode_hyper(p, &args->cbl_fsid.major);
273 p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
274 } else if (args->cbl_recall_type != RETURN_ALL) {
275 status = htonl(NFS4ERR_BADXDR);
276 goto out;
277 }
278 dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
279 __func__,
280 args->cbl_layout_type, iomode,
281 args->cbl_layoutchanged, args->cbl_recall_type);
282out:
283 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
284 return status;
285}
286
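decode_layoutrecall_args() pulls fixed-size chunks from the XDR stream with read_buf() and decodes the 64-bit offset/length pair with xdr_decode_hyper(). XDR encodes a hyper as eight big-endian bytes; the standalone decoder below shows the byte handling that helper performs:

#include <stdint.h>
#include <stdio.h>

static const uint8_t *decode_hyper(const uint8_t *p, uint64_t *out)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v = (v << 8) | p[i];	/* network byte order: MSB first */
	*out = v;
	return p + 8;
}

int main(void)
{
	/* offset = 0x1000, length = 2 on the wire */
	const uint8_t wire[16] = {
		0, 0, 0, 0, 0, 0, 0x10, 0x00,
		0, 0, 0, 0, 0, 0, 0, 2,
	};
	const uint8_t *p = wire;
	uint64_t offset, length;

	p = decode_hyper(p, &offset);
	p = decode_hyper(p, &length);
	printf("offset=0x%llx length=%llu\n",
	       (unsigned long long)offset, (unsigned long long)length);
	return 0;
}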
223static __be32 decode_sessionid(struct xdr_stream *xdr, 287static __be32 decode_sessionid(struct xdr_stream *xdr,
224 struct nfs4_sessionid *sid) 288 struct nfs4_sessionid *sid)
225{ 289{
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
574 case OP_CB_SEQUENCE: 638 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY: 639 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT: 640 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL:
577 *op = &callback_ops[op_nr]; 642 *op = &callback_ops[op_nr];
578 break; 643 break;
579 644
580 case OP_CB_LAYOUTRECALL:
581 case OP_CB_NOTIFY_DEVICEID: 645 case OP_CB_NOTIFY_DEVICEID:
582 case OP_CB_NOTIFY: 646 case OP_CB_NOTIFY:
583 case OP_CB_PUSH_DELEG: 647 case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
593 return htonl(NFS_OK); 657 return htonl(NFS_OK);
594} 658}
595 659
660static void nfs4_callback_free_slot(struct nfs4_session *session)
661{
662 struct nfs4_slot_table *tbl = &session->bc_slot_table;
663
664 spin_lock(&tbl->slot_tbl_lock);
665 /*
 666 * Let the state manager know callback processing is done.
667 * A single slot, so highest used slotid is either 0 or -1
668 */
669 tbl->highest_used_slotid--;
670 nfs4_check_drain_bc_complete(session);
671 spin_unlock(&tbl->slot_tbl_lock);
672}
673
674static void nfs4_cb_free_slot(struct nfs_client *clp)
675{
676 if (clp && clp->cl_session)
677 nfs4_callback_free_slot(clp->cl_session);
678}
679
680/* A single slot, so highest used slotid is either 0 or -1 */
681void nfs4_cb_take_slot(struct nfs_client *clp)
682{
683 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
684
685 spin_lock(&tbl->slot_tbl_lock);
686 tbl->highest_used_slotid++;
687 BUG_ON(tbl->highest_used_slotid != 0);
688 spin_unlock(&tbl->slot_tbl_lock);
689}
690
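Because the backchannel slot table has exactly one slot, highest_used_slotid only ever toggles between -1 (free) and 0 (busy); nfs4_cb_take_slot() and nfs4_callback_free_slot() move it one step each way under slot_tbl_lock, and the BUG_ON() asserts that invariant. A tiny single-threaded model of the pair, with asserts in place of the lock and BUG_ON:

#include <assert.h>
#include <stdio.h>

static int highest_used_slotid = -1;	/* -1 means the slot is free */

static void cb_take_slot(void)
{
	highest_used_slotid++;
	assert(highest_used_slotid == 0);	/* single slot only */
}

static void cb_free_slot(void)
{
	highest_used_slotid--;
	assert(highest_used_slotid == -1);
	/* this is where nfs4_check_drain_bc_complete() would run */
}

int main(void)
{
	cb_take_slot();		/* CB_SEQUENCE accepted the request */
	puts("processing callback compound");
	cb_free_slot();		/* end of nfs4_callback_compound() */
	return 0;
}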
596#else /* CONFIG_NFS_V4_1 */ 691#else /* CONFIG_NFS_V4_1 */
597 692
598static __be32 693static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
601 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 696 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
602} 697}
603 698
699static void nfs4_cb_free_slot(struct nfs_client *clp)
700{
701}
604#endif /* CONFIG_NFS_V4_1 */ 702#endif /* CONFIG_NFS_V4_1 */
605 703
606static __be32 704static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
621static __be32 process_op(uint32_t minorversion, int nop, 719static __be32 process_op(uint32_t minorversion, int nop,
622 struct svc_rqst *rqstp, 720 struct svc_rqst *rqstp,
623 struct xdr_stream *xdr_in, void *argp, 721 struct xdr_stream *xdr_in, void *argp,
624 struct xdr_stream *xdr_out, void *resp, int* drc_status) 722 struct xdr_stream *xdr_out, void *resp,
723 struct cb_process_state *cps)
625{ 724{
626 struct callback_op *op = &callback_ops[0]; 725 struct callback_op *op = &callback_ops[0];
627 unsigned int op_nr; 726 unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
644 if (status) 743 if (status)
645 goto encode_hdr; 744 goto encode_hdr;
646 745
647 if (*drc_status) { 746 if (cps->drc_status) {
648 status = *drc_status; 747 status = cps->drc_status;
649 goto encode_hdr; 748 goto encode_hdr;
650 } 749 }
651 750
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
653 if (maxlen > 0 && maxlen < PAGE_SIZE) { 752 if (maxlen > 0 && maxlen < PAGE_SIZE) {
654 status = op->decode_args(rqstp, xdr_in, argp); 753 status = op->decode_args(rqstp, xdr_in, argp);
655 if (likely(status == 0)) 754 if (likely(status == 0))
656 status = op->process_op(argp, resp); 755 status = op->process_op(argp, resp, cps);
657 } else 756 } else
658 status = htonl(NFS4ERR_RESOURCE); 757 status = htonl(NFS4ERR_RESOURCE);
659 758
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr: 759encode_hdr:
667 res = encode_op_hdr(xdr_out, op_nr, status); 760 res = encode_op_hdr(xdr_out, op_nr, status);
668 if (unlikely(res)) 761 if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
681 struct cb_compound_hdr_arg hdr_arg = { 0 }; 774 struct cb_compound_hdr_arg hdr_arg = { 0 };
682 struct cb_compound_hdr_res hdr_res = { NULL }; 775 struct cb_compound_hdr_res hdr_res = { NULL };
683 struct xdr_stream xdr_in, xdr_out; 776 struct xdr_stream xdr_in, xdr_out;
684 __be32 *p; 777 __be32 *p, status;
685 __be32 status, drc_status = 0; 778 struct cb_process_state cps = {
779 .drc_status = 0,
780 .clp = NULL,
781 };
686 unsigned int nops = 0; 782 unsigned int nops = 0;
687 783
688 dprintk("%s: start\n", __func__); 784 dprintk("%s: start\n", __func__);
@@ -696,6 +792,12 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
696 if (status == __constant_htonl(NFS4ERR_RESOURCE)) 792 if (status == __constant_htonl(NFS4ERR_RESOURCE))
697 return rpc_garbage_args; 793 return rpc_garbage_args;
698 794
795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
798 return rpc_drop_reply;
799 }
800
699 hdr_res.taglen = hdr_arg.taglen; 801 hdr_res.taglen = hdr_arg.taglen;
700 hdr_res.tag = hdr_arg.tag; 802 hdr_res.tag = hdr_arg.tag;
701 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 803 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +805,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
703 805
704 while (status == 0 && nops != hdr_arg.nops) { 806 while (status == 0 && nops != hdr_arg.nops) {
705 status = process_op(hdr_arg.minorversion, nops, rqstp, 807 status = process_op(hdr_arg.minorversion, nops, rqstp,
706 &xdr_in, argp, &xdr_out, resp, &drc_status); 808 &xdr_in, argp, &xdr_out, resp, &cps);
707 nops++; 809 nops++;
708 } 810 }
709 811
@@ -716,6 +818,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
716 818
717 *hdr_res.status = status; 819 *hdr_res.status = status;
718 *hdr_res.nops = htonl(nops); 820 *hdr_res.nops = htonl(nops);
821 nfs4_cb_free_slot(cps.clp);
822 nfs_put_client(cps.clp);
719 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 823 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
720 return rpc_success; 824 return rpc_success;
721} 825}
@@ -739,6 +843,12 @@ static struct callback_op callback_ops[] = {
739 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 843 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
740 }, 844 },
741#if defined(CONFIG_NFS_V4_1) 845#if defined(CONFIG_NFS_V4_1)
846 [OP_CB_LAYOUTRECALL] = {
847 .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
848 .decode_args =
849 (callback_decode_arg_t)decode_layoutrecall_args,
850 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
851 },
742 [OP_CB_SEQUENCE] = { 852 [OP_CB_SEQUENCE] = {
743 .process_op = (callback_process_op_t)nfs4_callback_sequence, 853 .process_op = (callback_process_op_t)nfs4_callback_sequence,
744 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 854 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..bd3ca32879e7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_CLIENT 53#define NFSDBG_FACILITY NFSDBG_CLIENT
53 54
@@ -55,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
55static LIST_HEAD(nfs_client_list); 56static LIST_HEAD(nfs_client_list);
56static LIST_HEAD(nfs_volume_list); 57static LIST_HEAD(nfs_volume_list);
57static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61
62/*
63 * Get a unique NFSv4.0 callback identifier which will be used
64 * by the V4.0 callback service to lookup the nfs_client struct
65 */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{
68 int ret = 0;
69
70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret;
72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM;
75 spin_lock(&nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock);
78 if (ret == -EAGAIN)
79 goto retry;
80 return ret;
81}
82#endif /* CONFIG_NFS_V4 */
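nfs_get_cb_ident_idr() uses the two-step IDR API of this era (later kernels replaced it with idr_alloc()): idr_pre_get() preallocates outside the spinlock because it may sleep, idr_get_new() publishes under the lock, and -EAGAIN sends the caller back around if a racing allocator consumed the preallocation. A loose userspace analogue of the prepare-outside, commit-under-lock shape; all names here are invented:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define TABLE_SIZE 8

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *table[TABLE_SIZE];

/* The "idr_get_new" step: publish under the lock, fail if no slot. */
static int table_insert(void *obj)
{
	int i, id = -1;

	pthread_mutex_lock(&table_lock);
	for (i = 0; i < TABLE_SIZE; i++) {
		if (table[i] == NULL) {
			table[i] = obj;
			id = i;
			break;
		}
	}
	pthread_mutex_unlock(&table_lock);
	return id;
}

int main(void)
{
	struct client { int dummy; } *clp = malloc(sizeof(*clp));
	int id;

	/* The "idr_pre_get" step happens here, outside the lock,
	 * because allocation may block; on a lost race the kernel
	 * caller loops back and preallocates again (-EAGAIN). */
	id = table_insert(clp);
	if (id < 0)
		return 1;	/* table full; the kernel would retry */

	printf("assigned cb_ident %d\n", id);
	free(clp);
	return 0;
}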
58 83
59/* 84/*
60 * RPC cruft for NFS 85 * RPC cruft for NFS
@@ -143,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
143 clp->cl_proto = cl_init->proto; 168 clp->cl_proto = cl_init->proto;
144 169
145#ifdef CONFIG_NFS_V4 170#ifdef CONFIG_NFS_V4
146 INIT_LIST_HEAD(&clp->cl_delegations); 171 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
172 if (err)
173 goto error_cleanup;
174
147 spin_lock_init(&clp->cl_lock); 175 spin_lock_init(&clp->cl_lock);
148 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 176 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
149 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 177 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -155,7 +183,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
155 cred = rpc_lookup_machine_cred(); 183 cred = rpc_lookup_machine_cred();
156 if (!IS_ERR(cred)) 184 if (!IS_ERR(cred))
157 clp->cl_machine_cred = cred; 185 clp->cl_machine_cred = cred;
158 186#if defined(CONFIG_NFS_V4_1)
187 INIT_LIST_HEAD(&clp->cl_layouts);
188#endif
159 nfs_fscache_get_client_cookie(clp); 189 nfs_fscache_get_client_cookie(clp);
160 190
161 return clp; 191 return clp;
@@ -167,21 +197,17 @@ error_0:
167} 197}
168 198
169#ifdef CONFIG_NFS_V4 199#ifdef CONFIG_NFS_V4
170/*
171 * Clears/puts all minor version specific parts from an nfs_client struct
172 * reverting it to minorversion 0.
173 */
174static void nfs4_clear_client_minor_version(struct nfs_client *clp)
175{
176#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
177 if (nfs4_has_session(clp)) { 201static void nfs4_shutdown_session(struct nfs_client *clp)
202{
203 if (nfs4_has_session(clp))
178 nfs4_destroy_session(clp->cl_session); 204 nfs4_destroy_session(clp->cl_session);
179 clp->cl_session = NULL;
180 }
181
182 clp->cl_mvops = nfs_v4_minor_ops[0];
183#endif /* CONFIG_NFS_V4_1 */
184} 205}
206#else /* CONFIG_NFS_V4_1 */
207static void nfs4_shutdown_session(struct nfs_client *clp)
208{
209}
210#endif /* CONFIG_NFS_V4_1 */
185 211
186/* 212/*
187 * Destroy the NFS4 callback service 213 * Destroy the NFS4 callback service
@@ -196,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
196{ 222{
197 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 223 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
198 nfs4_kill_renewd(clp); 224 nfs4_kill_renewd(clp);
199 nfs4_clear_client_minor_version(clp); 225 nfs4_shutdown_session(clp);
200 nfs4_destroy_callback(clp); 226 nfs4_destroy_callback(clp);
201 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 227 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
202 nfs_idmap_delete(clp); 228 nfs_idmap_delete(clp);
203 229
204 rpc_destroy_wait_queue(&clp->cl_rpcwaitq); 230 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
205} 231}
232
 233/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
234void nfs_cleanup_cb_ident_idr(void)
235{
236 idr_destroy(&cb_ident_idr);
237}
238
239/* nfs_client_lock held */
240static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
241{
242 if (clp->cl_cb_ident)
243 idr_remove(&cb_ident_idr, clp->cl_cb_ident);
244}
245
246static void pnfs_init_server(struct nfs_server *server)
247{
248 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
249}
250
206#else 251#else
207static void nfs4_shutdown_client(struct nfs_client *clp) 252static void nfs4_shutdown_client(struct nfs_client *clp)
208{ 253{
209} 254}
255
256void nfs_cleanup_cb_ident_idr(void)
257{
258}
259
260static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
261{
262}
263
264static void pnfs_init_server(struct nfs_server *server)
265{
266}
267
210#endif /* CONFIG_NFS_V4 */ 268#endif /* CONFIG_NFS_V4 */
211 269
212/* 270/*
@@ -245,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
245 303
246 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 304 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
247 list_del(&clp->cl_share_link); 305 list_del(&clp->cl_share_link);
306 nfs_cb_idr_remove_locked(clp);
248 spin_unlock(&nfs_client_lock); 307 spin_unlock(&nfs_client_lock);
249 308
250 BUG_ON(!list_empty(&clp->cl_superblocks)); 309 BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -252,6 +311,7 @@ void nfs_put_client(struct nfs_client *clp)
252 nfs_free_client(clp); 311 nfs_free_client(clp);
253 } 312 }
254} 313}
314EXPORT_SYMBOL_GPL(nfs_put_client);
255 315
256#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 316#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
257/* 317/*
@@ -359,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
359 return 0; 419 return 0;
360} 420}
361 421
362/* 422/* Common match routine for v4.0 and v4.1 callback services */
363 * Find a client by IP address and protocol version 423bool
364 * - returns NULL if no such client 424nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
365 */ 425 u32 minorversion)
366struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
367{ 426{
368 struct nfs_client *clp; 427 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
369
370 spin_lock(&nfs_client_lock);
371 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
372 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
373 428
374 /* Don't match clients that failed to initialise properly */ 429 /* Don't match clients that failed to initialise */
375 if (!(clp->cl_cons_state == NFS_CS_READY || 430 if (!(clp->cl_cons_state == NFS_CS_READY ||
376 clp->cl_cons_state == NFS_CS_SESSION_INITING)) 431 clp->cl_cons_state == NFS_CS_SESSION_INITING))
377 continue; 432 return false;
378 433
379 /* Different NFS versions cannot share the same nfs_client */ 434 /* Match the version and minorversion */
380 if (clp->rpc_ops->version != nfsversion) 435 if (clp->rpc_ops->version != 4 ||
381 continue; 436 clp->cl_minorversion != minorversion)
437 return false;
382 438
383 /* Match only the IP address, not the port number */ 439 /* Match only the IP address, not the port number */
384 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 440 if (!nfs_sockaddr_match_ipaddr(addr, clap))
385 continue; 441 return false;
386 442
387 atomic_inc(&clp->cl_count); 443 return true;
388 spin_unlock(&nfs_client_lock);
389 return clp;
390 }
391 spin_unlock(&nfs_client_lock);
392 return NULL;
393}
394
395/*
396 * Find a client by IP address and protocol version
397 * - returns NULL if no such client
398 */
399struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
400{
401 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
402 u32 nfsvers = clp->rpc_ops->version;
403
404 spin_lock(&nfs_client_lock);
405 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
406 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
407
408 /* Don't match clients that failed to initialise properly */
409 if (clp->cl_cons_state != NFS_CS_READY)
410 continue;
411
412 /* Different NFS versions cannot share the same nfs_client */
413 if (clp->rpc_ops->version != nfsvers)
414 continue;
415
416 /* Match only the IP address, not the port number */
417 if (!nfs_sockaddr_match_ipaddr(sap, clap))
418 continue;
419
420 atomic_inc(&clp->cl_count);
421 spin_unlock(&nfs_client_lock);
422 return clp;
423 }
424 spin_unlock(&nfs_client_lock);
425 return NULL;
426} 444}
427 445
428/* 446/*
@@ -601,6 +619,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
601{ 619{
602 struct rpc_clnt *clnt = NULL; 620 struct rpc_clnt *clnt = NULL;
603 struct rpc_create_args args = { 621 struct rpc_create_args args = {
622 .net = &init_net,
604 .protocol = clp->cl_proto, 623 .protocol = clp->cl_proto,
605 .address = (struct sockaddr *)&clp->cl_addr, 624 .address = (struct sockaddr *)&clp->cl_addr,
606 .addrsize = clp->cl_addrlen, 625 .addrsize = clp->cl_addrlen,
@@ -635,7 +654,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
635 */ 654 */
636static void nfs_destroy_server(struct nfs_server *server) 655static void nfs_destroy_server(struct nfs_server *server)
637{ 656{
638 if (!(server->flags & NFS_MOUNT_NONLM)) 657 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
658 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
639 nlmclnt_done(server->nlm_host); 659 nlmclnt_done(server->nlm_host);
640} 660}
641 661
@@ -657,7 +677,8 @@ static int nfs_start_lockd(struct nfs_server *server)
657 677
658 if (nlm_init.nfs_version > 3) 678 if (nlm_init.nfs_version > 3)
659 return 0; 679 return 0;
660 if (server->flags & NFS_MOUNT_NONLM) 680 if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
681 (server->flags & NFS_MOUNT_LOCAL_FCNTL))
661 return 0; 682 return 0;
662 683
663 switch (clp->cl_proto) { 684 switch (clp->cl_proto) {
@@ -898,11 +919,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
898 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 919 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
899 server->wsize = NFS_MAX_FILE_IO_SIZE; 920 server->wsize = NFS_MAX_FILE_IO_SIZE;
900 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 921 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
922 set_pnfs_layoutdriver(server, fsinfo->layouttype);
923
901 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 924 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
902 925
903 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); 926 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
904 if (server->dtsize > PAGE_CACHE_SIZE) 927 if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
905 server->dtsize = PAGE_CACHE_SIZE; 928 server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
906 if (server->dtsize > server->rsize) 929 if (server->dtsize > server->rsize)
907 server->dtsize = server->rsize; 930 server->dtsize = server->rsize;
908 931
@@ -913,6 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
913 936
914 server->maxfilesize = fsinfo->maxfilesize; 937 server->maxfilesize = fsinfo->maxfilesize;
915 938
939 server->time_delta = fsinfo->time_delta;
940
 916 /* We're airborne: set socket buffer size */ 941 /* We're airborne: set socket buffer size */
917 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); 942 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
918} 943}
@@ -935,6 +960,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
935 } 960 }
936 961
937 fsinfo.fattr = fattr; 962 fsinfo.fattr = fattr;
963 fsinfo.layouttype = 0;
938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 964 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
939 if (error < 0) 965 if (error < 0)
940 goto out_error; 966 goto out_error;
@@ -976,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
976 target->options = source->options; 1002 target->options = source->options;
977} 1003}
978 1004
1005static void nfs_server_insert_lists(struct nfs_server *server)
1006{
1007 struct nfs_client *clp = server->nfs_client;
1008
1009 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list);
1012 spin_unlock(&nfs_client_lock);
1013
1014}
1015
1016static void nfs_server_remove_lists(struct nfs_server *server)
1017{
1018 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link);
1020 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock);
1022
1023 synchronize_rcu();
1024}
1025
979/* 1026/*
980 * Allocate and initialise a server record 1027 * Allocate and initialise a server record
981 */ 1028 */
@@ -992,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
992 /* Zero out the NFS state stuff */ 1039 /* Zero out the NFS state stuff */
993 INIT_LIST_HEAD(&server->client_link); 1040 INIT_LIST_HEAD(&server->client_link);
994 INIT_LIST_HEAD(&server->master_link); 1041 INIT_LIST_HEAD(&server->master_link);
1042 INIT_LIST_HEAD(&server->delegations);
995 1043
996 atomic_set(&server->active, 0); 1044 atomic_set(&server->active, 0);
997 1045
@@ -1007,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
1007 return NULL; 1055 return NULL;
1008 } 1056 }
1009 1057
1058 pnfs_init_server(server);
1059
1010 return server; 1060 return server;
1011} 1061}
1012 1062
@@ -1017,10 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
1017{ 1067{
1018 dprintk("--> nfs_free_server()\n"); 1068 dprintk("--> nfs_free_server()\n");
1019 1069
1020 spin_lock(&nfs_client_lock); 1070 nfs_server_remove_lists(server);
1021 list_del(&server->client_link); 1071 unset_pnfs_layoutdriver(server);
1022 list_del(&server->master_link);
1023 spin_unlock(&nfs_client_lock);
1024 1072
1025 if (server->destroy != NULL) 1073 if (server->destroy != NULL)
1026 server->destroy(server); 1074 server->destroy(server);
@@ -1095,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1095 (unsigned long long) server->fsid.major, 1143 (unsigned long long) server->fsid.major,
1096 (unsigned long long) server->fsid.minor); 1144 (unsigned long long) server->fsid.minor);
1097 1145
1098 spin_lock(&nfs_client_lock); 1146 nfs_server_insert_lists(server);
1099 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1100 list_add_tail(&server->master_link, &nfs_volume_list);
1101 spin_unlock(&nfs_client_lock);
1102
1103 server->mount_time = jiffies; 1147 server->mount_time = jiffies;
1104 nfs_free_fattr(fattr); 1148 nfs_free_fattr(fattr);
1105 return server; 1149 return server;
@@ -1112,6 +1156,96 @@ error:
1112 1156
1113#ifdef CONFIG_NFS_V4 1157#ifdef CONFIG_NFS_V4
1114/* 1158/*
1159 * NFSv4.0 callback thread helper
1160 *
1161 * Find a client by IP address, protocol version, and minorversion
1162 *
1163 * Called from the pg_authenticate method. The callback identifier
1164 * is not used as it has not been decoded.
1165 *
1166 * Returns NULL if no such client
1167 */
1168struct nfs_client *
1169nfs4_find_client_no_ident(const struct sockaddr *addr)
1170{
1171 struct nfs_client *clp;
1172
1173 spin_lock(&nfs_client_lock);
1174 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1175 if (nfs4_cb_match_client(addr, clp, 0) == false)
1176 continue;
1177 atomic_inc(&clp->cl_count);
1178 spin_unlock(&nfs_client_lock);
1179 return clp;
1180 }
1181 spin_unlock(&nfs_client_lock);
1182 return NULL;
1183}
1184
1185/*
1186 * NFSv4.0 callback thread helper
1187 *
1188 * Find a client by callback identifier
1189 */
1190struct nfs_client *
1191nfs4_find_client_ident(int cb_ident)
1192{
1193 struct nfs_client *clp;
1194
1195 spin_lock(&nfs_client_lock);
1196 clp = idr_find(&cb_ident_idr, cb_ident);
1197 if (clp)
1198 atomic_inc(&clp->cl_count);
1199 spin_unlock(&nfs_client_lock);
1200 return clp;
1201}
1202
1203#if defined(CONFIG_NFS_V4_1)
1204/*
1205 * NFSv4.1 callback thread helper
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID
1208 *
1209 * Returns NULL if no such client
1210 */
1211struct nfs_client *
1212nfs4_find_client_sessionid(const struct sockaddr *addr,
1213 struct nfs4_sessionid *sid)
1214{
1215 struct nfs_client *clp;
1216
1217 spin_lock(&nfs_client_lock);
1218 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1219 if (nfs4_cb_match_client(addr, clp, 1) == false)
1220 continue;
1221
1222 if (!nfs4_has_session(clp))
1223 continue;
1224
 1225 /* Match sessionid */
1226 if (memcmp(clp->cl_session->sess_id.data,
1227 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
1228 continue;
1229
1230 atomic_inc(&clp->cl_count);
1231 spin_unlock(&nfs_client_lock);
1232 return clp;
1233 }
1234 spin_unlock(&nfs_client_lock);
1235 return NULL;
1236}
1237
1238#else /* CONFIG_NFS_V4_1 */
1239
1240struct nfs_client *
1241nfs4_find_client_sessionid(const struct sockaddr *addr,
1242 struct nfs4_sessionid *sid)
1243{
1244 return NULL;
1245}
1246#endif /* CONFIG_NFS_V4_1 */
1247
1248/*
1115 * Initialize the NFS4 callback service 1249 * Initialize the NFS4 callback service
1116 */ 1250 */
1117static int nfs4_init_callback(struct nfs_client *clp) 1251static int nfs4_init_callback(struct nfs_client *clp)
@@ -1329,11 +1463,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1329 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1463 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1330 server->namelen = NFS4_MAXNAMLEN; 1464 server->namelen = NFS4_MAXNAMLEN;
1331 1465
1332 spin_lock(&nfs_client_lock); 1466 nfs_server_insert_lists(server);
1333 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1334 list_add_tail(&server->master_link, &nfs_volume_list);
1335 spin_unlock(&nfs_client_lock);
1336
1337 server->mount_time = jiffies; 1467 server->mount_time = jiffies;
1338out: 1468out:
1339 nfs_free_fattr(fattr); 1469 nfs_free_fattr(fattr);
@@ -1356,8 +1486,9 @@ static int nfs4_init_server(struct nfs_server *server,
1356 1486
1357 /* Initialise the client representation from the mount data */ 1487 /* Initialise the client representation from the mount data */
1358 server->flags = data->flags; 1488 server->flags = data->flags;
1359 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| 1489 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
1360 NFS_CAP_POSIX_LOCK; 1490 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
1491 server->caps |= NFS_CAP_READDIRPLUS;
1361 server->options = data->options; 1492 server->options = data->options;
1362 1493
1363 /* Get a client record */ 1494 /* Get a client record */
@@ -1537,11 +1668,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1537 if (error < 0) 1668 if (error < 0)
1538 goto out_free_server; 1669 goto out_free_server;
1539 1670
1540 spin_lock(&nfs_client_lock); 1671 nfs_server_insert_lists(server);
1541 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1542 list_add_tail(&server->master_link, &nfs_volume_list);
1543 spin_unlock(&nfs_client_lock);
1544
1545 server->mount_time = jiffies; 1672 server->mount_time = jiffies;
1546 1673
1547 nfs_free_fattr(fattr_fsinfo); 1674 nfs_free_fattr(fattr_fsinfo);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9c3c43cea1d..bbbc6bf5cb2e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/smp_lock.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16 15
17#include <linux/nfs4.h> 16#include <linux/nfs4.h>
@@ -24,8 +23,6 @@
24 23
25static void nfs_do_free_delegation(struct nfs_delegation *delegation) 24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
26{ 25{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
29 kfree(delegation); 26 kfree(delegation);
30} 27}
31 28
@@ -38,14 +35,30 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
38 35
39static void nfs_free_delegation(struct nfs_delegation *delegation) 36static void nfs_free_delegation(struct nfs_delegation *delegation)
40{ 37{
38 if (delegation->cred) {
39 put_rpccred(delegation->cred);
40 delegation->cred = NULL;
41 }
41 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 42 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
42} 43}
43 44
45/**
46 * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
47 * @delegation: delegation to process
48 *
49 */
44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 50void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
45{ 51{
46 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 52 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
47} 53}
48 54
55/**
56 * nfs_have_delegation - check if inode has a delegation
57 * @inode: inode to check
58 * @flags: delegation types to check for
59 *
60 * Returns one if inode has the indicated delegation, otherwise zero.
61 */
49int nfs_have_delegation(struct inode *inode, fmode_t flags) 62int nfs_have_delegation(struct inode *inode, fmode_t flags)
50{ 63{
51 struct nfs_delegation *delegation; 64 struct nfs_delegation *delegation;
@@ -71,20 +84,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
71 if (inode->i_flock == NULL) 84 if (inode->i_flock == NULL)
72 goto out; 85 goto out;
73 86
74 /* Protect inode->i_flock using the BKL */ 87 /* Protect inode->i_flock using the file locks lock */
75 lock_kernel(); 88 lock_flocks();
76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 89 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 90 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
78 continue; 91 continue;
79 if (nfs_file_open_context(fl->fl_file) != ctx) 92 if (nfs_file_open_context(fl->fl_file) != ctx)
80 continue; 93 continue;
81 unlock_kernel(); 94 unlock_flocks();
82 status = nfs4_lock_delegation_recall(state, fl); 95 status = nfs4_lock_delegation_recall(state, fl);
83 if (status < 0) 96 if (status < 0)
84 goto out; 97 goto out;
85 lock_kernel(); 98 lock_flocks();
86 } 99 }
87 unlock_kernel(); 100 unlock_flocks();
88out: 101out:
89 return status; 102 return status;
90} 103}
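The lock_kernel()-to-lock_flocks() conversion keeps the original shape of nfs_delegation_claim_locks(): the i_flock chain is guarded by a lock that cannot be held across the blocking nfs4_lock_delegation_recall() call, so the walk releases it around the RPC and retakes it before advancing. A userspace sketch of that unlock-around-the-slow-path pattern, with a fixed array standing in for the lock chain:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t flock_lock = PTHREAD_MUTEX_INITIALIZER;
static int locks[3] = { 10, 20, 30 };

static int recall_one(int lock_id)	/* stands in for the RPC call */
{
	printf("recalling lock %d (may block)\n", lock_id);
	return 0;
}

static int claim_locks(void)
{
	int status = 0;
	int i;

	pthread_mutex_lock(&flock_lock);
	for (i = 0; i < 3; i++) {
		int id = locks[i];

		pthread_mutex_unlock(&flock_lock);	/* recall_one() can block */
		status = recall_one(id);
		if (status < 0)
			return status;
		pthread_mutex_lock(&flock_lock);	/* retake before the next step */
	}
	pthread_mutex_unlock(&flock_lock);
	return status;
}

int main(void)
{
	return claim_locks();
}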
@@ -120,10 +133,15 @@ again:
120 return 0; 133 return 0;
121} 134}
122 135
123/* 136/**
124 * Set up a delegation on an inode 137 * nfs_inode_reclaim_delegation - process a delegation reclaim request
138 * @inode: inode to process
139 * @cred: credential to use for request
140 * @res: new delegation state from server
141 *
125 */ 142 */
126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 143void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
144 struct nfs_openres *res)
127{ 145{
128 struct nfs_delegation *delegation; 146 struct nfs_delegation *delegation;
129 struct rpc_cred *oldcred = NULL; 147 struct rpc_cred *oldcred = NULL;
@@ -176,38 +194,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
176 return inode; 194 return inode;
177} 195}
178 196
179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, 197static struct nfs_delegation *
180 const nfs4_stateid *stateid, 198nfs_detach_delegation_locked(struct nfs_inode *nfsi,
181 struct nfs_client *clp) 199 struct nfs_server *server)
182{ 200{
183 struct nfs_delegation *delegation = 201 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation, 202 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock)); 203 lockdep_is_held(&server->nfs_client->cl_lock));
186 204
187 if (delegation == NULL) 205 if (delegation == NULL)
188 goto nomatch; 206 goto nomatch;
207
189 spin_lock(&delegation->lock); 208 spin_lock(&delegation->lock);
190 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
191 sizeof(delegation->stateid.data)) != 0)
192 goto nomatch_unlock;
193 list_del_rcu(&delegation->super_list); 209 list_del_rcu(&delegation->super_list);
194 delegation->inode = NULL; 210 delegation->inode = NULL;
195 nfsi->delegation_state = 0; 211 nfsi->delegation_state = 0;
196 rcu_assign_pointer(nfsi->delegation, NULL); 212 rcu_assign_pointer(nfsi->delegation, NULL);
197 spin_unlock(&delegation->lock); 213 spin_unlock(&delegation->lock);
198 return delegation; 214 return delegation;
199nomatch_unlock:
200 spin_unlock(&delegation->lock);
201nomatch: 215nomatch:
202 return NULL; 216 return NULL;
203} 217}
204 218
205/* 219static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
206 * Set up a delegation on an inode 220 struct nfs_server *server)
221{
222 struct nfs_client *clp = server->nfs_client;
223 struct nfs_delegation *delegation;
224
225 spin_lock(&clp->cl_lock);
226 delegation = nfs_detach_delegation_locked(nfsi, server);
227 spin_unlock(&clp->cl_lock);
228 return delegation;
229}
230
231/**
232 * nfs_inode_set_delegation - set up a delegation on an inode
233 * @inode: inode to which delegation applies
234 * @cred: cred to use for subsequent delegation processing
235 * @res: new delegation state from server
236 *
237 * Returns zero on success, or a negative errno value.
207 */ 238 */
208int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 239int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
209{ 240{
210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 241 struct nfs_server *server = NFS_SERVER(inode);
242 struct nfs_client *clp = server->nfs_client;
211 struct nfs_inode *nfsi = NFS_I(inode); 243 struct nfs_inode *nfsi = NFS_I(inode);
212 struct nfs_delegation *delegation, *old_delegation; 244 struct nfs_delegation *delegation, *old_delegation;
213 struct nfs_delegation *freeme = NULL; 245 struct nfs_delegation *freeme = NULL;
@@ -228,7 +260,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
228 260
229 spin_lock(&clp->cl_lock); 261 spin_lock(&clp->cl_lock);
230 old_delegation = rcu_dereference_protected(nfsi->delegation, 262 old_delegation = rcu_dereference_protected(nfsi->delegation,
231 lockdep_is_held(&clp->cl_lock)); 263 lockdep_is_held(&clp->cl_lock));
232 if (old_delegation != NULL) { 264 if (old_delegation != NULL) {
233 if (memcmp(&delegation->stateid, &old_delegation->stateid, 265 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 && 266 sizeof(old_delegation->stateid)) == 0 &&
@@ -247,9 +279,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
247 delegation = NULL; 279 delegation = NULL;
248 goto out; 280 goto out;
249 } 281 }
250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); 282 freeme = nfs_detach_delegation_locked(nfsi, server);
251 } 283 }
252 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 284 list_add_rcu(&delegation->super_list, &server->delegations);
253 nfsi->delegation_state = delegation->type; 285 nfsi->delegation_state = delegation->type;
254 rcu_assign_pointer(nfsi->delegation, delegation); 286 rcu_assign_pointer(nfsi->delegation, delegation);
255 delegation = NULL; 287 delegation = NULL;
@@ -291,73 +323,85 @@ out:
291 return err; 323 return err;
292} 324}
293 325
294/* 326/**
295 * Return all delegations that have been marked for return 327 * nfs_client_return_marked_delegations - return previously marked delegations
328 * @clp: nfs_client to process
329 *
330 * Returns zero on success, or a negative errno value.
296 */ 331 */
297int nfs_client_return_marked_delegations(struct nfs_client *clp) 332int nfs_client_return_marked_delegations(struct nfs_client *clp)
298{ 333{
299 struct nfs_delegation *delegation; 334 struct nfs_delegation *delegation;
335 struct nfs_server *server;
300 struct inode *inode; 336 struct inode *inode;
301 int err = 0; 337 int err = 0;
302 338
303restart: 339restart:
304 rcu_read_lock(); 340 rcu_read_lock();
305 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 341 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
306 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) 342 list_for_each_entry_rcu(delegation, &server->delegations,
307 continue; 343 super_list) {
308 inode = nfs_delegation_grab_inode(delegation); 344 if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
309 if (inode == NULL) 345 &delegation->flags))
310 continue; 346 continue;
311 spin_lock(&clp->cl_lock); 347 inode = nfs_delegation_grab_inode(delegation);
312 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 348 if (inode == NULL)
313 spin_unlock(&clp->cl_lock); 349 continue;
314 rcu_read_unlock(); 350 delegation = nfs_detach_delegation(NFS_I(inode),
315 if (delegation != NULL) { 351 server);
316 filemap_flush(inode->i_mapping); 352 rcu_read_unlock();
317 err = __nfs_inode_return_delegation(inode, delegation, 0); 353
354 if (delegation != NULL) {
355 filemap_flush(inode->i_mapping);
356 err = __nfs_inode_return_delegation(inode,
357 delegation, 0);
358 }
359 iput(inode);
360 if (!err)
361 goto restart;
362 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
363 return err;
318 } 364 }
319 iput(inode);
320 if (!err)
321 goto restart;
322 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
323 return err;
324 } 365 }
325 rcu_read_unlock(); 366 rcu_read_unlock();
326 return 0; 367 return 0;
327} 368}
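The rewritten nfs_client_return_marked_delegations() iterates the per-server delegation lists under rcu_read_lock(), but the actual return can block, so after detaching one delegation it drops the RCU read lock and restarts the whole walk; forward progress is guaranteed because NFS_DELEGATION_RETURN is test-and-cleared before processing. The goto-restart idiom in miniature:

#include <stdio.h>

#define N 4

static int marked[N] = { 0, 1, 0, 1 };

static void process(int i)	/* stands in for the blocking return */
{
	printf("returning delegation %d\n", i);
}

int main(void)
{
	int i;

restart:
	for (i = 0; i < N; i++) {
		if (!marked[i])
			continue;
		marked[i] = 0;	/* test-and-clear: won't be seen again */
		/* in the kernel, rcu_read_unlock() happens here */
		process(i);
		goto restart;	/* the cursor is stale after blocking */
	}
	return 0;
}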
328 369
329/* 370/**
330 * This function returns the delegation without reclaiming opens 371 * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
331 * or protecting against delegation reclaims. 372 * @inode: inode to process
332 * It is therefore really only safe to be called from 373 *
333 * nfs4_clear_inode() 374 * Does not protect against delegation reclaims, therefore really only safe
375 * to be called from nfs4_clear_inode().
334 */ 376 */
335void nfs_inode_return_delegation_noreclaim(struct inode *inode) 377void nfs_inode_return_delegation_noreclaim(struct inode *inode)
336{ 378{
337 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 379 struct nfs_server *server = NFS_SERVER(inode);
338 struct nfs_inode *nfsi = NFS_I(inode); 380 struct nfs_inode *nfsi = NFS_I(inode);
339 struct nfs_delegation *delegation; 381 struct nfs_delegation *delegation;
340 382
341 if (rcu_access_pointer(nfsi->delegation) != NULL) { 383 if (rcu_access_pointer(nfsi->delegation) != NULL) {
342 spin_lock(&clp->cl_lock); 384 delegation = nfs_detach_delegation(nfsi, server);
343 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
344 spin_unlock(&clp->cl_lock);
345 if (delegation != NULL) 385 if (delegation != NULL)
346 nfs_do_return_delegation(inode, delegation, 0); 386 nfs_do_return_delegation(inode, delegation, 0);
347 } 387 }
348} 388}
349 389
390/**
391 * nfs_inode_return_delegation - synchronously return a delegation
392 * @inode: inode to process
393 *
394 * Returns zero on success, or a negative errno value.
395 */
350int nfs_inode_return_delegation(struct inode *inode) 396int nfs_inode_return_delegation(struct inode *inode)
351{ 397{
352 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 398 struct nfs_server *server = NFS_SERVER(inode);
353 struct nfs_inode *nfsi = NFS_I(inode); 399 struct nfs_inode *nfsi = NFS_I(inode);
354 struct nfs_delegation *delegation; 400 struct nfs_delegation *delegation;
355 int err = 0; 401 int err = 0;
356 402
357 if (rcu_access_pointer(nfsi->delegation) != NULL) { 403 if (rcu_access_pointer(nfsi->delegation) != NULL) {
358 spin_lock(&clp->cl_lock); 404 delegation = nfs_detach_delegation(nfsi, server);
359 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
360 spin_unlock(&clp->cl_lock);
361 if (delegation != NULL) { 405 if (delegation != NULL) {
362 nfs_wb_all(inode); 406 nfs_wb_all(inode);
363 err = __nfs_inode_return_delegation(inode, delegation, 1); 407 err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -366,46 +410,61 @@ int nfs_inode_return_delegation(struct inode *inode)
366 return err; 410 return err;
367} 411}
368 412
369static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) 413static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
370{ 414{
415 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
416
371 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 417 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
372 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 418 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
373} 419}
374 420
375/* 421/**
376 * Return all delegations associated to a super block 422 * nfs_super_return_all_delegations - return delegations for one superblock
423 * @sb: sb to process
424 *
377 */ 425 */
378void nfs_super_return_all_delegations(struct super_block *sb) 426void nfs_super_return_all_delegations(struct super_block *sb)
379{ 427{
380 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 428 struct nfs_server *server = NFS_SB(sb);
429 struct nfs_client *clp = server->nfs_client;
381 struct nfs_delegation *delegation; 430 struct nfs_delegation *delegation;
382 431
383 if (clp == NULL) 432 if (clp == NULL)
384 return; 433 return;
434
385 rcu_read_lock(); 435 rcu_read_lock();
386 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 436 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
387 spin_lock(&delegation->lock); 437 spin_lock(&delegation->lock);
388 if (delegation->inode != NULL && delegation->inode->i_sb == sb) 438 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
389 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
390 spin_unlock(&delegation->lock); 439 spin_unlock(&delegation->lock);
391 } 440 }
392 rcu_read_unlock(); 441 rcu_read_unlock();
442
393 if (nfs_client_return_marked_delegations(clp) != 0) 443 if (nfs_client_return_marked_delegations(clp) != 0)
394 nfs4_schedule_state_manager(clp); 444 nfs4_schedule_state_manager(clp);
395} 445}
396 446
397static 447static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
398void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) 448 fmode_t flags)
399{ 449{
400 struct nfs_delegation *delegation; 450 struct nfs_delegation *delegation;
401 451
402 rcu_read_lock(); 452 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
403 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
404 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 453 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
405 continue; 454 continue;
406 if (delegation->type & flags) 455 if (delegation->type & flags)
407 nfs_mark_return_delegation(clp, delegation); 456 nfs_mark_return_delegation(delegation);
408 } 457 }
458}
459
460static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
461 fmode_t flags)
462{
463 struct nfs_server *server;
464
465 rcu_read_lock();
466 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
467 nfs_mark_return_all_delegation_types(server, flags);
409 rcu_read_unlock(); 468 rcu_read_unlock();
410} 469}
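
The pattern introduced here recurs throughout the patch: the per-client clp->cl_delegations list is gone, and every walker now takes the RCU-protected list of nfs_server structures on clp->cl_superblocks, then walks each server's ->delegations list. A minimal user-space model of that two-level walk and of the fmode filter above, with plain singly linked lists standing in for the kernel's RCU lists (all names and types below are illustrative, not the kernel's):

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

struct delegation {
	int type;                 /* FMODE_READ, FMODE_WRITE, or both */
	int marked_for_return;
	struct delegation *next;
};

struct server {
	struct delegation *delegations;
	struct server *next;
};

struct client {
	struct server *superblocks;
};

/* Per-server helper, cf. nfs_mark_return_all_delegation_types(). */
static void mark_server(struct server *s, int flags)
{
	for (struct delegation *d = s->delegations; d != NULL; d = d->next) {
		/* a READ|WRITE delegation is only returned when WRITE expires */
		if (d->type == (FMODE_READ | FMODE_WRITE) && !(flags & FMODE_WRITE))
			continue;
		if (d->type & flags)
			d->marked_for_return = 1;
	}
}

/* Per-client wrapper, cf. the walk over clp->cl_superblocks. */
static void mark_client(struct client *c, int flags)
{
	for (struct server *s = c->superblocks; s != NULL; s = s->next)
		mark_server(s, flags);
}

int main(void)
{
	struct delegation d1 = { FMODE_READ, 0, NULL };
	struct delegation d2 = { FMODE_READ | FMODE_WRITE, 0, &d1 };
	struct server srv = { &d2, NULL };
	struct client cli = { &srv };

	mark_client(&cli, FMODE_READ);    /* marks d1; d2 survives the filter */
	printf("d1=%d d2=%d\n", d1.marked_for_return, d2.marked_for_return);
	return 0;
}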
411 470
@@ -420,19 +479,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
420 nfs4_schedule_state_manager(clp); 479 nfs4_schedule_state_manager(clp);
421} 480}
422 481
482/**
483 * nfs_expire_all_delegation_types - expire delegations of the given types
484 * @clp: client to process
485 * @flags: delegation types to expire
486 *
487 */
423void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) 488void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
424{ 489{
425 nfs_client_mark_return_all_delegation_types(clp, flags); 490 nfs_client_mark_return_all_delegation_types(clp, flags);
426 nfs_delegation_run_state_manager(clp); 491 nfs_delegation_run_state_manager(clp);
427} 492}
428 493
494/**
495 * nfs_expire_all_delegations - expire all of a client's delegations
496 * @clp: client to process
497 *
498 */
429void nfs_expire_all_delegations(struct nfs_client *clp) 499void nfs_expire_all_delegations(struct nfs_client *clp)
430{ 500{
431 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 501 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
432} 502}
433 503
434/* 504/**
435 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 505 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
506 * @clp: client to process
507 *
436 */ 508 */
437void nfs_handle_cb_pathdown(struct nfs_client *clp) 509void nfs_handle_cb_pathdown(struct nfs_client *clp)
438{ 510{
@@ -441,29 +513,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
441 nfs_client_mark_return_all_delegations(clp); 513 nfs_client_mark_return_all_delegations(clp);
442} 514}
443 515
444static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) 516static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
445{ 517{
446 struct nfs_delegation *delegation; 518 struct nfs_delegation *delegation;
447 519
448 rcu_read_lock(); 520 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
449 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
450 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 521 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
451 continue; 522 continue;
452 nfs_mark_return_delegation(clp, delegation); 523 nfs_mark_return_delegation(delegation);
453 } 524 }
454 rcu_read_unlock();
455} 525}
456 526
527/**
528 * nfs_expire_unreferenced_delegations - Eliminate unused delegations
529 * @clp: nfs_client to process
530 *
531 */
457void nfs_expire_unreferenced_delegations(struct nfs_client *clp) 532void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
458{ 533{
459 nfs_client_mark_return_unreferenced_delegations(clp); 534 struct nfs_server *server;
535
536 rcu_read_lock();
537 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
538 nfs_mark_return_unreferenced_delegations(server);
539 rcu_read_unlock();
540
460 nfs_delegation_run_state_manager(clp); 541 nfs_delegation_run_state_manager(clp);
461} 542}
462 543
463/* 544/**
464 * Asynchronous delegation recall! 545 * nfs_async_inode_return_delegation - asynchronously return a delegation
546 * @inode: inode to process
547 * @stateid: state ID information from CB_RECALL arguments
548 *
549 * Returns zero on success, or a negative errno value.
465 */ 550 */
466int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 551int nfs_async_inode_return_delegation(struct inode *inode,
552 const nfs4_stateid *stateid)
467{ 553{
468 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 554 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
469 struct nfs_delegation *delegation; 555 struct nfs_delegation *delegation;
@@ -475,22 +561,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
475 rcu_read_unlock(); 561 rcu_read_unlock();
476 return -ENOENT; 562 return -ENOENT;
477 } 563 }
478 564 nfs_mark_return_delegation(delegation);
479 nfs_mark_return_delegation(clp, delegation);
480 rcu_read_unlock(); 565 rcu_read_unlock();
566
481 nfs_delegation_run_state_manager(clp); 567 nfs_delegation_run_state_manager(clp);
482 return 0; 568 return 0;
483} 569}
484 570
485/* 571static struct inode *
486 * Retrieve the inode associated with a delegation 572nfs_delegation_find_inode_server(struct nfs_server *server,
487 */ 573 const struct nfs_fh *fhandle)
488struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
489{ 574{
490 struct nfs_delegation *delegation; 575 struct nfs_delegation *delegation;
491 struct inode *res = NULL; 576 struct inode *res = NULL;
492 rcu_read_lock(); 577
493 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 578 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
494 spin_lock(&delegation->lock); 579 spin_lock(&delegation->lock);
495 if (delegation->inode != NULL && 580 if (delegation->inode != NULL &&
496 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 581 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -500,49 +585,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
500 if (res != NULL) 585 if (res != NULL)
501 break; 586 break;
502 } 587 }
588 return res;
589}
590
591/**
592 * nfs_delegation_find_inode - retrieve the inode associated with a delegation
593 * @clp: client state handle
594 * @fhandle: filehandle from a delegation recall
595 *
596 * Returns pointer to inode matching "fhandle," or NULL if a matching inode
597 * cannot be found.
598 */
599struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
600 const struct nfs_fh *fhandle)
601{
602 struct nfs_server *server;
603 struct inode *res = NULL;
604
605 rcu_read_lock();
606 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
607 res = nfs_delegation_find_inode_server(server, fhandle);
608 if (res != NULL)
609 break;
610 }
503 rcu_read_unlock(); 611 rcu_read_unlock();
504 return res; 612 return res;
505} 613}
506 614
507/* 615static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
508 * Mark all delegations as needing to be reclaimed 616{
617 struct nfs_delegation *delegation;
618
619 list_for_each_entry_rcu(delegation, &server->delegations, super_list)
620 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
621}
622
623/**
624 * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
625 * @clp: nfs_client to process
626 *
509 */ 627 */
510void nfs_delegation_mark_reclaim(struct nfs_client *clp) 628void nfs_delegation_mark_reclaim(struct nfs_client *clp)
511{ 629{
512 struct nfs_delegation *delegation; 630 struct nfs_server *server;
631
513 rcu_read_lock(); 632 rcu_read_lock();
514 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) 633 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
515 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 634 nfs_delegation_mark_reclaim_server(server);
516 rcu_read_unlock(); 635 rcu_read_unlock();
517} 636}
518 637
519/* 638/**
520 * Reap all unclaimed delegations after reboot recovery is done 639 * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
640 * @clp: nfs_client to process
641 *
521 */ 642 */
522void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 643void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
523{ 644{
524 struct nfs_delegation *delegation; 645 struct nfs_delegation *delegation;
646 struct nfs_server *server;
525 struct inode *inode; 647 struct inode *inode;
648
526restart: 649restart:
527 rcu_read_lock(); 650 rcu_read_lock();
528 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 651 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
529 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) 652 list_for_each_entry_rcu(delegation, &server->delegations,
530 continue; 653 super_list) {
531 inode = nfs_delegation_grab_inode(delegation); 654 if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
532 if (inode == NULL) 655 &delegation->flags) == 0)
533 continue; 656 continue;
534 spin_lock(&clp->cl_lock); 657 inode = nfs_delegation_grab_inode(delegation);
535 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 658 if (inode == NULL)
536 spin_unlock(&clp->cl_lock); 659 continue;
537 rcu_read_unlock(); 660 delegation = nfs_detach_delegation(NFS_I(inode),
538 if (delegation != NULL) 661 server);
539 nfs_free_delegation(delegation); 662 rcu_read_unlock();
540 iput(inode); 663
541 goto restart; 664 if (delegation != NULL)
665 nfs_free_delegation(delegation);
666 iput(inode);
667 goto restart;
668 }
542 } 669 }
543 rcu_read_unlock(); 670 rcu_read_unlock();
544} 671}
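
nfs_delegation_reap_unclaimed() keeps the restart discipline of the old loop: once a delegation is detached, the RCU read lock is dropped, the delegation is freed, the inode is released with iput(), and the whole scan starts over, because the lists may have changed while the lock was not held. A condensed user-space model of the restart-after-removal pattern over one plain list (simplified types; no RCU or locking here):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int need_reclaim;
	struct node *next;
};

/* Remove every flagged node, restarting the scan after each removal,
 * just as the kernel rescans after dropping rcu_read_lock(). */
static void reap(struct node **head)
{
restart:
	for (struct node **pp = head; *pp != NULL; pp = &(*pp)->next) {
		if (!(*pp)->need_reclaim)
			continue;
		struct node *victim = *pp;

		*pp = victim->next;     /* cf. nfs_detach_delegation() */
		free(victim);           /* cf. nfs_free_delegation() + iput() */
		goto restart;           /* the list may have changed; rescan */
	}
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 0; i < 5; i++) {
		struct node *n = malloc(sizeof(*n));

		if (n == NULL)
			return 1;
		n->need_reclaim = (i % 2 == 0);   /* flag three of five nodes */
		n->next = head;
		head = n;
	}
	reap(&head);

	int left = 0;
	for (struct node *n = head; n != NULL; n = n->next)
		left++;
	printf("%d nodes left\n", left);          /* prints "2 nodes left" */
	return 0;
}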
545 672
673/**
674 * nfs_delegations_present - check for existence of delegations
675 * @clp: client state handle
676 *
677 * Returns one if there are any nfs_delegation structures attached
678 * to this nfs_client.
679 */
680int nfs_delegations_present(struct nfs_client *clp)
681{
682 struct nfs_server *server;
683 int ret = 0;
684
685 rcu_read_lock();
686 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
687 if (!list_empty(&server->delegations)) {
688 ret = 1;
689 break;
690 }
691 rcu_read_unlock();
692 return ret;
693}
694
695/**
696 * nfs4_copy_delegation_stateid - Copy inode's state ID information
697 * @dst: stateid data structure to fill in
698 * @inode: inode to check
699 *
700 * Returns one and fills in "dst->data" if inode had a delegation,
701 * otherwise zero is returned.
702 */
546int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 703int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
547{ 704{
548 struct nfs_inode *nfsi = NFS_I(inode); 705 struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda19..d9322e490c56 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp); 45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 46int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp);
47 48
48void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
49void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,13 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/kmemleak.h>
37#include <linux/xattr.h>
36 38
37#include "nfs4_fs.h"
38#include "delegation.h" 39#include "delegation.h"
39#include "iostat.h" 40#include "iostat.h"
40#include "internal.h" 41#include "internal.h"
42#include "fscache.h"
41 43
42/* #define NFS_DEBUG_VERBOSE 1 */ 44/* #define NFS_DEBUG_VERBOSE 1 */
43 45
@@ -55,6 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 57 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, int); 58static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 59static loff_t nfs_llseek_dir(struct file *, loff_t, int);
60static void nfs_readdir_clear_array(struct page*);
58 61
59const struct file_operations nfs_dir_operations = { 62const struct file_operations nfs_dir_operations = {
60 .llseek = nfs_llseek_dir, 63 .llseek = nfs_llseek_dir,
@@ -80,6 +83,10 @@ const struct inode_operations nfs_dir_inode_operations = {
80 .setattr = nfs_setattr, 83 .setattr = nfs_setattr,
81}; 84};
82 85
86const struct address_space_operations nfs_dir_aops = {
87 .freepage = nfs_readdir_clear_array,
88};
89
83#ifdef CONFIG_NFS_V3 90#ifdef CONFIG_NFS_V3
84const struct inode_operations nfs3_dir_inode_operations = { 91const struct inode_operations nfs3_dir_inode_operations = {
85 .create = nfs_create, 92 .create = nfs_create,
@@ -104,8 +111,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
104#ifdef CONFIG_NFS_V4 111#ifdef CONFIG_NFS_V4
105 112
106static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); 113static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
114static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
107const struct inode_operations nfs4_dir_inode_operations = { 115const struct inode_operations nfs4_dir_inode_operations = {
108 .create = nfs_create, 116 .create = nfs_open_create,
109 .lookup = nfs_atomic_lookup, 117 .lookup = nfs_atomic_lookup,
110 .link = nfs_link, 118 .link = nfs_link,
111 .unlink = nfs_unlink, 119 .unlink = nfs_unlink,
@@ -117,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
117 .permission = nfs_permission, 125 .permission = nfs_permission,
118 .getattr = nfs_getattr, 126 .getattr = nfs_getattr,
119 .setattr = nfs_setattr, 127 .setattr = nfs_setattr,
120 .getxattr = nfs4_getxattr, 128 .getxattr = generic_getxattr,
121 .setxattr = nfs4_setxattr, 129 .setxattr = generic_setxattr,
122 .listxattr = nfs4_listxattr, 130 .listxattr = generic_listxattr,
131 .removexattr = generic_removexattr,
123}; 132};
124 133
125#endif /* CONFIG_NFS_V4 */ 134#endif /* CONFIG_NFS_V4 */
@@ -150,51 +159,209 @@ nfs_opendir(struct inode *inode, struct file *filp)
150 return res; 159 return res;
151} 160}
152 161
153typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); 162struct nfs_cache_array_entry {
163 u64 cookie;
164 u64 ino;
165 struct qstr string;
166 unsigned char d_type;
167};
168
169struct nfs_cache_array {
170 unsigned int size;
171 int eof_index;
172 u64 last_cookie;
173 struct nfs_cache_array_entry array[0];
174};
175
176typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
154typedef struct { 177typedef struct {
155 struct file *file; 178 struct file *file;
156 struct page *page; 179 struct page *page;
157 unsigned long page_index; 180 unsigned long page_index;
158 __be32 *ptr;
159 u64 *dir_cookie; 181 u64 *dir_cookie;
182 u64 last_cookie;
160 loff_t current_index; 183 loff_t current_index;
161 struct nfs_entry *entry;
162 decode_dirent_t decode; 184 decode_dirent_t decode;
163 int plus; 185
164 unsigned long timestamp; 186 unsigned long timestamp;
165 unsigned long gencount; 187 unsigned long gencount;
166 int timestamp_valid; 188 unsigned int cache_entry_index;
189 unsigned int plus:1;
190 unsigned int eof:1;
167} nfs_readdir_descriptor_t; 191} nfs_readdir_descriptor_t;
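
The rewritten readdir path stores decoded entries in an nfs_cache_array placed at the start of each page-cache page; nfs_readdir_add_to_array() below refuses an entry as soon as the end of array->array[size] would cross the page boundary and returns -ENOSPC. A user-space sketch of the layout arithmetic, assuming a 4096-byte page (the kernel's zero-length array[0] is written as a C99 flexible array member, and all names are local to the sketch):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

struct qstr { unsigned int len; const char *name; };

struct cache_entry {            /* cf. struct nfs_cache_array_entry */
	uint64_t cookie;
	uint64_t ino;
	struct qstr string;
	unsigned char d_type;
};

struct cache_array {            /* cf. struct nfs_cache_array */
	unsigned int size;
	int eof_index;
	uint64_t last_cookie;
	struct cache_entry array[];
};

int main(void)
{
	size_t header = sizeof(struct cache_array);   /* bytes before array[] */
	size_t per_entry = sizeof(struct cache_entry);
	size_t capacity = (PAGE_SIZE - header) / per_entry;

	/* The kernel's -ENOSPC test for entry number 'size' is equivalent to:
	 *   header + (size + 1) * per_entry > PAGE_SIZE */
	printf("header=%zu entry=%zu -> %zu entries per %d-byte page\n",
	       header, per_entry, capacity, PAGE_SIZE);
	return 0;
}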
168 192
169/* Now we cache directories properly, by stuffing the dirent 193/*
170 * data directly in the page cache. 194 * The caller is responsible for calling nfs_readdir_release_array(page)
171 *
172 * Inode invalidation due to refresh etc. takes care of
173 * _everything_, no sloppy entry flushing logic, no extraneous
174 * copying, network direct to page cache, the way it was meant
175 * to be.
176 *
177 * NOTE: Dirent information verification is always done by the
178 * page-in of the RPC reply, nowhere else; this simplifies
179 * things substantially.
180 */ 195 */
181static 196static
182int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) 197struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
198{
199 void *ptr;
200 if (page == NULL)
201 return ERR_PTR(-EIO);
202 ptr = kmap(page);
203 if (ptr == NULL)
204 return ERR_PTR(-ENOMEM);
205 return ptr;
206}
207
208static
209void nfs_readdir_release_array(struct page *page)
210{
211 kunmap(page);
212}
213
214/*
215 * we are freeing strings created by nfs_readdir_add_to_array()
216 */
217static
218void nfs_readdir_clear_array(struct page *page)
219{
220 struct nfs_cache_array *array;
221 int i;
222
223 array = kmap_atomic(page, KM_USER0);
224 for (i = 0; i < array->size; i++)
225 kfree(array->array[i].string.name);
226 kunmap_atomic(array, KM_USER0);
227}
228
229/*
230 * the caller is responsible for freeing qstr.name
231 * when called by nfs_readdir_add_to_array, the strings will be freed in
232 * nfs_readdir_clear_array()
233 */
234static
235int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
236{
237 string->len = len;
238 string->name = kmemdup(name, len, GFP_KERNEL);
239 if (string->name == NULL)
240 return -ENOMEM;
241 /*
242 * Avoid a kmemleak false positive. The pointer to the name is stored
243 * in a page cache page which kmemleak does not scan.
244 */
245 kmemleak_not_leak(string->name);
246 string->hash = full_name_hash(name, len);
247 return 0;
248}
249
250static
251int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
252{
253 struct nfs_cache_array *array = nfs_readdir_get_array(page);
254 struct nfs_cache_array_entry *cache_entry;
255 int ret;
256
257 if (IS_ERR(array))
258 return PTR_ERR(array);
259
260 cache_entry = &array->array[array->size];
261
262 /* Check that this entry lies within the page bounds */
263 ret = -ENOSPC;
264 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
265 goto out;
266
267 cache_entry->cookie = entry->prev_cookie;
268 cache_entry->ino = entry->ino;
269 cache_entry->d_type = entry->d_type;
270 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
271 if (ret)
272 goto out;
273 array->last_cookie = entry->cookie;
274 array->size++;
275 if (entry->eof != 0)
276 array->eof_index = array->size;
277out:
278 nfs_readdir_release_array(page);
279 return ret;
280}
281
282static
283int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
284{
285 loff_t diff = desc->file->f_pos - desc->current_index;
286 unsigned int index;
287
288 if (diff < 0)
289 goto out_eof;
290 if (diff >= array->size) {
291 if (array->eof_index >= 0)
292 goto out_eof;
293 desc->current_index += array->size;
294 return -EAGAIN;
295 }
296
297 index = (unsigned int)diff;
298 *desc->dir_cookie = array->array[index].cookie;
299 desc->cache_entry_index = index;
300 return 0;
301out_eof:
302 desc->eof = 1;
303 return -EBADCOOKIE;
304}
305
306static
307int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
308{
309 int i;
310 int status = -EAGAIN;
311
312 for (i = 0; i < array->size; i++) {
313 if (array->array[i].cookie == *desc->dir_cookie) {
314 desc->cache_entry_index = i;
315 return 0;
316 }
317 }
318 if (array->eof_index >= 0) {
319 status = -EBADCOOKIE;
320 if (*desc->dir_cookie == array->last_cookie)
321 desc->eof = 1;
322 }
323 return status;
324}
325
326static
327int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
328{
329 struct nfs_cache_array *array;
330 int status;
331
332 array = nfs_readdir_get_array(desc->page);
333 if (IS_ERR(array)) {
334 status = PTR_ERR(array);
335 goto out;
336 }
337
338 if (*desc->dir_cookie == 0)
339 status = nfs_readdir_search_for_pos(array, desc);
340 else
341 status = nfs_readdir_search_for_cookie(array, desc);
342
343 if (status == -EAGAIN) {
344 desc->last_cookie = array->last_cookie;
345 desc->page_index++;
346 }
347 nfs_readdir_release_array(desc->page);
348out:
349 return status;
350}
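
Two different searches can run against a cached page: with *desc->dir_cookie == 0, nfs_readdir_search_for_pos() resolves file->f_pos positionally, accumulating current_index across pages and returning -EAGAIN until the page holding that offset is found; otherwise nfs_readdir_search_for_cookie() scans the page for an exact cookie match. A user-space model of both modes (cookies are bare integers here; EBADCOOKIE is the kernel-internal errno value):

#include <stdio.h>

#define EAGAIN     11
#define EBADCOOKIE 523   /* kernel-internal errno: "Cookie is stale" */

struct page_array { int size; const unsigned long *cookies; };

/* Positional search, cf. nfs_readdir_search_for_pos(). */
static int search_for_pos(const struct page_array *a, long pos,
			  long *current_index, int *index)
{
	long diff = pos - *current_index;

	if (diff < 0)
		return -EBADCOOKIE;         /* walked past the target */
	if (diff >= a->size) {
		*current_index += a->size;  /* target is on a later page */
		return -EAGAIN;
	}
	*index = (int)diff;
	return 0;
}

/* Exact-cookie search, cf. nfs_readdir_search_for_cookie(). */
static int search_for_cookie(const struct page_array *a,
			     unsigned long cookie, int *index)
{
	for (int i = 0; i < a->size; i++) {
		if (a->cookies[i] == cookie) {
			*index = i;
			return 0;
		}
	}
	return -EAGAIN;                     /* keep reading pages */
}

int main(void)
{
	const unsigned long p0[] = { 3, 7, 9 }, p1[] = { 12, 20 };
	const struct page_array pages[] = { { 3, p0 }, { 2, p1 } };
	long current_index = 0;
	int idx;

	for (int pg = 0; pg < 2; pg++) {    /* f_pos 4 -> page 1, slot 1 */
		if (search_for_pos(&pages[pg], 4, &current_index, &idx) == 0) {
			printf("pos 4 -> page %d index %d\n", pg, idx);
			break;
		}
	}
	for (int pg = 0; pg < 2; pg++) {    /* cookie 12 -> page 1, slot 0 */
		if (search_for_cookie(&pages[pg], 12, &idx) == 0) {
			printf("cookie 12 -> page %d index %d\n", pg, idx);
			break;
		}
	}
	return 0;
}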
351
352/* Fill a page with xdr information before transferring to the cache page */
353static
354int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
355 struct nfs_entry *entry, struct file *file, struct inode *inode)
183{ 356{
184 struct file *file = desc->file;
185 struct inode *inode = file->f_path.dentry->d_inode;
186 struct rpc_cred *cred = nfs_file_cred(file); 357 struct rpc_cred *cred = nfs_file_cred(file);
187 unsigned long timestamp, gencount; 358 unsigned long timestamp, gencount;
188 int error; 359 int error;
189 360
190 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
191 __func__, (long long)desc->entry->cookie,
192 page->index);
193
194 again: 361 again:
195 timestamp = jiffies; 362 timestamp = jiffies;
196 gencount = nfs_inc_attr_generation_counter(); 363 gencount = nfs_inc_attr_generation_counter();
197 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 364 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
198 NFS_SERVER(inode)->dtsize, desc->plus); 365 NFS_SERVER(inode)->dtsize, desc->plus);
199 if (error < 0) { 366 if (error < 0) {
200 /* We requested READDIRPLUS, but the server doesn't grok it */ 367 /* We requested READDIRPLUS, but the server doesn't grok it */
@@ -208,199 +375,312 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
208 } 375 }
209 desc->timestamp = timestamp; 376 desc->timestamp = timestamp;
210 desc->gencount = gencount; 377 desc->gencount = gencount;
211 desc->timestamp_valid = 1; 378error:
212 SetPageUptodate(page); 379 return error;
213 /* Ensure consistent page alignment of the data.
214 * Note: assumes we have exclusive access to this mapping either
215 * through inode->i_mutex or some other mechanism.
216 */
217 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
218 /* Should never happen */
219 nfs_zap_mapping(inode, inode->i_mapping);
220 }
221 unlock_page(page);
222 return 0;
223 error:
224 unlock_page(page);
225 return -EIO;
226} 380}
227 381
228static inline 382static int xdr_decode(nfs_readdir_descriptor_t *desc,
229int dir_decode(nfs_readdir_descriptor_t *desc) 383 struct nfs_entry *entry, struct xdr_stream *xdr)
230{ 384{
231 __be32 *p = desc->ptr; 385 int error;
232 p = desc->decode(p, desc->entry, desc->plus); 386
233 if (IS_ERR(p)) 387 error = desc->decode(xdr, entry, desc->plus);
234 return PTR_ERR(p); 388 if (error)
235 desc->ptr = p; 389 return error;
236 if (desc->timestamp_valid) { 390 entry->fattr->time_start = desc->timestamp;
237 desc->entry->fattr->time_start = desc->timestamp; 391 entry->fattr->gencount = desc->gencount;
238 desc->entry->fattr->gencount = desc->gencount;
239 } else
240 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
241 return 0; 392 return 0;
242} 393}
243 394
244static inline 395static
245void dir_page_release(nfs_readdir_descriptor_t *desc) 396int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
246{ 397{
247 kunmap(desc->page); 398 if (dentry->d_inode == NULL)
248 page_cache_release(desc->page); 399 goto different;
249 desc->page = NULL; 400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
250 desc->ptr = NULL; 401 goto different;
402 return 1;
403different:
404 return 0;
251} 405}
252 406
253/* 407static
254 * Given a pointer to a buffer that has already been filled by a call 408void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
255 * to readdir, find the next entry with cookie '*desc->dir_cookie'.
256 *
257 * If the end of the buffer has been reached, return -EAGAIN, if not,
258 * return the offset within the buffer of the next entry to be
259 * read.
260 */
261static inline
262int find_dirent(nfs_readdir_descriptor_t *desc)
263{ 409{
264 struct nfs_entry *entry = desc->entry; 410 struct qstr filename = {
265 int loop_count = 0, 411 .len = entry->len,
266 status; 412 .name = entry->name,
413 };
414 struct dentry *dentry;
415 struct dentry *alias;
416 struct inode *dir = parent->d_inode;
417 struct inode *inode;
267 418
268 while((status = dir_decode(desc)) == 0) { 419 if (filename.name[0] == '.') {
269 dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", 420 if (filename.len == 1)
270 __func__, (unsigned long long)entry->cookie); 421 return;
271 if (entry->prev_cookie == *desc->dir_cookie) 422 if (filename.len == 2 && filename.name[1] == '.')
272 break; 423 return;
273 if (loop_count++ > 200) { 424 }
274 loop_count = 0; 425 filename.hash = full_name_hash(filename.name, filename.len);
275 schedule(); 426
427 dentry = d_lookup(parent, &filename);
428 if (dentry != NULL) {
429 if (nfs_same_file(dentry, entry)) {
430 nfs_refresh_inode(dentry->d_inode, entry->fattr);
431 goto out;
432 } else {
433 d_drop(dentry);
434 dput(dentry);
276 } 435 }
277 } 436 }
278 return status; 437
438 dentry = d_alloc(parent, &filename);
439 if (dentry == NULL)
440 return;
441
442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
443 if (IS_ERR(inode))
444 goto out;
445
446 alias = d_materialise_unique(dentry, inode);
447 if (IS_ERR(alias))
448 goto out;
449 else if (alias) {
450 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
451 dput(alias);
452 } else
453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
454
455out:
456 dput(dentry);
279} 457}
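
nfs_prime_dcache() is where readdirplus data now enters the dcache: "." and ".." are skipped, an existing dentry that still matches the filehandle just has its inode refreshed, a mismatched one is dropped, and otherwise a new dentry is allocated and bound with d_materialise_unique(). The name filter at the top is the classic two-character test, modeled here in user space:

#include <stdio.h>
#include <string.h>

/* cf. the filename check at the top of nfs_prime_dcache() */
static int is_dot_or_dotdot(const char *name, unsigned int len)
{
	if (name[0] != '.')
		return 0;
	if (len == 1)
		return 1;
	return len == 2 && name[1] == '.';
}

int main(void)
{
	const char *names[] = { ".", "..", ".hidden", "file" };

	for (int i = 0; i < 4; i++)
		printf("%-8s skipped=%d\n", names[i],
		       is_dot_or_dotdot(names[i], (unsigned int)strlen(names[i])));
	return 0;
}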
280 458
281/* 459/* Perform conversion from xdr to cache array */
282 * Given a pointer to a buffer that has already been filled by a call 460static
283 * to readdir, find the entry at offset 'desc->file->f_pos'. 461int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
284 * 462 struct page **xdr_pages, struct page *page, unsigned int buflen)
285 * If the end of the buffer has been reached, return -EAGAIN, if not,
286 * return the offset within the buffer of the next entry to be
287 * read.
288 */
289static inline
290int find_dirent_index(nfs_readdir_descriptor_t *desc)
291{ 463{
292 struct nfs_entry *entry = desc->entry; 464 struct xdr_stream stream;
293 int loop_count = 0, 465 struct xdr_buf buf = {
294 status; 466 .pages = xdr_pages,
467 .page_len = buflen,
468 .buflen = buflen,
469 .len = buflen,
470 };
471 struct page *scratch;
472 struct nfs_cache_array *array;
473 unsigned int count = 0;
474 int status;
295 475
296 for(;;) { 476 scratch = alloc_page(GFP_KERNEL);
297 status = dir_decode(desc); 477 if (scratch == NULL)
298 if (status) 478 return -ENOMEM;
299 break;
300 479
301 dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", 480 xdr_init_decode(&stream, &buf, NULL);
302 (unsigned long long)entry->cookie, desc->current_index); 481 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
303 482
304 if (desc->file->f_pos == desc->current_index) { 483 do {
305 *desc->dir_cookie = entry->cookie; 484 status = xdr_decode(desc, entry, &stream);
485 if (status != 0) {
486 if (status == -EAGAIN)
487 status = 0;
306 break; 488 break;
307 } 489 }
308 desc->current_index++; 490
309 if (loop_count++ > 200) { 491 count++;
310 loop_count = 0; 492
311 schedule(); 493 if (desc->plus != 0)
312 } 494 nfs_prime_dcache(desc->file->f_path.dentry, entry);
495
496 status = nfs_readdir_add_to_array(entry, page);
497 if (status != 0)
498 break;
499 } while (!entry->eof);
500
501 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
502 array = nfs_readdir_get_array(page);
503 if (!IS_ERR(array)) {
504 array->eof_index = array->size;
505 status = 0;
506 nfs_readdir_release_array(page);
507 } else
508 status = PTR_ERR(array);
313 } 509 }
510
511 put_page(scratch);
314 return status; 512 return status;
315} 513}
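
nfs_readdir_page_filler() decodes one entry at a time from the XDR reply pages into the cache array and treats a full page as success rather than failure: on -ENOSPC the loop stops and the remaining entries are fetched by a later READDIR that resumes from the array's last_cookie. A compact model of that loop (the stand-in decoder hands out fixed cookies; note the simplification flagged in the comment, since the kernel keeps entry->prev_cookie per entry and entry->cookie in last_cookie):

#include <stdio.h>

#define ENOSPC   28
#define CAPACITY 4   /* pretend one cache page holds four entries */

struct array { int size; int eof_index; unsigned long last_cookie; };

/* Stand-in for xdr_decode(): emit cookies from src[], flag EOF at the end. */
static int decode(const unsigned long *src, int nsrc, int *pos,
		  unsigned long *cookie, int *eof)
{
	if (*pos >= nsrc)
		return -1;              /* stream exhausted */
	*cookie = src[(*pos)++];
	*eof = (*pos == nsrc);
	return 0;
}

static int add_to_array(struct array *a, unsigned long cookie, int eof)
{
	if (a->size == CAPACITY)
		return -ENOSPC;         /* page full: stop, but not an error */
	a->last_cookie = cookie;        /* simplified; see lead-in note */
	a->size++;
	if (eof)
		a->eof_index = a->size;
	return 0;
}

int main(void)
{
	const unsigned long reply[] = { 3, 7, 9, 12, 20, 31 };
	struct array a = { 0, -1, 0 };
	unsigned long cookie;
	int pos = 0, eof = 0;

	while (decode(reply, 6, &pos, &cookie, &eof) == 0) {
		if (add_to_array(&a, cookie, eof) != 0)
			break;          /* a later READDIR resumes from last_cookie */
		if (eof)
			break;
	}
	printf("cached %d entries, last_cookie=%lu, eof_index=%d\n",
	       a.size, a.last_cookie, a.eof_index);
	return 0;
}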
316 514
515static
516void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
517{
518 unsigned int i;
519 for (i = 0; i < npages; i++)
520 put_page(pages[i]);
521}
522
523static
524void nfs_readdir_free_large_page(void *ptr, struct page **pages,
525 unsigned int npages)
526{
527 nfs_readdir_free_pagearray(pages, npages);
528}
529
317/* 530/*
318 * Find the given page, and call find_dirent() or find_dirent_index in 531 * nfs_readdir_large_page will allocate pages that must be freed with a call
319 * order to try to return the next entry. 532 * to nfs_readdir_free_large_page
320 */ 533 */
321static inline 534static
322int find_dirent_page(nfs_readdir_descriptor_t *desc) 535int nfs_readdir_large_page(struct page **pages, unsigned int npages)
323{ 536{
324 struct inode *inode = desc->file->f_path.dentry->d_inode; 537 unsigned int i;
325 struct page *page;
326 int status;
327 538
328 dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", 539 for (i = 0; i < npages; i++) {
329 __func__, desc->page_index, 540 struct page *page = alloc_page(GFP_KERNEL);
330 (long long) *desc->dir_cookie); 541 if (page == NULL)
542 goto out_freepages;
543 pages[i] = page;
544 }
545 return 0;
331 546
332 /* If we find the page in the page_cache, we cannot be sure 547out_freepages:
333 * how fresh the data is, so we will ignore readdir_plus attributes. 548 nfs_readdir_free_pagearray(pages, i);
334 */ 549 return -ENOMEM;
335 desc->timestamp_valid = 0; 550}
336 page = read_cache_page(inode->i_mapping, desc->page_index, 551
337 (filler_t *)nfs_readdir_filler, desc); 552static
338 if (IS_ERR(page)) { 553int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
339 status = PTR_ERR(page); 554{
555 struct page *pages[NFS_MAX_READDIR_PAGES];
556 void *pages_ptr = NULL;
557 struct nfs_entry entry;
558 struct file *file = desc->file;
559 struct nfs_cache_array *array;
560 int status = -ENOMEM;
561 unsigned int array_size = ARRAY_SIZE(pages);
562
563 entry.prev_cookie = 0;
564 entry.cookie = desc->last_cookie;
565 entry.eof = 0;
566 entry.fh = nfs_alloc_fhandle();
567 entry.fattr = nfs_alloc_fattr();
568 entry.server = NFS_SERVER(inode);
569 if (entry.fh == NULL || entry.fattr == NULL)
570 goto out;
571
572 array = nfs_readdir_get_array(page);
573 if (IS_ERR(array)) {
574 status = PTR_ERR(array);
340 goto out; 575 goto out;
341 } 576 }
577 memset(array, 0, sizeof(struct nfs_cache_array));
578 array->eof_index = -1;
342 579
343 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 580 status = nfs_readdir_large_page(pages, array_size);
344 desc->page = page;
345 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
346 if (*desc->dir_cookie != 0)
347 status = find_dirent(desc);
348 else
349 status = find_dirent_index(desc);
350 if (status < 0) 581 if (status < 0)
351 dir_page_release(desc); 582 goto out_release_array;
352 out: 583 do {
353 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); 584 unsigned int pglen;
585 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
586
587 if (status < 0)
588 break;
589 pglen = status;
590 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
591 if (status < 0) {
592 if (status == -ENOSPC)
593 status = 0;
594 break;
595 }
596 } while (array->eof_index < 0);
597
598 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
599out_release_array:
600 nfs_readdir_release_array(page);
601out:
602 nfs_free_fattr(entry.fattr);
603 nfs_free_fhandle(entry.fh);
354 return status; 604 return status;
355} 605}
356 606
357/* 607/*
358 * Recurse through the page cache pages, and return a 608 * Now we cache directories properly, by converting xdr information
359 * filled nfs_entry structure of the next directory entry if possible. 609 * to an array that can be used for lookups later. This results in
360 * 610 * fewer cache pages, since we can store more information on each page.
361 * The target for the search is '*desc->dir_cookie' if non-0, 611 * We only need to convert from xdr once, so future lookups are much simpler.
362 * 'desc->file->f_pos' otherwise
363 */ 612 */
364static inline 613static
365int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 614int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
366{ 615{
367 int loop_count = 0; 616 struct inode *inode = desc->file->f_path.dentry->d_inode;
368 int res; 617 int ret;
369 618
370 /* Always search-by-index from the beginning of the cache */ 619 ret = nfs_readdir_xdr_to_array(desc, page, inode);
371 if (*desc->dir_cookie == 0) { 620 if (ret < 0)
372 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n", 621 goto error;
373 (long long)desc->file->f_pos); 622 SetPageUptodate(page);
374 desc->page_index = 0;
375 desc->entry->cookie = desc->entry->prev_cookie = 0;
376 desc->entry->eof = 0;
377 desc->current_index = 0;
378 } else
379 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
380 (unsigned long long)*desc->dir_cookie);
381 623
382 for (;;) { 624 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
383 res = find_dirent_page(desc); 625 /* Should never happen */
384 if (res != -EAGAIN) 626 nfs_zap_mapping(inode, inode->i_mapping);
385 break;
386 /* Align to beginning of next page */
387 desc->page_index ++;
388 if (loop_count++ > 200) {
389 loop_count = 0;
390 schedule();
391 }
392 } 627 }
628 unlock_page(page);
629 return 0;
630 error:
631 unlock_page(page);
632 return ret;
633}
393 634
394 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res); 635static
395 return res; 636void cache_page_release(nfs_readdir_descriptor_t *desc)
637{
638 if (!desc->page->mapping)
639 nfs_readdir_clear_array(desc->page);
640 page_cache_release(desc->page);
641 desc->page = NULL;
396} 642}
397 643
398static inline unsigned int dt_type(struct inode *inode) 644static
645struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
646{
647 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
648 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
649}
650
651/*
652 * Returns 0 if desc->dir_cookie was found on page desc->page_index
653 */
654static
655int find_cache_page(nfs_readdir_descriptor_t *desc)
399{ 656{
400 return (inode->i_mode >> 12) & 15; 657 int res;
658
659 desc->page = get_cache_page(desc);
660 if (IS_ERR(desc->page))
661 return PTR_ERR(desc->page);
662
663 res = nfs_readdir_search_array(desc);
664 if (res != 0)
665 cache_page_release(desc);
666 return res;
401} 667}
402 668
403static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); 669/* Search for desc->dir_cookie from the beginning of the page cache */
670static inline
671int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
672{
673 int res;
674
675 if (desc->page_index == 0) {
676 desc->current_index = 0;
677 desc->last_cookie = 0;
678 }
679 do {
680 res = find_cache_page(desc);
681 } while (res == -EAGAIN);
682 return res;
683}
404 684
405/* 685/*
406 * Once we've found the start of the dirent within a page: fill 'er up... 686 * Once we've found the start of the dirent within a page: fill 'er up...
@@ -410,51 +690,38 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
410 filldir_t filldir) 690 filldir_t filldir)
411{ 691{
412 struct file *file = desc->file; 692 struct file *file = desc->file;
413 struct nfs_entry *entry = desc->entry; 693 int i = 0;
414 struct dentry *dentry = NULL; 694 int res = 0;
415 u64 fileid; 695 struct nfs_cache_array *array = NULL;
416 int loop_count = 0,
417 res;
418
419 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
420 (unsigned long long)entry->cookie);
421
422 for(;;) {
423 unsigned d_type = DT_UNKNOWN;
424 /* Note: entry->prev_cookie contains the cookie for
425 * retrieving the current dirent on the server */
426 fileid = entry->ino;
427
428 /* Get a dentry if we have one */
429 if (dentry != NULL)
430 dput(dentry);
431 dentry = nfs_readdir_lookup(desc);
432 696
433 /* Use readdirplus info */ 697 array = nfs_readdir_get_array(desc->page);
434 if (dentry != NULL && dentry->d_inode != NULL) { 698 if (IS_ERR(array)) {
435 d_type = dt_type(dentry->d_inode); 699 res = PTR_ERR(array);
436 fileid = NFS_FILEID(dentry->d_inode); 700 goto out;
437 } 701 }
438 702
439 res = filldir(dirent, entry->name, entry->len, 703 for (i = desc->cache_entry_index; i < array->size; i++) {
440 file->f_pos, nfs_compat_user_ino64(fileid), 704 struct nfs_cache_array_entry *ent;
441 d_type); 705
442 if (res < 0) 706 ent = &array->array[i];
443 break; 707 if (filldir(dirent, ent->string.name, ent->string.len,
444 file->f_pos++; 708 file->f_pos, nfs_compat_user_ino64(ent->ino),
445 *desc->dir_cookie = entry->cookie; 709 ent->d_type) < 0) {
446 if (dir_decode(desc) != 0) { 710 desc->eof = 1;
447 desc->page_index ++;
448 break; 711 break;
449 } 712 }
450 if (loop_count++ > 200) { 713 file->f_pos++;
451 loop_count = 0; 714 if (i < (array->size-1))
452 schedule(); 715 *desc->dir_cookie = array->array[i+1].cookie;
453 } 716 else
717 *desc->dir_cookie = array->last_cookie;
454 } 718 }
455 dir_page_release(desc); 719 if (array->eof_index >= 0)
456 if (dentry != NULL) 720 desc->eof = 1;
457 dput(dentry); 721
722 nfs_readdir_release_array(desc->page);
723out:
724 cache_page_release(desc);
458 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 725 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
459 (unsigned long long)*desc->dir_cookie, res); 726 (unsigned long long)*desc->dir_cookie, res);
460 return res; 727 return res;
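
After each entry is handed to filldir(), nfs_do_filldir() advances f_pos by one and points *desc->dir_cookie at the cookie of the next array entry, or at the page's last_cookie when the emitted entry was the final one, so a later readdir() resumes exactly where the user buffer filled up. A worked example of that bookkeeping (local names only):

#include <stdio.h>

struct ent { unsigned long cookie; const char *name; };

int main(void)
{
	/* one cached page as built by the filler */
	const struct ent array[] = { { 3, "a" }, { 7, "b" }, { 9, "c" } };
	const unsigned long last_cookie = 12;  /* where the next page starts */
	const int size = 3;
	long f_pos = 0;
	unsigned long dir_cookie = 0;

	for (int i = 0; i < size; i++) {
		/* filldir() would copy array[i] to the user buffer here */
		f_pos++;
		if (i < size - 1)
			dir_cookie = array[i + 1].cookie;  /* resume mid-page */
		else
			dir_cookie = last_cookie;          /* resume on next page */
		printf("emitted %s: f_pos=%ld dir_cookie=%lu\n",
		       array[i].name, f_pos, dir_cookie);
	}
	return 0;
}
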
@@ -476,12 +743,9 @@ static inline
476int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 743int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
477 filldir_t filldir) 744 filldir_t filldir)
478{ 745{
479 struct file *file = desc->file;
480 struct inode *inode = file->f_path.dentry->d_inode;
481 struct rpc_cred *cred = nfs_file_cred(file);
482 struct page *page = NULL; 746 struct page *page = NULL;
483 int status; 747 int status;
484 unsigned long timestamp, gencount; 748 struct inode *inode = desc->file->f_path.dentry->d_inode;
485 749
486 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 750 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
487 (unsigned long long)*desc->dir_cookie); 751 (unsigned long long)*desc->dir_cookie);
@@ -491,38 +755,23 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
491 status = -ENOMEM; 755 status = -ENOMEM;
492 goto out; 756 goto out;
493 } 757 }
494 timestamp = jiffies; 758
495 gencount = nfs_inc_attr_generation_counter(); 759 desc->page_index = 0;
496 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, 760 desc->last_cookie = *desc->dir_cookie;
497 *desc->dir_cookie, page,
498 NFS_SERVER(inode)->dtsize,
499 desc->plus);
500 desc->page = page; 761 desc->page = page;
501 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 762
502 if (status >= 0) { 763 status = nfs_readdir_xdr_to_array(desc, page, inode);
503 desc->timestamp = timestamp;
504 desc->gencount = gencount;
505 desc->timestamp_valid = 1;
506 if ((status = dir_decode(desc)) == 0)
507 desc->entry->prev_cookie = *desc->dir_cookie;
508 } else
509 status = -EIO;
510 if (status < 0) 764 if (status < 0)
511 goto out_release; 765 goto out_release;
512 766
513 status = nfs_do_filldir(desc, dirent, filldir); 767 status = nfs_do_filldir(desc, dirent, filldir);
514 768
515 /* Reset read descriptor so it searches the page cache from
516 * the start upon the next call to readdir_search_pagecache() */
517 desc->page_index = 0;
518 desc->entry->cookie = desc->entry->prev_cookie = 0;
519 desc->entry->eof = 0;
520 out: 769 out:
521 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 770 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
522 __func__, status); 771 __func__, status);
523 return status; 772 return status;
524 out_release: 773 out_release:
525 dir_page_release(desc); 774 cache_page_release(desc);
526 goto out; 775 goto out;
527} 776}
528 777
@@ -536,8 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
536 struct inode *inode = dentry->d_inode; 785 struct inode *inode = dentry->d_inode;
537 nfs_readdir_descriptor_t my_desc, 786 nfs_readdir_descriptor_t my_desc,
538 *desc = &my_desc; 787 *desc = &my_desc;
539 struct nfs_entry my_entry; 788 int res;
540 int res = -ENOMEM;
541 789
542 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
543 dentry->d_parent->d_name.name, dentry->d_name.name, 791 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -557,57 +805,44 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
557 desc->decode = NFS_PROTO(inode)->decode_dirent; 805 desc->decode = NFS_PROTO(inode)->decode_dirent;
558 desc->plus = NFS_USE_READDIRPLUS(inode); 806 desc->plus = NFS_USE_READDIRPLUS(inode);
559 807
560 my_entry.cookie = my_entry.prev_cookie = 0;
561 my_entry.eof = 0;
562 my_entry.fh = nfs_alloc_fhandle();
563 my_entry.fattr = nfs_alloc_fattr();
564 if (my_entry.fh == NULL || my_entry.fattr == NULL)
565 goto out_alloc_failed;
566
567 desc->entry = &my_entry;
568
569 nfs_block_sillyrename(dentry); 808 nfs_block_sillyrename(dentry);
570 res = nfs_revalidate_mapping(inode, filp->f_mapping); 809 res = nfs_revalidate_mapping(inode, filp->f_mapping);
571 if (res < 0) 810 if (res < 0)
572 goto out; 811 goto out;
573 812
574 while(!desc->entry->eof) { 813 do {
575 res = readdir_search_pagecache(desc); 814 res = readdir_search_pagecache(desc);
576 815
577 if (res == -EBADCOOKIE) { 816 if (res == -EBADCOOKIE) {
817 res = 0;
578 /* This means either end of directory */ 818 /* This means either end of directory */
579 if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { 819 if (*desc->dir_cookie && desc->eof == 0) {
580 /* Or that the server has 'lost' a cookie */ 820 /* Or that the server has 'lost' a cookie */
581 res = uncached_readdir(desc, dirent, filldir); 821 res = uncached_readdir(desc, dirent, filldir);
582 if (res >= 0) 822 if (res == 0)
583 continue; 823 continue;
584 } 824 }
585 res = 0;
586 break; 825 break;
587 } 826 }
588 if (res == -ETOOSMALL && desc->plus) { 827 if (res == -ETOOSMALL && desc->plus) {
589 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 828 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
590 nfs_zap_caches(inode); 829 nfs_zap_caches(inode);
830 desc->page_index = 0;
591 desc->plus = 0; 831 desc->plus = 0;
592 desc->entry->eof = 0; 832 desc->eof = 0;
593 continue; 833 continue;
594 } 834 }
595 if (res < 0) 835 if (res < 0)
596 break; 836 break;
597 837
598 res = nfs_do_filldir(desc, dirent, filldir); 838 res = nfs_do_filldir(desc, dirent, filldir);
599 if (res < 0) { 839 if (res < 0)
600 res = 0;
601 break; 840 break;
602 } 841 } while (!desc->eof);
603 }
604out: 842out:
605 nfs_unblock_sillyrename(dentry); 843 nfs_unblock_sillyrename(dentry);
606 if (res > 0) 844 if (res > 0)
607 res = 0; 845 res = 0;
608out_alloc_failed:
609 nfs_free_fattr(my_entry.fattr);
610 nfs_free_fhandle(my_entry.fh);
611 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 846 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
612 dentry->d_parent->d_name.name, dentry->d_name.name, 847 dentry->d_parent->d_name.name, dentry->d_name.name,
613 res); 848 res);
@@ -703,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
703 * component of the path. 938 * component of the path.
704 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. 939 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
705 */ 940 */
706static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) 941static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
942 unsigned int mask)
707{ 943{
708 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) 944 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
709 return 0; 945 return 0;
@@ -734,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
734{ 970{
735 struct nfs_server *server = NFS_SERVER(inode); 971 struct nfs_server *server = NFS_SERVER(inode);
736 972
737 if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) 973 if (IS_AUTOMOUNT(inode))
738 return 0; 974 return 0;
739 if (nd != NULL) { 975 if (nd != NULL) {
740 /* VFS wants an on-the-wire revalidation */ 976 /* VFS wants an on-the-wire revalidation */
@@ -783,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
783 * If the parent directory is seen to have changed, we throw out the 1019 * If the parent directory is seen to have changed, we throw out the
784 * cached dentry and do a new lookup. 1020 * cached dentry and do a new lookup.
785 */ 1021 */
786static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) 1022static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
787{ 1023{
788 struct inode *dir; 1024 struct inode *dir;
789 struct inode *inode; 1025 struct inode *inode;
@@ -792,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
792 struct nfs_fattr *fattr = NULL; 1028 struct nfs_fattr *fattr = NULL;
793 int error; 1029 int error;
794 1030
1031 if (nd->flags & LOOKUP_RCU)
1032 return -ECHILD;
1033
795 parent = dget_parent(dentry); 1034 parent = dget_parent(dentry);
796 dir = parent->d_inode; 1035 dir = parent->d_inode;
797 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1036 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
@@ -882,7 +1121,7 @@ out_error:
882/* 1121/*
883 * This is called from dput() when d_count is going to 0. 1122 * This is called from dput() when d_count is going to 0.
884 */ 1123 */
885static int nfs_dentry_delete(struct dentry *dentry) 1124static int nfs_dentry_delete(const struct dentry *dentry)
886{ 1125{
887 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", 1126 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
888 dentry->d_parent->d_name.name, dentry->d_name.name, 1127 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -934,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
934 .d_revalidate = nfs_lookup_revalidate, 1173 .d_revalidate = nfs_lookup_revalidate,
935 .d_delete = nfs_dentry_delete, 1174 .d_delete = nfs_dentry_delete,
936 .d_iput = nfs_dentry_iput, 1175 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount,
937}; 1177};
938 1178
939static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -953,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
953 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1193 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
954 goto out; 1194 goto out;
955 1195
956 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
957
958 /* 1196 /*
959 * If we're doing an exclusive create, optimize away the lookup 1197 * If we're doing an exclusive create, optimize away the lookup
960 * but don't hash the dentry. 1198 * but don't hash the dentry.
@@ -982,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
982 goto out_unblock_sillyrename; 1220 goto out_unblock_sillyrename;
983 } 1221 }
984 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1222 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
985 res = (struct dentry *)inode; 1223 res = ERR_CAST(inode);
986 if (IS_ERR(res)) 1224 if (IS_ERR(res))
987 goto out_unblock_sillyrename; 1225 goto out_unblock_sillyrename;
988 1226
@@ -1009,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1009 .d_revalidate = nfs_open_revalidate, 1247 .d_revalidate = nfs_open_revalidate,
1010 .d_delete = nfs_dentry_delete, 1248 .d_delete = nfs_dentry_delete,
1011 .d_iput = nfs_dentry_iput, 1249 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount,
1012}; 1251};
1013 1252
1014/* 1253/*
@@ -1029,10 +1268,63 @@ static int is_atomic_open(struct nameidata *nd)
1029 return 1; 1268 return 1;
1030} 1269}
1031 1270
1271static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
1272{
1273 struct path path = {
1274 .mnt = nd->path.mnt,
1275 .dentry = dentry,
1276 };
1277 struct nfs_open_context *ctx;
1278 struct rpc_cred *cred;
1279 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1280
1281 cred = rpc_lookup_cred();
1282 if (IS_ERR(cred))
1283 return ERR_CAST(cred);
1284 ctx = alloc_nfs_open_context(&path, cred, fmode);
1285 put_rpccred(cred);
1286 if (ctx == NULL)
1287 return ERR_PTR(-ENOMEM);
1288 return ctx;
1289}
1290
1291static int do_open(struct inode *inode, struct file *filp)
1292{
1293 nfs_fscache_set_inode_cookie(inode, filp);
1294 return 0;
1295}
1296
1297static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
1298{
1299 struct file *filp;
1300 int ret = 0;
1301
1302 /* If the open_intent is for execute, we have an extra check to make */
1303 if (ctx->mode & FMODE_EXEC) {
1304 ret = nfs_may_open(ctx->path.dentry->d_inode,
1305 ctx->cred,
1306 nd->intent.open.flags);
1307 if (ret < 0)
1308 goto out;
1309 }
1310 filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
1311 if (IS_ERR(filp))
1312 ret = PTR_ERR(filp);
1313 else
1314 nfs_file_set_open_context(filp, ctx);
1315out:
1316 put_nfs_open_context(ctx);
1317 return ret;
1318}
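
Everything in the NFSv4 open-intent path now flows through an nfs_open_context: it bundles the path, an RPC credential, and the fmode bits taken from the intent, and nfs_intent_set_file() always drops the lookup's reference once the new file has taken its own. A user-space model of that refcount hand-off (a bare counter stands in for the kref and credential machinery):

#include <stdio.h>
#include <stdlib.h>

struct open_context { int refcount; int fmode; };  /* stand-in for nfs_open_context */

static struct open_context *alloc_context(int fmode)
{
	struct open_context *ctx = malloc(sizeof(*ctx));

	if (ctx != NULL) {
		ctx->refcount = 1;      /* the lookup's reference */
		ctx->fmode = fmode;
	}
	return ctx;
}

static struct open_context *get_context(struct open_context *ctx)
{
	ctx->refcount++;
	return ctx;
}

static void put_context(struct open_context *ctx)
{
	if (--ctx->refcount == 0)
		free(ctx);
}

/* cf. nfs_intent_set_file(): the file takes its own reference on success,
 * and the lookup's reference is dropped on every path. */
static int intent_set_file(struct open_context *ctx, int may_open_ok,
			   struct open_context **filp_ctx)
{
	int ret = 0;

	if (may_open_ok)
		*filp_ctx = get_context(ctx);
	else
		ret = -13;              /* -EACCES from the FMODE_EXEC check */
	put_context(ctx);
	return ret;
}

int main(void)
{
	struct open_context *file_ctx = NULL;
	struct open_context *ctx = alloc_context(1 /* FMODE_READ */);

	if (ctx == NULL)
		return 1;
	if (intent_set_file(ctx, 1, &file_ctx) == 0)
		printf("file owns the context, fmode=%d\n", file_ctx->fmode);
	if (file_ctx != NULL)
		put_context(file_ctx);  /* dropped again at close time */
	return 0;
}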
1319
1032static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1320static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1033{ 1321{
1322 struct nfs_open_context *ctx;
1323 struct iattr attr;
1034 struct dentry *res = NULL; 1324 struct dentry *res = NULL;
1035 int error; 1325 struct inode *inode;
1326 int open_flags;
1327 int err;
1036 1328
1037 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", 1329 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
1038 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1330 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1045,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1045 res = ERR_PTR(-ENAMETOOLONG); 1337 res = ERR_PTR(-ENAMETOOLONG);
1046 goto out; 1338 goto out;
1047 } 1339 }
1048 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1049 1340
1050 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1341 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
1051 * the dentry. */ 1342 * the dentry. */
@@ -1054,29 +1345,61 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1054 goto out; 1345 goto out;
1055 } 1346 }
1056 1347
1348 ctx = nameidata_to_nfs_open_context(dentry, nd);
1349 res = ERR_CAST(ctx);
1350 if (IS_ERR(ctx))
1351 goto out;
1352
1353 open_flags = nd->intent.open.flags;
1354 if (nd->flags & LOOKUP_CREATE) {
1355 attr.ia_mode = nd->intent.open.create_mode;
1356 attr.ia_valid = ATTR_MODE;
1357 attr.ia_mode &= ~current_umask();
1358 } else {
1359 open_flags &= ~(O_EXCL | O_CREAT);
1360 attr.ia_valid = 0;
1361 }
1362
1057 /* Open the file on the server */ 1363 /* Open the file on the server */
1058 res = nfs4_atomic_open(dir, dentry, nd); 1364 nfs_block_sillyrename(dentry->d_parent);
1059 if (IS_ERR(res)) { 1365 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1060 error = PTR_ERR(res); 1366 if (IS_ERR(inode)) {
1061 switch (error) { 1367 nfs_unblock_sillyrename(dentry->d_parent);
1368 put_nfs_open_context(ctx);
1369 switch (PTR_ERR(inode)) {
1062 /* Make a negative dentry */ 1370 /* Make a negative dentry */
1063 case -ENOENT: 1371 case -ENOENT:
1372 d_add(dentry, NULL);
1064 res = NULL; 1373 res = NULL;
1065 goto out; 1374 goto out;
1066 /* This turned out not to be a regular file */ 1375 /* This turned out not to be a regular file */
1067 case -EISDIR:
1068 case -ENOTDIR: 1376 case -ENOTDIR:
1069 goto no_open; 1377 goto no_open;
1070 case -ELOOP: 1378 case -ELOOP:
1071 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1379 if (!(nd->intent.open.flags & O_NOFOLLOW))
1072 goto no_open; 1380 goto no_open;
1381 /* case -EISDIR: */
1073 /* case -EINVAL: */ 1382 /* case -EINVAL: */
1074 default: 1383 default:
1384 res = ERR_CAST(inode);
1075 goto out; 1385 goto out;
1076 } 1386 }
1077 } else if (res != NULL) 1387 }
1388 res = d_add_unique(dentry, inode);
1389 nfs_unblock_sillyrename(dentry->d_parent);
1390 if (res != NULL) {
1391 dput(ctx->path.dentry);
1392 ctx->path.dentry = dget(res);
1078 dentry = res; 1393 dentry = res;
1394 }
1395 err = nfs_intent_set_file(nd, ctx);
1396 if (err < 0) {
1397 if (res != NULL)
1398 dput(res);
1399 return ERR_PTR(err);
1400 }
1079out: 1401out:
1402 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1080 return res; 1403 return res;
1081no_open: 1404no_open:
1082 return nfs_lookup(dir, dentry, nd); 1405 return nfs_lookup(dir, dentry, nd);
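
The switch on PTR_ERR(inode) above encodes the fallback policy of the new atomic open: -ENOENT instantiates a negative dentry, -ENOTDIR (and -ELOOP without O_NOFOLLOW) abandons the atomic open and retries as a plain nfs_lookup(), and everything else, now including -EISDIR, is passed back to the VFS. A small model of the dispatch (the O_NOFOLLOW value is illustrative):

#include <stdio.h>
#include <errno.h>

#define O_NOFOLLOW 0400000   /* Linux value; illustrative only */

enum action { NEGATIVE_DENTRY, FALL_BACK_TO_LOOKUP, RETURN_ERROR };

/* cf. the switch on PTR_ERR(inode) in nfs_atomic_lookup() */
static enum action dispatch(int err, int open_flags)
{
	switch (err) {
	case -ENOENT:
		return NEGATIVE_DENTRY;        /* d_add(dentry, NULL) */
	case -ENOTDIR:
		return FALL_BACK_TO_LOOKUP;    /* goto no_open */
	case -ELOOP:
		if (!(open_flags & O_NOFOLLOW))
			return FALL_BACK_TO_LOOKUP;
		/* fall through */
	default:                               /* -EISDIR, -EINVAL, ... */
		return RETURN_ERROR;
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       dispatch(-ENOENT, 0),          /* 0: negative dentry */
	       dispatch(-ENOTDIR, 0),         /* 1: fall back to nfs_lookup() */
	       dispatch(-EISDIR, 0));         /* 2: error back to the VFS */
	return 0;
}
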
@@ -1085,14 +1408,21 @@ no_open:
1085static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) 1408static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1086{ 1409{
1087 struct dentry *parent = NULL; 1410 struct dentry *parent = NULL;
1088 struct inode *inode = dentry->d_inode; 1411 struct inode *inode;
1089 struct inode *dir; 1412 struct inode *dir;
1413 struct nfs_open_context *ctx;
1090 int openflags, ret = 0; 1414 int openflags, ret = 0;
1091 1415
1416 if (nd->flags & LOOKUP_RCU)
1417 return -ECHILD;
1418
1419 inode = dentry->d_inode;
1092 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1420 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1093 goto no_open; 1421 goto no_open;
1422
1094 parent = dget_parent(dentry); 1423 parent = dget_parent(dentry);
1095 dir = parent->d_inode; 1424 dir = parent->d_inode;
1425
1096 /* We can't create new files in nfs_open_revalidate(), so we 1426 /* We can't create new files in nfs_open_revalidate(), so we
1097 * optimize away revalidation of negative dentries. 1427 * optimize away revalidation of negative dentries.
1098 */ 1428 */
@@ -1112,99 +1442,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1112 /* We can't create new files, or truncate existing ones here */ 1442 /* We can't create new files, or truncate existing ones here */
1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1443 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1114 1444
1445 ctx = nameidata_to_nfs_open_context(dentry, nd);
1446 ret = PTR_ERR(ctx);
1447 if (IS_ERR(ctx))
1448 goto out;
1115 /* 1449 /*
1116 * Note: we're not holding inode->i_mutex and so may be racing with 1450 * Note: we're not holding inode->i_mutex and so may be racing with
1117 * operations that change the directory. We therefore save the 1451 * operations that change the directory. We therefore save the
1118 * change attribute *before* we do the RPC call. 1452 * change attribute *before* we do the RPC call.
1119 */ 1453 */
1120 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1454 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
1455 if (IS_ERR(inode)) {
1456 ret = PTR_ERR(inode);
1457 switch (ret) {
1458 case -EPERM:
1459 case -EACCES:
1460 case -EDQUOT:
1461 case -ENOSPC:
1462 case -EROFS:
1463 goto out_put_ctx;
1464 default:
1465 goto out_drop;
1466 }
1467 }
1468 iput(inode);
1469 if (inode != dentry->d_inode)
1470 goto out_drop;
1471
1472 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1473 ret = nfs_intent_set_file(nd, ctx);
1474 if (ret >= 0)
1475 ret = 1;
1121out: 1476out:
1122 dput(parent); 1477 dput(parent);
1123 if (!ret)
1124 d_drop(dentry);
1125 return ret; 1478 return ret;
1479out_drop:
1480 d_drop(dentry);
1481 ret = 0;
1482out_put_ctx:
1483 put_nfs_open_context(ctx);
1484 goto out;
1485
1126no_open_dput: 1486no_open_dput:
1127 dput(parent); 1487 dput(parent);
1128no_open: 1488no_open:
1129 return nfs_lookup_revalidate(dentry, nd); 1489 return nfs_lookup_revalidate(dentry, nd);
1130} 1490}
1131#endif /* CONFIG_NFS_V4 */
1132 1491
1133static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) 1492static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1493 struct nameidata *nd)
1134{ 1494{
1135 struct dentry *parent = desc->file->f_path.dentry; 1495 struct nfs_open_context *ctx = NULL;
1136 struct inode *dir = parent->d_inode; 1496 struct iattr attr;
1137 struct nfs_entry *entry = desc->entry; 1497 int error;
1138 struct dentry *dentry, *alias; 1498 int open_flags = 0;
1139 struct qstr name = {
1140 .name = entry->name,
1141 .len = entry->len,
1142 };
1143 struct inode *inode;
1144 unsigned long verf = nfs_save_change_attribute(dir);
1145 1499
1146 switch (name.len) { 1500 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1147 case 2: 1501 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1148 if (name.name[0] == '.' && name.name[1] == '.')
1149 return dget_parent(parent);
1150 break;
1151 case 1:
1152 if (name.name[0] == '.')
1153 return dget(parent);
1154 }
1155 1502
1156 spin_lock(&dir->i_lock); 1503 attr.ia_mode = mode;
1157 if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { 1504 attr.ia_valid = ATTR_MODE;
1158 spin_unlock(&dir->i_lock);
1159 return NULL;
1160 }
1161 spin_unlock(&dir->i_lock);
1162 1505
1163 name.hash = full_name_hash(name.name, name.len); 1506 if ((nd->flags & LOOKUP_CREATE) != 0) {
1164 dentry = d_lookup(parent, &name); 1507 open_flags = nd->intent.open.flags;
1165 if (dentry != NULL) {
1166 /* Is this a positive dentry that matches the readdir info? */
1167 if (dentry->d_inode != NULL &&
1168 (NFS_FILEID(dentry->d_inode) == entry->ino ||
1169 d_mountpoint(dentry))) {
1170 if (!desc->plus || entry->fh->size == 0)
1171 return dentry;
1172 if (nfs_compare_fh(NFS_FH(dentry->d_inode),
1173 entry->fh) == 0)
1174 goto out_renew;
1175 }
1176 /* No, so d_drop to allow one to be created */
1177 d_drop(dentry);
1178 dput(dentry);
1179 }
1180 if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
1181 return NULL;
1182 if (name.len > NFS_SERVER(dir)->namelen)
1183 return NULL;
1184 /* Note: caller is already holding the dir->i_mutex! */
1185 dentry = d_alloc(parent, &name);
1186 if (dentry == NULL)
1187 return NULL;
1188 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1189 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
1190 if (IS_ERR(inode)) {
1191 dput(dentry);
1192 return NULL;
1193 }
1194 1508
1195 alias = d_materialise_unique(dentry, inode); 1509 ctx = nameidata_to_nfs_open_context(dentry, nd);
1196 if (alias != NULL) { 1510 error = PTR_ERR(ctx);
1197 dput(dentry); 1511 if (IS_ERR(ctx))
1198 if (IS_ERR(alias)) 1512 goto out_err_drop;
1199 return NULL;
1200 dentry = alias;
1201 } 1513 }
1202 1514
1203out_renew: 1515 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
1204 nfs_set_verifier(dentry, verf); 1516 if (error != 0)
1205 return dentry; 1517 goto out_put_ctx;
1518 if (ctx != NULL) {
1519 error = nfs_intent_set_file(nd, ctx);
1520 if (error < 0)
1521 goto out_err;
1522 }
1523 return 0;
1524out_put_ctx:
1525 if (ctx != NULL)
1526 put_nfs_open_context(ctx);
1527out_err_drop:
1528 d_drop(dentry);
1529out_err:
1530 return error;
1206} 1531}
1207 1532
1533#endif /* CONFIG_NFSV4 */
1534
1208/* 1535/*
1209 * Code common to create, mkdir, and mknod. 1536 * Code common to create, mkdir, and mknod.
1210 */ 1537 */
@@ -1269,7 +1596,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1269 if ((nd->flags & LOOKUP_CREATE) != 0) 1596 if ((nd->flags & LOOKUP_CREATE) != 0)
1270 open_flags = nd->intent.open.flags; 1597 open_flags = nd->intent.open.flags;
1271 1598
1272 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); 1599 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
1273 if (error != 0) 1600 if (error != 0)
1274 goto out_err; 1601 goto out_err;
1275 return 0; 1602 return 0;
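Both the revalidate and create paths above lean on the kernel's error-pointer convention: nameidata_to_nfs_open_context() returns either a valid context or an errno encoded in the pointer itself. A minimal self-contained sketch of that convention, using a hypothetical struct foo:

#include <linux/err.h>
#include <linux/slab.h>

struct foo { int dummy; };

static struct foo *foo_alloc(void)
{
	struct foo *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (p == NULL)
		return ERR_PTR(-ENOMEM);	/* encode errno in the pointer */
	return p;
}

static int foo_use(void)
{
	struct foo *p = foo_alloc();

	if (IS_ERR(p))
		return PTR_ERR(p);		/* recover the -ENOMEM */
	kfree(p);
	return 0;
}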
@@ -1351,76 +1678,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1351 return error; 1678 return error;
1352} 1679}
1353 1680
1354static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1355{
1356 static unsigned int sillycounter;
1357 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
1358 const int countersize = sizeof(sillycounter)*2;
1359 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
1360 char silly[slen+1];
1361 struct qstr qsilly;
1362 struct dentry *sdentry;
1363 int error = -EIO;
1364
1365 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
1366 dentry->d_parent->d_name.name, dentry->d_name.name,
1367 atomic_read(&dentry->d_count));
1368 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
1369
1370 /*
1371 * We don't allow a dentry to be silly-renamed twice.
1372 */
1373 error = -EBUSY;
1374 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1375 goto out;
1376
1377 sprintf(silly, ".nfs%*.*Lx",
1378 fileidsize, fileidsize,
1379 (unsigned long long)NFS_FILEID(dentry->d_inode));
1380
1381 /* Return delegation in anticipation of the rename */
1382 nfs_inode_return_delegation(dentry->d_inode);
1383
1384 sdentry = NULL;
1385 do {
1386 char *suffix = silly + slen - countersize;
1387
1388 dput(sdentry);
1389 sillycounter++;
1390 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
1391
1392 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
1393 dentry->d_name.name, silly);
1394
1395 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
1396 /*
1397 * N.B. Better to return EBUSY here ... it could be
1398 * dangerous to delete the file while it's in use.
1399 */
1400 if (IS_ERR(sdentry))
1401 goto out;
1402 } while(sdentry->d_inode != NULL); /* need negative lookup */
1403
1404 qsilly.name = silly;
1405 qsilly.len = strlen(silly);
1406 if (dentry->d_inode) {
1407 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1408 dir, &qsilly);
1409 nfs_mark_for_revalidate(dentry->d_inode);
1410 } else
1411 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1412 dir, &qsilly);
1413 if (!error) {
1414 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1415 d_move(dentry, sdentry);
1416 error = nfs_async_unlink(dir, dentry);
1417 /* If we return 0 we don't unlink */
1418 }
1419 dput(sdentry);
1420out:
1421 return error;
1422}
1423
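For reference, the name format the deleted helper generated (the surrounding series appears to relocate silly-rename handling rather than abandon it): with a 64-bit fileid, fileidsize is 16 hex digits and countersize is 8, so slen = sizeof(".nfs") + 16 + 8 - 1 = 28. A worked example for fileid 0x2af and counter 0x1c:

	char silly[29];		/* slen = 28, plus the NUL */

	sprintf(silly, ".nfs%*.*Lx", 16, 16, (unsigned long long)0x2af);
	/* silly == ".nfs00000000000002af" */
	sprintf(silly + 28 - 8, "%*.*x", 8, 8, 0x1c);
	/* silly == ".nfs00000000000002af0000001c", 28 characters */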
1424/* 1681/*
1425 * Remove a file after making sure there are no pending writes, 1682 * Remove a file after making sure there are no pending writes,
1426 * and after checking that the file has only one user. 1683 * and after checking that the file has only one user.
@@ -1471,11 +1728,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1471 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1728 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1472 dir->i_ino, dentry->d_name.name); 1729 dir->i_ino, dentry->d_name.name);
1473 1730
1474 spin_lock(&dcache_lock);
1475 spin_lock(&dentry->d_lock); 1731 spin_lock(&dentry->d_lock);
1476 if (atomic_read(&dentry->d_count) > 1) { 1732 if (dentry->d_count > 1) {
1477 spin_unlock(&dentry->d_lock); 1733 spin_unlock(&dentry->d_lock);
1478 spin_unlock(&dcache_lock);
1479 /* Start asynchronous writeout of the inode */ 1734 /* Start asynchronous writeout of the inode */
1480 write_inode_now(dentry->d_inode, 0); 1735 write_inode_now(dentry->d_inode, 0);
1481 error = nfs_sillyrename(dir, dentry); 1736 error = nfs_sillyrename(dir, dentry);
@@ -1486,7 +1741,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1486 need_rehash = 1; 1741 need_rehash = 1;
1487 } 1742 }
1488 spin_unlock(&dentry->d_lock); 1743 spin_unlock(&dentry->d_lock);
1489 spin_unlock(&dcache_lock);
1490 error = nfs_safe_remove(dentry); 1744 error = nfs_safe_remove(dentry);
1491 if (!error || error == -ENOENT) { 1745 if (!error || error == -ENOENT) {
1492 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1746 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
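This hunk tracks the dcache-scalability work: d_count is no longer an atomic_t guarded by the global dcache_lock but a plain integer protected by the per-dentry d_lock. A minimal sketch of the new rule, assuming the post-dcache_lock dentry layout:

static int dentry_shared(struct dentry *dentry)
{
	int busy;

	spin_lock(&dentry->d_lock);
	busy = dentry->d_count > 1;	/* our ref plus at least one other */
	spin_unlock(&dentry->d_lock);
	return busy;
}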
@@ -1580,7 +1834,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1580 d_drop(dentry); 1834 d_drop(dentry);
1581 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1835 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1582 if (error == 0) { 1836 if (error == 0) {
1583 atomic_inc(&inode->i_count); 1837 ihold(inode);
1584 d_add(dentry, inode); 1838 d_add(dentry, inode);
1585 } 1839 }
1586 return error; 1840 return error;
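ihold() is the sanctioned replacement for open-coded atomic_inc(&inode->i_count); my understanding is that it warns if the inode had no references left, so it is only valid while the caller already owns one, as nfs_link() does here. Sketch of the pairing:

	ihold(inode);		/* take an extra reference for the new dentry */
	d_add(dentry, inode);	/* dentry now owns that reference;
				 * the eventual dput() will iput() it */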
@@ -1621,7 +1875,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1621 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1875 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1622 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1876 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1623 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1877 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1624 atomic_read(&new_dentry->d_count)); 1878 new_dentry->d_count);
1625 1879
1626 /* 1880 /*
1627 * For non-directories, check whether the target is busy and if so, 1881 * For non-directories, check whether the target is busy and if so,
@@ -1639,7 +1893,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1639 rehash = new_dentry; 1893 rehash = new_dentry;
1640 } 1894 }
1641 1895
1642 if (atomic_read(&new_dentry->d_count) > 2) { 1896 if (new_dentry->d_count > 2) {
1643 int err; 1897 int err;
1644 1898
1645 /* copy the target dentry's name */ 1899 /* copy the target dentry's name */
@@ -1711,14 +1965,14 @@ static void nfs_access_free_list(struct list_head *head)
1711int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 1965int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1712{ 1966{
1713 LIST_HEAD(head); 1967 LIST_HEAD(head);
1714 struct nfs_inode *nfsi; 1968 struct nfs_inode *nfsi, *next;
1715 struct nfs_access_entry *cache; 1969 struct nfs_access_entry *cache;
1716 1970
1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1971 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1; 1972 return (nr_to_scan == 0) ? 0 : -1;
1719 1973
1720 spin_lock(&nfs_access_lru_lock); 1974 spin_lock(&nfs_access_lru_lock);
1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1975 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1722 struct inode *inode; 1976 struct inode *inode;
1723 1977
1724 if (nr_to_scan-- == 0) 1978 if (nr_to_scan-- == 0)
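The switch to list_for_each_entry_safe() matters because the loop body can now unlink the current inode from the LRU: the _safe variant caches the next node before the body runs. A self-contained sketch with a hypothetical item type:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
	struct list_head link;
};

static void drain(struct list_head *head)
{
	struct item *it, *next;

	list_for_each_entry_safe(it, next, head, link) {
		list_del(&it->link);	/* safe: "next" was already sampled */
		kfree(it);
	}
}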
@@ -1941,11 +2195,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
1941 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2195 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
1942} 2196}
1943 2197
1944int nfs_permission(struct inode *inode, int mask) 2198int nfs_permission(struct inode *inode, int mask, unsigned int flags)
1945{ 2199{
1946 struct rpc_cred *cred; 2200 struct rpc_cred *cred;
1947 int res = 0; 2201 int res = 0;
1948 2202
2203 if (flags & IPERM_FLAG_RCU)
2204 return -ECHILD;
2205
1949 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2206 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
1950 2207
1951 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2208 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -1993,7 +2250,7 @@ out:
1993out_notsup: 2250out_notsup:
1994 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2251 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
1995 if (res == 0) 2252 if (res == 0)
1996 res = generic_permission(inode, mask, NULL); 2253 res = generic_permission(inode, mask, flags, NULL);
1997 goto out; 2254 goto out;
1998} 2255}
1999 2256
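Both permission hunks follow the RCU-walk contract introduced with lockless path lookup: when called with IPERM_FLAG_RCU (or, for ->d_revalidate(), LOOKUP_RCU) the callee may not sleep, take references, or issue RPCs, and must return -ECHILD so the VFS retries in ref-walk mode. The shape of a compliant handler:

static int example_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;	/* cannot block here; retry in ref-walk */

	/* slow path: may sleep, take references, go to the server */
	return generic_permission(inode, mask, flags, NULL);
}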
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 064a80961677..9943a75bb6d1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
407 pos += vec->iov_len; 407 pos += vec->iov_len;
408 } 408 }
409 409
410 /*
411 * If no bytes were started, return the error, and let the
412 * generic layer handle the completion.
413 */
414 if (requested_bytes == 0) {
415 nfs_direct_req_release(dreq);
416 return result < 0 ? result : -EIO;
417 }
418
410 if (put_dreq(dreq)) 419 if (put_dreq(dreq))
411 nfs_direct_complete(dreq); 420 nfs_direct_complete(dreq);
412 421 return 0;
413 if (requested_bytes != 0)
414 return 0;
415
416 if (result < 0)
417 return result;
418 return -EIO;
419} 422}
420 423
421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 424static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
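The restructuring above hinges on the dreq reference counting (a sketch from my reading of the surrounding code: the scheduler holds one reference, each dispatched RPC another, completions call put_dreq(), and whoever drops the count to zero runs the final completion). With zero RPCs dispatched no completion will ever fire, so the scheduler must release the request itself and report the error synchronously:

	/* hedged paraphrase of the new control flow */
	if (requested_bytes == 0) {		/* nothing in flight */
		nfs_direct_req_release(dreq);	/* drop the only reference */
		return result < 0 ? result : -EIO;
	}
	if (put_dreq(dreq))			/* drop the scheduler's ref */
		nfs_direct_complete(dreq);	/* we were the last holder */
	return 0;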
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
841 pos += vec->iov_len; 844 pos += vec->iov_len;
842 } 845 }
843 846
847 /*
848 * If no bytes were started, return the error, and let the
849 * generic layer handle the completion.
850 */
851 if (requested_bytes == 0) {
852 nfs_direct_req_release(dreq);
853 return result < 0 ? result : -EIO;
854 }
855
844 if (put_dreq(dreq)) 856 if (put_dreq(dreq))
845 nfs_direct_write_complete(dreq, dreq->inode); 857 nfs_direct_write_complete(dreq, dreq->inode);
846 858 return 0;
847 if (requested_bytes != 0)
848 return 0;
849
850 if (result < 0)
851 return result;
852 return -EIO;
853} 859}
854 860
855static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 861static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
@@ -867,13 +873,13 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
867 goto out; 873 goto out;
868 nfs_alloc_commit_data(dreq); 874 nfs_alloc_commit_data(dreq);
869 875
870 if (dreq->commit_data == NULL || count < wsize) 876 if (dreq->commit_data == NULL || count <= wsize)
871 sync = NFS_FILE_SYNC; 877 sync = NFS_FILE_SYNC;
872 878
873 dreq->inode = inode; 879 dreq->inode = inode;
874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 880 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 881 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
876 if (dreq->l_ctx != NULL) 882 if (dreq->l_ctx == NULL)
877 goto out_release; 883 goto out_release;
878 if (!is_sync_kiocb(iocb)) 884 if (!is_sync_kiocb(iocb))
879 dreq->iocb = iocb; 885 dreq->iocb = iocb;
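The last hunk is a straight bug fix: nfs_get_lock_context() returns the lock context on success and NULL on allocation failure, so the old test had its sense inverted, aborting every successful call through out_release while pressing on with a NULL lock context on failure. The corrected idiom:

	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
	if (dreq->l_ctx == NULL)	/* allocation failed */
		goto out_release;	/* error path, not success path */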
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
167 return 0; 167 return 0;
168 } 168 }
169 item = container_of(h, struct nfs_dns_ent, h); 169 item = container_of(h, struct nfs_dns_ent, h);
170 ttl = (long)item->h.expiry_time - (long)get_seconds(); 170 ttl = item->h.expiry_time - seconds_since_boot();
171 if (ttl < 0) 171 if (ttl < 0)
172 ttl = 0; 172 ttl = 0;
173 173
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
239 ttl = get_expiry(&buf); 239 ttl = get_expiry(&buf);
240 if (ttl == 0) 240 if (ttl == 0)
241 goto out; 241 goto out;
242 key.h.expiry_time = ttl + get_seconds(); 242 key.h.expiry_time = ttl + seconds_since_boot();
243 243
244 ret = -ENOMEM; 244 ret = -ENOMEM;
245 item = nfs_dns_lookup(cd, &key); 245 item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
301 goto out_err; 301 goto out_err;
302 ret = -ETIMEDOUT; 302 ret = -ETIMEDOUT;
303 if (!test_bit(CACHE_VALID, &(*item)->h.flags) 303 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
304 || (*item)->h.expiry_time < get_seconds() 304 || (*item)->h.expiry_time < seconds_since_boot()
305 || cd->flush_time > (*item)->h.last_refresh) 305 || cd->flush_time > (*item)->h.last_refresh)
306 goto out_put; 306 goto out_put;
307 ret = -ENOENT; 307 ret = -ENOENT;
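The clock swap above guards cache TTLs against wall-clock steps: get_seconds() jumps under settimeofday() or NTP corrections, which could expire entries early or keep them alive indefinitely. seconds_since_boot() counts elapsed time since boot instead; a sketch of what such a helper boils down to:

static unsigned long example_seconds_since_boot(void)
{
	struct timespec boot;

	getboottime(&boot);			/* wall-clock time at boot */
	return get_seconds() - boot.tv_sec;	/* elapsed since boot */
}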
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05bf3c0dc751..7bf029ef4084 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
41 42
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
386 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
387 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
388 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
389start: 394start:
390 /* 395 /*
391 * Prevent starvation issues if someone is doing a consistency 396 * Prevent starvation issues if someone is doing a consistency
@@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
551 struct file *filp = vma->vm_file; 556 struct file *filp = vma->vm_file;
552 struct dentry *dentry = filp->f_path.dentry; 557 struct dentry *dentry = filp->f_path.dentry;
553 unsigned pagelen; 558 unsigned pagelen;
554 int ret = -EINVAL; 559 int ret = VM_FAULT_NOPAGE;
555 struct address_space *mapping; 560 struct address_space *mapping;
556 561
557 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 562 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
567 if (mapping != dentry->d_inode->i_mapping) 572 if (mapping != dentry->d_inode->i_mapping)
568 goto out_unlock; 573 goto out_unlock;
569 574
570 ret = 0;
571 pagelen = nfs_page_length(page); 575 pagelen = nfs_page_length(page);
572 if (pagelen == 0) 576 if (pagelen == 0)
573 goto out_unlock; 577 goto out_unlock;
574 578
575 ret = nfs_flush_incompatible(filp, page); 579 ret = VM_FAULT_LOCKED;
576 if (ret != 0) 580 if (nfs_flush_incompatible(filp, page) == 0 &&
577 goto out_unlock; 581 nfs_updatepage(filp, page, 0, pagelen) == 0)
582 goto out;
578 583
579 ret = nfs_updatepage(filp, page, 0, pagelen); 584 ret = VM_FAULT_SIGBUS;
580out_unlock: 585out_unlock:
581 if (!ret)
582 return VM_FAULT_LOCKED;
583 unlock_page(page); 586 unlock_page(page);
584 return VM_FAULT_SIGBUS; 587out:
588 return ret;
585} 589}
586 590
587static const struct vm_operations_struct nfs_file_vm_ops = { 591static const struct vm_operations_struct nfs_file_vm_ops = {
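This rewrite aligns nfs_vm_page_mkwrite() with the ->page_mkwrite() calling convention: the handler returns VM_FAULT_* codes, not errnos, and VM_FAULT_LOCKED specifically means the page is handed back still locked. Skeleton of a compliant handler, assuming the era-appropriate signature:

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (page->mapping == NULL) {	/* truncated while we slept */
		unlock_page(page);
		return VM_FAULT_NOPAGE;	/* let the fault be retried */
	}
	/* ... make the page writable / dirty it ... */
	return VM_FAULT_LOCKED;		/* page returned locked */
}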
@@ -684,10 +688,12 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
684 return ret; 688 return ret;
685} 689}
686 690
687static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 691static int
692do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
688{ 693{
689 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
690 int status = 0; 695 int status = 0;
696 unsigned int saved_type = fl->fl_type;
691 697
692 /* Try local locking first */ 698 /* Try local locking first */
693 posix_test_lock(filp, fl); 699 posix_test_lock(filp, fl);
@@ -695,11 +701,12 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
695 /* found a conflict */ 701 /* found a conflict */
696 goto out; 702 goto out;
697 } 703 }
704 fl->fl_type = saved_type;
698 705
699 if (nfs_have_delegation(inode, FMODE_READ)) 706 if (nfs_have_delegation(inode, FMODE_READ))
700 goto out_noconflict; 707 goto out_noconflict;
701 708
702 if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) 709 if (is_local)
703 goto out_noconflict; 710 goto out_noconflict;
704 711
705 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 712 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -726,7 +733,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
726 return res; 733 return res;
727} 734}
728 735
729static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) 736static int
737do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
730{ 738{
731 struct inode *inode = filp->f_mapping->host; 739 struct inode *inode = filp->f_mapping->host;
732 int status; 740 int status;
@@ -741,15 +749,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
741 * If we're signalled while cleaning up locks on process exit, we 749 * If we're signalled while cleaning up locks on process exit, we
742 * still need to complete the unlock. 750 * still need to complete the unlock.
743 */ 751 */
744 /* Use local locking if mounted with "-onolock" */ 752 /*
745 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 753 * Use local locking if mounted with "-onolock" or with appropriate
754 * "-olocal_lock="
755 */
756 if (!is_local)
746 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 757 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
747 else 758 else
748 status = do_vfs_lock(filp, fl); 759 status = do_vfs_lock(filp, fl);
749 return status; 760 return status;
750} 761}
751 762
752static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) 763static int
764is_time_granular(struct timespec *ts) {
765 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
766}
767
768static int
769do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
753{ 770{
754 struct inode *inode = filp->f_mapping->host; 771 struct inode *inode = filp->f_mapping->host;
755 int status; 772 int status;
@@ -762,20 +779,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
762 if (status != 0) 779 if (status != 0)
763 goto out; 780 goto out;
764 781
765 /* Use local locking if mounted with "-onolock" */ 782 /*
766 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 783 * Use local locking if mounted with "-onolock" or with appropriate
784 * "-olocal_lock="
785 */
786 if (!is_local)
767 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 787 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
768 else 788 else
769 status = do_vfs_lock(filp, fl); 789 status = do_vfs_lock(filp, fl);
770 if (status < 0) 790 if (status < 0)
771 goto out; 791 goto out;
792
772 /* 793 /*
773 * Make sure we clear the cache whenever we try to get the lock. 794 * Revalidate the cache if the server has time stamps granular
795 * enough to detect subsecond changes. Otherwise, clear the
796 * cache to prevent missing any changes.
797 *
774 * This makes locking act as a cache coherency point. 798 * This makes locking act as a cache coherency point.
775 */ 799 */
776 nfs_sync_mapping(filp->f_mapping); 800 nfs_sync_mapping(filp->f_mapping);
777 if (!nfs_have_delegation(inode, FMODE_READ)) 801 if (!nfs_have_delegation(inode, FMODE_READ)) {
778 nfs_zap_caches(inode); 802 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
803 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
804 else
805 nfs_zap_caches(inode);
806 }
779out: 807out:
780 return status; 808 return status;
781} 809}
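The new gate trades a full cache zap for a cheap attribute revalidation when the server's advertised timestamp granularity (time_delta) is one microsecond or finer, since such timestamps are precise enough to expose sub-second changes. Illustrative values:

	struct timespec usec = { .tv_sec = 0, .tv_nsec = 1000 };	/* 1 us */
	struct timespec sec  = { .tv_sec = 1, .tv_nsec = 0 };		/* 1 s  */

	is_time_granular(&usec);	/* 1 -> revalidate attributes */
	is_time_granular(&sec);		/* 0 -> zap caches as before  */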
@@ -787,6 +815,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
787{ 815{
788 struct inode *inode = filp->f_mapping->host; 816 struct inode *inode = filp->f_mapping->host;
789 int ret = -ENOLCK; 817 int ret = -ENOLCK;
818 int is_local = 0;
790 819
791 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 820 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
792 filp->f_path.dentry->d_parent->d_name.name, 821 filp->f_path.dentry->d_parent->d_name.name,
@@ -800,6 +829,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
800 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 829 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
801 goto out_err; 830 goto out_err;
802 831
832 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
833 is_local = 1;
834
803 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 835 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
804 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 836 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
805 if (ret < 0) 837 if (ret < 0)
@@ -807,11 +839,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
807 } 839 }
808 840
809 if (IS_GETLK(cmd)) 841 if (IS_GETLK(cmd))
810 ret = do_getlk(filp, cmd, fl); 842 ret = do_getlk(filp, cmd, fl, is_local);
811 else if (fl->fl_type == F_UNLCK) 843 else if (fl->fl_type == F_UNLCK)
812 ret = do_unlk(filp, cmd, fl); 844 ret = do_unlk(filp, cmd, fl, is_local);
813 else 845 else
814 ret = do_setlk(filp, cmd, fl); 846 ret = do_setlk(filp, cmd, fl, is_local);
815out_err: 847out_err:
816 return ret; 848 return ret;
817} 849}
@@ -821,6 +853,9 @@ out_err:
821 */ 853 */
822static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 854static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
823{ 855{
856 struct inode *inode = filp->f_mapping->host;
857 int is_local = 0;
858
824 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 859 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
825 filp->f_path.dentry->d_parent->d_name.name, 860 filp->f_path.dentry->d_parent->d_name.name,
826 filp->f_path.dentry->d_name.name, 861 filp->f_path.dentry->d_name.name,
@@ -829,14 +864,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
829 if (!(fl->fl_flags & FL_FLOCK)) 864 if (!(fl->fl_flags & FL_FLOCK))
830 return -ENOLCK; 865 return -ENOLCK;
831 866
867 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
868 is_local = 1;
869
832 /* We're simulating flock() locks using posix locks on the server */ 870 /* We're simulating flock() locks using posix locks on the server */
833 fl->fl_owner = (fl_owner_t)filp; 871 fl->fl_owner = (fl_owner_t)filp;
834 fl->fl_start = 0; 872 fl->fl_start = 0;
835 fl->fl_end = OFFSET_MAX; 873 fl->fl_end = OFFSET_MAX;
836 874
837 if (fl->fl_type == F_UNLCK) 875 if (fl->fl_type == F_UNLCK)
838 return do_unlk(filp, cmd, fl); 876 return do_unlk(filp, cmd, fl, is_local);
839 return do_setlk(filp, cmd, fl); 877 return do_setlk(filp, cmd, fl, is_local);
840} 878}
841 879
842/* 880/*
@@ -848,6 +886,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
848 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 886 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
849 file->f_path.dentry->d_parent->d_name.name, 887 file->f_path.dentry->d_parent->d_name.name,
850 file->f_path.dentry->d_name.name, arg); 888 file->f_path.dentry->d_name.name, arg);
851
852 return -EINVAL; 889 return -EINVAL;
853} 890}
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..b5ffe8fa291f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
54 iput(inode); 54 iput(inode);
55 return -ENOMEM; 55 return -ENOMEM;
56 } 56 }
57 /* Circumvent igrab(): we know the inode is not being freed */ 57 ihold(inode);
58 atomic_inc(&inode->i_count);
59 /* 58 /*
60 * Ensure that this dentry is invisible to d_find_alias(). 59 * Ensure that this dentry is invisible to d_find_alias().
61 * Otherwise, it may be spliced into the tree by 60 * Otherwise, it may be spliced into the tree by
@@ -64,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
64 * This again causes shrink_dcache_for_umount_subtree() to 63 * This again causes shrink_dcache_for_umount_subtree() to
65 * Oops, since the test for IS_ROOT() will fail. 64 * Oops, since the test for IS_ROOT() will fail.
66 */ 65 */
67 spin_lock(&dcache_lock); 66 spin_lock(&sb->s_root->d_inode->i_lock);
67 spin_lock(&sb->s_root->d_lock);
68 list_del_init(&sb->s_root->d_alias); 68 list_del_init(&sb->s_root->d_alias);
69 spin_unlock(&dcache_lock); 69 spin_unlock(&sb->s_root->d_lock);
70 spin_unlock(&sb->s_root->d_inode->i_lock);
70 } 71 }
71 return 0; 72 return 0;
72} 73}
@@ -118,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
118 } 119 }
119 120
120 security_d_instantiate(ret, inode); 121 security_d_instantiate(ret, inode);
121
122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out: 122out:
125 nfs_free_fattr(fsinfo.fattr); 123 nfs_free_fattr(fsinfo.fattr);
126 return ret; 124 return ret;
@@ -226,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
226 224
227 security_d_instantiate(ret, inode); 225 security_d_instantiate(ret, inode);
228 226
229 if (ret->d_op == NULL)
230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
231
232out: 227out:
233 nfs_free_fattr(fattr); 228 nfs_free_fattr(fattr);
234 dprintk("<-- nfs4_get_root()\n"); 229 dprintk("<-- nfs4_get_root()\n");
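Same dcache_lock retirement as elsewhere in this series: the d_alias linkage is now protected by the owning inode's i_lock nested outside the dentry's d_lock, which is exactly the pair the new code takes. The general shape:

	spin_lock(&inode->i_lock);		/* guards the inode's alias list */
	spin_lock(&dentry->d_lock);		/* guards the dentry itself      */
	list_del_init(&dentry->d_alias);	/* hide from d_find_alias()      */
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);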
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..18696882f1c6 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38
39#include <linux/slab.h>
40#include <linux/cred.h>
41#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h>
43#include <linux/key-type.h>
44#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h>
47
48#include <keys/user-type.h>
49
50#define NFS_UINT_MAXLEN 11
51
52const struct cred *id_resolver_cache;
53
54struct key_type key_type_id_resolver = {
55 .name = "id_resolver",
56 .instantiate = user_instantiate,
57 .match = user_match,
58 .revoke = user_revoke,
59 .destroy = user_destroy,
60 .describe = user_describe,
61 .read = user_read,
62};
63
64int nfs_idmap_init(void)
65{
66 struct cred *cred;
67 struct key *keyring;
68 int ret = 0;
69
70 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
71
72 cred = prepare_kernel_cred(NULL);
73 if (!cred)
74 return -ENOMEM;
75
76 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
77 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
78 KEY_USR_VIEW | KEY_USR_READ,
79 KEY_ALLOC_NOT_IN_QUOTA);
80 if (IS_ERR(keyring)) {
81 ret = PTR_ERR(keyring);
82 goto failed_put_cred;
83 }
84
85 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
86 if (ret < 0)
87 goto failed_put_key;
88
89 ret = register_key_type(&key_type_id_resolver);
90 if (ret < 0)
91 goto failed_put_key;
92
93 cred->thread_keyring = keyring;
94 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
95 id_resolver_cache = cred;
96 return 0;
97
98failed_put_key:
99 key_put(keyring);
100failed_put_cred:
101 put_cred(cred);
102 return ret;
103}
104
105void nfs_idmap_quit(void)
106{
107 key_revoke(id_resolver_cache->thread_keyring);
108 unregister_key_type(&key_type_id_resolver);
109 put_cred(id_resolver_cache);
110}
111
112/*
113 * Assemble the description to pass to request_key()
114 * This function will allocate a new string and update dest to point
115 * at it. The caller is responsible for freeing dest.
116 *
117 * On error a negative errno is returned; otherwise the length of dest (including the terminating NUL) is returned.
118 */
119static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
120 const char *type, size_t typelen, char **desc)
121{
122 char *cp;
123 size_t desclen = typelen + namelen + 2;
124
125 *desc = kmalloc(desclen, GFP_KERNEL);
126 if (!*desc)
127 return -ENOMEM;
128
129 cp = *desc;
130 memcpy(cp, type, typelen);
131 cp += typelen;
132 *cp++ = ':';
133
134 memcpy(cp, name, namelen);
135 cp += namelen;
136 *cp = '\0';
137 return desclen;
138}
139
140static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
141 const char *type, void *data, size_t data_size)
142{
143 const struct cred *saved_cred;
144 struct key *rkey;
145 char *desc;
146 struct user_key_payload *payload;
147 ssize_t ret;
148
149 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
150 if (ret <= 0)
151 goto out;
152
153 saved_cred = override_creds(id_resolver_cache);
154 rkey = request_key(&key_type_id_resolver, desc, "");
155 revert_creds(saved_cred);
156 kfree(desc);
157 if (IS_ERR(rkey)) {
158 ret = PTR_ERR(rkey);
159 goto out;
160 }
161
162 rcu_read_lock();
163 rkey->perm |= KEY_USR_VIEW;
164
165 ret = key_validate(rkey);
166 if (ret < 0)
167 goto out_up;
168
169 payload = rcu_dereference(rkey->payload.data);
170 if (IS_ERR_OR_NULL(payload)) {
171 ret = PTR_ERR(payload);
172 goto out_up;
173 }
174
175 ret = payload->datalen;
176 if (ret > 0 && ret <= data_size)
177 memcpy(data, payload->data, ret);
178 else
179 ret = -EINVAL;
180
181out_up:
182 rcu_read_unlock();
183 key_put(rkey);
184out:
185 return ret;
186}
187
188
189/* ID -> Name */
190static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
191{
192 char id_str[NFS_UINT_MAXLEN];
193 int id_len;
194 ssize_t ret;
195
196 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
197 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
198 if (ret < 0)
199 return -EINVAL;
200 return ret;
201}
202
203/* Name -> ID */
204static int nfs_idmap_lookup_id(const char *name, size_t namelen,
205 const char *type, __u32 *id)
206{
207 char id_str[NFS_UINT_MAXLEN];
208 long id_long;
209 ssize_t data_size;
210 int ret = 0;
211
212 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
213 if (data_size <= 0) {
214 ret = -EINVAL;
215 } else {
216 ret = strict_strtol(id_str, 10, &id_long);
217 *id = (__u32)id_long;
218 }
219 return ret;
220}
221
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
223{
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225}
226
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
228{
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230}
231
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
233{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen);
235}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
237{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239}
240
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
242
37#include <linux/module.h> 243#include <linux/module.h>
38#include <linux/mutex.h> 244#include <linux/mutex.h>
39#include <linux/init.h> 245#include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
503 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
504} 710}
505 711
506int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) 712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
507{ 713{
508 struct idmap *idmap = clp->cl_idmap; 714 struct idmap *idmap = clp->cl_idmap;
509 715
510 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
511} 717}
512int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) 718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
513{ 719{
514 struct idmap *idmap = clp->cl_idmap; 720 struct idmap *idmap = clp->cl_idmap;
515 721
516 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
517} 723}
518 724
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
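A worked example of the description string the new idmapper hands to request_key(): nfs_idmap_get_desc() produces "<type>:<name>", so mapping user "bob" to a uid builds:

	char *desc;
	ssize_t len;

	len = nfs_idmap_get_desc("bob", 3, "uid", 3, &desc);
	/* len == 8: "uid:bob" is 7 characters plus the NUL,
	 * since desclen = typelen + namelen + 2 counts them all */
	kfree(desc);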
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..1cc600e77bb4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h" 49#include "fscache.h"
50#include "dns_resolve.h" 50#include "dns_resolve.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_VFS 53#define NFSDBG_FACILITY NFSDBG_VFS
53 54
@@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
234 return 0; 235 return 0;
235} 236}
236 237
237/* Don't use READDIRPLUS on directories that we believe are too large */
238#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
239
240/* 238/*
241 * This is our front-end to iget that looks up inodes by file handle 239 * This is our front-end to iget that looks up inodes by file handle
242 * instead of inode number. 240 * instead of inode number.
@@ -291,8 +289,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
293 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
294 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 292 inode->i_data.a_ops = &nfs_dir_aops;
295 && fattr->size <= NFS_LIMIT_READDIRPLUS) 293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
296 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
297 /* Deal with crossing mountpoints */ 295 /* Deal with crossing mountpoints */
298 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 296 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -302,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
302 else 300 else
303 inode->i_op = &nfs_mountpoint_inode_operations; 301 inode->i_op = &nfs_mountpoint_inode_operations;
304 inode->i_fop = NULL; 302 inode->i_fop = NULL;
305 set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); 303 inode->i_flags |= S_AUTOMOUNT;
306 } 304 }
307 } else if (S_ISLNK(inode->i_mode)) 305 } else if (S_ISLNK(inode->i_mode))
308 inode->i_op = &nfs_symlink_inode_operations; 306 inode->i_op = &nfs_symlink_inode_operations;
@@ -623,7 +621,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
623 nfs_revalidate_inode(server, inode); 621 nfs_revalidate_inode(server, inode);
624} 622}
625 623
626static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) 624struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
627{ 625{
628 struct nfs_open_context *ctx; 626 struct nfs_open_context *ctx;
629 627
@@ -633,11 +631,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
633 path_get(&ctx->path); 631 path_get(&ctx->path);
634 ctx->cred = get_rpccred(cred); 632 ctx->cred = get_rpccred(cred);
635 ctx->state = NULL; 633 ctx->state = NULL;
634 ctx->mode = f_mode;
636 ctx->flags = 0; 635 ctx->flags = 0;
637 ctx->error = 0; 636 ctx->error = 0;
638 ctx->dir_cookie = 0; 637 ctx->dir_cookie = 0;
639 nfs_init_lock_context(&ctx->lock_context); 638 nfs_init_lock_context(&ctx->lock_context);
640 ctx->lock_context.open_context = ctx; 639 ctx->lock_context.open_context = ctx;
640 INIT_LIST_HEAD(&ctx->list);
641 } 641 }
642 return ctx; 642 return ctx;
643} 643}
@@ -653,11 +653,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
653{ 653{
654 struct inode *inode = ctx->path.dentry->d_inode; 654 struct inode *inode = ctx->path.dentry->d_inode;
655 655
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) 656 if (!list_empty(&ctx->list)) {
657 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
658 return;
659 list_del(&ctx->list);
660 spin_unlock(&inode->i_lock);
661 } else if (!atomic_dec_and_test(&ctx->lock_context.count))
657 return; 662 return;
658 list_del(&ctx->list); 663 if (inode != NULL)
659 spin_unlock(&inode->i_lock); 664 NFS_PROTO(inode)->close_context(ctx, is_sync);
660 NFS_PROTO(inode)->close_context(ctx, is_sync);
661 if (ctx->cred != NULL) 665 if (ctx->cred != NULL)
662 put_rpccred(ctx->cred); 666 put_rpccred(ctx->cred);
663 path_put(&ctx->path); 667 path_put(&ctx->path);
@@ -673,7 +677,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
673 * Ensure that mmap has a recent RPC credential for use when writing out 677 * Ensure that mmap has a recent RPC credential for use when writing out
674 * shared pages 678 * shared pages
675 */ 679 */
676static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 680void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
677{ 681{
678 struct inode *inode = filp->f_path.dentry->d_inode; 682 struct inode *inode = filp->f_path.dentry->d_inode;
679 struct nfs_inode *nfsi = NFS_I(inode); 683 struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +734,10 @@ int nfs_open(struct inode *inode, struct file *filp)
730 cred = rpc_lookup_cred(); 734 cred = rpc_lookup_cred();
731 if (IS_ERR(cred)) 735 if (IS_ERR(cred))
732 return PTR_ERR(cred); 736 return PTR_ERR(cred);
733 ctx = alloc_nfs_open_context(&filp->f_path, cred); 737 ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
734 put_rpccred(cred); 738 put_rpccred(cred);
735 if (ctx == NULL) 739 if (ctx == NULL)
736 return -ENOMEM; 740 return -ENOMEM;
737 ctx->mode = filp->f_mode;
738 nfs_file_set_open_context(filp, ctx); 741 nfs_file_set_open_context(filp, ctx);
739 put_nfs_open_context(ctx); 742 put_nfs_open_context(ctx);
740 nfs_fscache_set_inode_cookie(inode, filp); 743 nfs_fscache_set_inode_cookie(inode, filp);
@@ -878,9 +881,10 @@ out:
878 return ret; 881 return ret;
879} 882}
880 883
881static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 884static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
882{ 885{
883 struct nfs_inode *nfsi = NFS_I(inode); 886 struct nfs_inode *nfsi = NFS_I(inode);
887 unsigned long ret = 0;
884 888
885 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 889 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
886 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 890 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -888,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
888 nfsi->change_attr = fattr->change_attr; 892 nfsi->change_attr = fattr->change_attr;
889 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
890 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 894 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
895 ret |= NFS_INO_INVALID_ATTR;
891 } 896 }
892 /* If we have atomic WCC data, we may update some attributes */ 897 /* If we have atomic WCC data, we may update some attributes */
893 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) 898 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
894 && (fattr->valid & NFS_ATTR_FATTR_CTIME) 899 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
895 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 900 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
896 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 901 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
902 ret |= NFS_INO_INVALID_ATTR;
903 }
897 904
898 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) 905 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
899 && (fattr->valid & NFS_ATTR_FATTR_MTIME) 906 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
900 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 907 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
901 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 908 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
902 if (S_ISDIR(inode->i_mode)) 909 if (S_ISDIR(inode->i_mode))
903 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 910 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
911 ret |= NFS_INO_INVALID_ATTR;
904 } 912 }
905 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 913 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
906 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 914 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
907 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 915 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
908 && nfsi->npages == 0) 916 && nfsi->npages == 0) {
909 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 917 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
918 ret |= NFS_INO_INVALID_ATTR;
919 }
920 return ret;
910} 921}
911 922
912/** 923/**
@@ -1205,7 +1216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1205 /* Update the fsid? */ 1216 /* Update the fsid? */
1206 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && 1217 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1207 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1218 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1208 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1219 !IS_AUTOMOUNT(inode))
1209 server->fsid = fattr->fsid; 1220 server->fsid = fattr->fsid;
1210 1221
1211 /* 1222 /*
@@ -1220,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1220 | NFS_INO_REVAL_PAGECACHE); 1231 | NFS_INO_REVAL_PAGECACHE);
1221 1232
1222 /* Do atomic weak cache consistency updates */ 1233 /* Do atomic weak cache consistency updates */
1223 nfs_wcc_update_inode(inode, fattr); 1234 invalid |= nfs_wcc_update_inode(inode, fattr);
1224 1235
1225 /* More cache consistency checks */ 1236 /* More cache consistency checks */
1226 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1237 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
@@ -1407,6 +1418,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1407 */ 1418 */
1408void nfs4_evict_inode(struct inode *inode) 1419void nfs4_evict_inode(struct inode *inode)
1409{ 1420{
1421 pnfs_destroy_layout(NFS_I(inode));
1410 truncate_inode_pages(&inode->i_data, 0); 1422 truncate_inode_pages(&inode->i_data, 0);
1411 end_writeback(inode); 1423 end_writeback(inode);
1412 /* If we are holding a delegation, return it! */ 1424 /* If we are holding a delegation, return it! */
@@ -1434,11 +1446,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
1434 return &nfsi->vfs_inode; 1446 return &nfsi->vfs_inode;
1435} 1447}
1436 1448
1437void nfs_destroy_inode(struct inode *inode) 1449static void nfs_i_callback(struct rcu_head *head)
1438{ 1450{
1451 struct inode *inode = container_of(head, struct inode, i_rcu);
1452 INIT_LIST_HEAD(&inode->i_dentry);
1439 kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); 1453 kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
1440} 1454}
1441 1455
1456void nfs_destroy_inode(struct inode *inode)
1457{
1458 call_rcu(&inode->i_rcu, nfs_i_callback);
1459}
1460
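Deferring the free through call_rcu() is required once RCU-walk path lookup can inspect inodes without holding references: the memory must survive until every CPU passes a quiescent state. The general pattern, on a hypothetical object:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head rcu;
};

static void obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct obj, rcu));
}

static void obj_release(struct obj *o)
{
	call_rcu(&o->rcu, obj_free_rcu);	/* freed after a grace period */
}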
1442static inline void nfs4_init_once(struct nfs_inode *nfsi) 1461static inline void nfs4_init_once(struct nfs_inode *nfsi)
1443{ 1462{
1444#ifdef CONFIG_NFS_V4 1463#ifdef CONFIG_NFS_V4
@@ -1446,6 +1465,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1446 nfsi->delegation = NULL; 1465 nfsi->delegation = NULL;
1447 nfsi->delegation_state = 0; 1466 nfsi->delegation_state = 0;
1448 init_rwsem(&nfsi->rwsem); 1467 init_rwsem(&nfsi->rwsem);
1468 nfsi->layout = NULL;
1449#endif 1469#endif
1450} 1470}
1451 1471
@@ -1493,7 +1513,7 @@ static int nfsiod_start(void)
1493{ 1513{
1494 struct workqueue_struct *wq; 1514 struct workqueue_struct *wq;
1495 dprintk("RPC: creating workqueue nfsiod\n"); 1515 dprintk("RPC: creating workqueue nfsiod\n");
1496 wq = create_singlethread_workqueue("nfsiod"); 1516 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
1497 if (wq == NULL) 1517 if (wq == NULL)
1498 return -ENOMEM; 1518 return -ENOMEM;
1499 nfsiod_workqueue = wq; 1519 nfsiod_workqueue = wq;
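A note on the conversion above: with the concurrency-managed workqueues, alloc_workqueue() plus WQ_RESCUER attaches a rescuer thread, preserving the forward-progress guarantee under memory pressure that the old dedicated nfsiod kthread provided, while letting the work items share the system worker pool:

	wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);	/* rescuer thread keeps
							 * work draining even when
							 * new workers cannot be
							 * spawned */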
@@ -1521,6 +1541,10 @@ static int __init init_nfs_fs(void)
1521{ 1541{
1522 int err; 1542 int err;
1523 1543
1544 err = nfs_idmap_init();
1545 if (err < 0)
1546 goto out9;
1547
1524 err = nfs_dns_resolver_init(); 1548 err = nfs_dns_resolver_init();
1525 if (err < 0) 1549 if (err < 0)
1526 goto out8; 1550 goto out8;
@@ -1585,6 +1609,8 @@ out6:
1585out7: 1609out7:
1586 nfs_dns_resolver_destroy(); 1610 nfs_dns_resolver_destroy();
1587out8: 1611out8:
1612 nfs_idmap_quit();
1613out9:
1588 return err; 1614 return err;
1589} 1615}
1590 1616
@@ -1597,9 +1623,11 @@ static void __exit exit_nfs_fs(void)
1597 nfs_destroy_nfspagecache(); 1623 nfs_destroy_nfspagecache();
1598 nfs_fscache_unregister(); 1624 nfs_fscache_unregister();
1599 nfs_dns_resolver_destroy(); 1625 nfs_dns_resolver_destroy();
1626 nfs_idmap_quit();
1600#ifdef CONFIG_PROC_FS 1627#ifdef CONFIG_PROC_FS
1601 rpc_proc_unregister("nfs"); 1628 rpc_proc_unregister("nfs");
1602#endif 1629#endif
1630 nfs_cleanup_cb_ident_idr();
1603 unregister_nfs_fs(); 1631 unregister_nfs_fs();
1604 nfs_fs_proc_exit(); 1632 nfs_fs_proc_exit();
1605 nfsiod_stop(); 1633 nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..cf9fdbdabc67 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
63#define NFS_UNSPEC_PORT (-1) 63#define NFS_UNSPEC_PORT (-1)
64 64
65/* 65/*
66 * Maximum number of pages that readdir can use for creating
67 * a vmapped array of pages.
68 */
69#define NFS_MAX_READDIR_PAGES 8
70
71/*
66 * In-kernel mount arguments 72 * In-kernel mount arguments
67 */ 73 */
68struct nfs_parsed_mount_data { 74struct nfs_parsed_mount_data {
@@ -122,9 +128,12 @@ extern void nfs_umount(const struct nfs_mount_request *info);
122/* client.c */ 128/* client.c */
123extern struct rpc_program nfs_program; 129extern struct rpc_program nfs_program;
124 130
131extern void nfs_cleanup_cb_ident_idr(void);
125extern void nfs_put_client(struct nfs_client *); 132extern void nfs_put_client(struct nfs_client *);
126extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
127extern struct nfs_client *nfs_find_client_next(struct nfs_client *); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
128extern struct nfs_server *nfs_create_server( 137extern struct nfs_server *nfs_create_server(
129 const struct nfs_parsed_mount_data *, 138 const struct nfs_parsed_mount_data *,
130 struct nfs_fh *); 139 struct nfs_fh *);
@@ -179,17 +188,20 @@ extern int __init nfs_init_directcache(void);
179extern void nfs_destroy_directcache(void); 188extern void nfs_destroy_directcache(void);
180 189
181/* nfs2xdr.c */ 190/* nfs2xdr.c */
182extern int nfs_stat_to_errno(int); 191extern int nfs_stat_to_errno(enum nfs_stat);
183extern struct rpc_procinfo nfs_procedures[]; 192extern struct rpc_procinfo nfs_procedures[];
184extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); 193extern int nfs2_decode_dirent(struct xdr_stream *,
194 struct nfs_entry *, int);
185 195
186/* nfs3xdr.c */ 196/* nfs3xdr.c */
187extern struct rpc_procinfo nfs3_procedures[]; 197extern struct rpc_procinfo nfs3_procedures[];
188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 198extern int nfs3_decode_dirent(struct xdr_stream *,
199 struct nfs_entry *, int);
189 200
190/* nfs4xdr.c */ 201/* nfs4xdr.c */
191#ifdef CONFIG_NFS_V4 202#ifdef CONFIG_NFS_V4
192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 203extern int nfs4_decode_dirent(struct xdr_stream *,
204 struct nfs_entry *, int);
193#endif 205#endif
194#ifdef CONFIG_NFS_V4_1 206#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead; 207extern const u32 nfs41_maxread_overhead;
@@ -239,6 +251,7 @@ extern char *nfs_path(const char *base,
239 const struct dentry *droot, 251 const struct dentry *droot,
240 const struct dentry *dentry, 252 const struct dentry *dentry,
241 char *buffer, ssize_t buflen); 253 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path);
242 255
243/* getroot.c */ 256/* getroot.c */
244extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
@@ -356,6 +369,15 @@ unsigned int nfs_page_length(struct page *page)
356} 369}
357 370
358/* 371/*
372 * Convert a umode to a dirent->d_type
373 */
374static inline
375unsigned char nfs_umode_to_dtype(umode_t mode)
376{
377 return (mode >> 12) & 15;
378}
379
380/*
359 * Determine the number of pages in an array of length 'len' and 381 * Determine the number of pages in an array of length 'len' and
360 * with a base offset of 'base' 382 * with a base offset of 'base'
361 */ 383 */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..d4c2d6b7507e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net,
156 .protocol = info->protocol, 157 .protocol = info->protocol,
157 .address = info->sap, 158 .address = info->sap,
158 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
224 .to_retries = 2, 225 .to_retries = 2,
225 }; 226 };
226 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net,
227 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
228 .address = info->sap, 230 .address = info->sap,
229 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -234,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
234 .authflavor = RPC_AUTH_UNIX, 236 .authflavor = RPC_AUTH_UNIX,
235 .flags = RPC_CLNT_CREATE_NOPING, 237 .flags = RPC_CLNT_CREATE_NOPING,
236 }; 238 };
237 struct mountres result;
238 struct rpc_message msg = { 239 struct rpc_message msg = {
239 .rpc_argp = info->dirpath, 240 .rpc_argp = info->dirpath,
240 .rpc_resp = &result,
241 }; 241 };
242 struct rpc_clnt *clnt; 242 struct rpc_clnt *clnt;
243 int status; 243 int status;
@@ -246,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247 247
248 clnt = rpc_create(&args); 248 clnt = rpc_create(&args);
249 if (unlikely(IS_ERR(clnt))) 249 if (IS_ERR(clnt))
250 goto out_clnt_err; 250 goto out_clnt_err;
251 251
252 dprintk("NFS: sending UMNT request for %s:%s\n", 252 dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -278,29 +278,20 @@ out_call_err:
278 * XDR encode/decode functions for MOUNT 278 * XDR encode/decode functions for MOUNT
279 */ 279 */
280 280
281static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) 281static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
282{ 282{
283 const u32 pathname_len = strlen(pathname); 283 const u32 pathname_len = strlen(pathname);
284 __be32 *p; 284 __be32 *p;
285 285
286 if (unlikely(pathname_len > MNTPATHLEN)) 286 BUG_ON(pathname_len > MNTPATHLEN);
287 return -EIO; 287 p = xdr_reserve_space(xdr, 4 + pathname_len);
288
289 p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
290 if (unlikely(p == NULL))
291 return -EIO;
292 xdr_encode_opaque(p, pathname, pathname_len); 288 xdr_encode_opaque(p, pathname, pathname_len);
293
294 return 0;
295} 289}
296 290
297static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, 291static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
298 const char *dirpath) 292 const char *dirpath)
299{ 293{
300 struct xdr_stream xdr; 294 encode_mntdirpath(xdr, dirpath);
301
302 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
303 return encode_mntdirpath(&xdr, dirpath);
304} 295}
305 296
306/* 297/*
@@ -318,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
318 u32 status; 309 u32 status;
319 __be32 *p; 310 __be32 *p;
320 311
321 p = xdr_inline_decode(xdr, sizeof(status)); 312 p = xdr_inline_decode(xdr, 4);
322 if (unlikely(p == NULL)) 313 if (unlikely(p == NULL))
323 return -EIO; 314 return -EIO;
324 status = ntohl(*p); 315 status = be32_to_cpup(p);
325 316
326 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { 317 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
327 if (mnt_errtbl[i].status == status) { 318 if (mnt_errtbl[i].status == status) {
@@ -349,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
349 return 0; 340 return 0;
350} 341}
351 342
352static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, 343static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
353 struct mountres *res) 344 struct xdr_stream *xdr,
345 struct mountres *res)
354{ 346{
355 struct xdr_stream xdr;
356 int status; 347 int status;
357 348
358 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 349 status = decode_status(xdr, res);
359
360 status = decode_status(&xdr, res);
361 if (unlikely(status != 0 || res->errno != 0)) 350 if (unlikely(status != 0 || res->errno != 0))
362 return status; 351 return status;
363 return decode_fhandle(&xdr, res); 352 return decode_fhandle(xdr, res);
364} 353}
365 354
366static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) 355static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -369,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
369 u32 status; 358 u32 status;
370 __be32 *p; 359 __be32 *p;
371 360
372 p = xdr_inline_decode(xdr, sizeof(status)); 361 p = xdr_inline_decode(xdr, 4);
373 if (unlikely(p == NULL)) 362 if (unlikely(p == NULL))
374 return -EIO; 363 return -EIO;
375 status = ntohl(*p); 364 status = be32_to_cpup(p);
376 365
377 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { 366 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
378 if (mnt3_errtbl[i].status == status) { 367 if (mnt3_errtbl[i].status == status) {
@@ -392,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
392 u32 size; 381 u32 size;
393 __be32 *p; 382 __be32 *p;
394 383
395 p = xdr_inline_decode(xdr, sizeof(size)); 384 p = xdr_inline_decode(xdr, 4);
396 if (unlikely(p == NULL)) 385 if (unlikely(p == NULL))
397 return -EIO; 386 return -EIO;
398 387
399 size = ntohl(*p++); 388 size = be32_to_cpup(p);
400 if (size > NFS3_FHSIZE || size == 0) 389 if (size > NFS3_FHSIZE || size == 0)
401 return -EIO; 390 return -EIO;
402 391
@@ -419,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
419 if (*count == 0) 408 if (*count == 0)
420 return 0; 409 return 0;
421 410
422 p = xdr_inline_decode(xdr, sizeof(entries)); 411 p = xdr_inline_decode(xdr, 4);
423 if (unlikely(p == NULL)) 412 if (unlikely(p == NULL))
424 return -EIO; 413 return -EIO;
425 entries = ntohl(*p); 414 entries = be32_to_cpup(p);
426 dprintk("NFS: received %u auth flavors\n", entries); 415 dprintk("NFS: received %u auth flavors\n", entries);
427 if (entries > NFS_MAX_SECFLAVORS) 416 if (entries > NFS_MAX_SECFLAVORS)
428 entries = NFS_MAX_SECFLAVORS; 417 entries = NFS_MAX_SECFLAVORS;
429 418
430 p = xdr_inline_decode(xdr, sizeof(u32) * entries); 419 p = xdr_inline_decode(xdr, 4 * entries);
431 if (unlikely(p == NULL)) 420 if (unlikely(p == NULL))
432 return -EIO; 421 return -EIO;
433 422
@@ -435,38 +424,36 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
435 entries = *count; 424 entries = *count;
436 425
437 for (i = 0; i < entries; i++) { 426 for (i = 0; i < entries; i++) {
438 flavors[i] = ntohl(*p++); 427 flavors[i] = be32_to_cpup(p++);
439 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); 428 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
440 } 429 }
441 *count = i; 430 *count = i;
442 431
443 return 0; 432 return 0;
444} 433}
445 434
446static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, 435static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
447 struct mountres *res) 436 struct xdr_stream *xdr,
437 struct mountres *res)
448{ 438{
449 struct xdr_stream xdr;
450 int status; 439 int status;
451 440
452 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 441 status = decode_fhs_status(xdr, res);
453
454 status = decode_fhs_status(&xdr, res);
455 if (unlikely(status != 0 || res->errno != 0)) 442 if (unlikely(status != 0 || res->errno != 0))
456 return status; 443 return status;
457 status = decode_fhandle3(&xdr, res); 444 status = decode_fhandle3(xdr, res);
458 if (unlikely(status != 0)) { 445 if (unlikely(status != 0)) {
459 res->errno = -EBADHANDLE; 446 res->errno = -EBADHANDLE;
460 return 0; 447 return 0;
461 } 448 }
462 return decode_auth_flavors(&xdr, res); 449 return decode_auth_flavors(xdr, res);
463} 450}
464 451
465static struct rpc_procinfo mnt_procedures[] = { 452static struct rpc_procinfo mnt_procedures[] = {
466 [MOUNTPROC_MNT] = { 453 [MOUNTPROC_MNT] = {
467 .p_proc = MOUNTPROC_MNT, 454 .p_proc = MOUNTPROC_MNT,
468 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 455 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
469 .p_decode = (kxdrproc_t)mnt_dec_mountres, 456 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres,
470 .p_arglen = MNT_enc_dirpath_sz, 457 .p_arglen = MNT_enc_dirpath_sz,
471 .p_replen = MNT_dec_mountres_sz, 458 .p_replen = MNT_dec_mountres_sz,
472 .p_statidx = MOUNTPROC_MNT, 459 .p_statidx = MOUNTPROC_MNT,
@@ -474,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
474 }, 461 },
475 [MOUNTPROC_UMNT] = { 462 [MOUNTPROC_UMNT] = {
476 .p_proc = MOUNTPROC_UMNT, 463 .p_proc = MOUNTPROC_UMNT,
477 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 464 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
478 .p_arglen = MNT_enc_dirpath_sz, 465 .p_arglen = MNT_enc_dirpath_sz,
479 .p_statidx = MOUNTPROC_UMNT, 466 .p_statidx = MOUNTPROC_UMNT,
480 .p_name = "UMOUNT", 467 .p_name = "UMOUNT",
@@ -484,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
484static struct rpc_procinfo mnt3_procedures[] = { 471static struct rpc_procinfo mnt3_procedures[] = {
485 [MOUNTPROC3_MNT] = { 472 [MOUNTPROC3_MNT] = {
486 .p_proc = MOUNTPROC3_MNT, 473 .p_proc = MOUNTPROC3_MNT,
487 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 474 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
488 .p_decode = (kxdrproc_t)mnt_dec_mountres3, 475 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3,
489 .p_arglen = MNT_enc_dirpath_sz, 476 .p_arglen = MNT_enc_dirpath_sz,
490 .p_replen = MNT_dec_mountres3_sz, 477 .p_replen = MNT_dec_mountres3_sz,
491 .p_statidx = MOUNTPROC3_MNT, 478 .p_statidx = MOUNTPROC3_MNT,
@@ -493,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
493 }, 480 },
494 [MOUNTPROC3_UMNT] = { 481 [MOUNTPROC3_UMNT] = {
495 .p_proc = MOUNTPROC3_UMNT, 482 .p_proc = MOUNTPROC3_UMNT,
496 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 483 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
497 .p_arglen = MNT_enc_dirpath_sz, 484 .p_arglen = MNT_enc_dirpath_sz,
498 .p_statidx = MOUNTPROC3_UMNT, 485 .p_statidx = MOUNTPROC3_UMNT,
499 .p_name = "UMOUNT", 486 .p_name = "UMOUNT",
@@ -503,13 +490,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
503 490
504static struct rpc_version mnt_version1 = { 491static struct rpc_version mnt_version1 = {
505 .number = 1, 492 .number = 1,
506 .nrprocs = 2, 493 .nrprocs = ARRAY_SIZE(mnt_procedures),
507 .procs = mnt_procedures, 494 .procs = mnt_procedures,
508}; 495};
509 496
510static struct rpc_version mnt_version3 = { 497static struct rpc_version mnt_version3 = {
511 .number = 3, 498 .number = 3,
512 .nrprocs = 2, 499 .nrprocs = ARRAY_SIZE(mnt3_procedures),
513 .procs = mnt3_procedures, 500 .procs = mnt3_procedures,
514}; 501};
515 502
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf3..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,12 +49,17 @@ char *nfs_path(const char *base,
49 const struct dentry *dentry, 49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen) 50 char *buffer, ssize_t buflen)
51{ 51{
52 char *end = buffer+buflen; 52 char *end;
53 int namelen; 53 int namelen;
54 unsigned seq;
54 55
56rename_retry:
57 end = buffer+buflen;
55 *--end = '\0'; 58 *--end = '\0';
56 buflen--; 59 buflen--;
57 spin_lock(&dcache_lock); 60
61 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock();
58 while (!IS_ROOT(dentry) && dentry != droot) { 63 while (!IS_ROOT(dentry) && dentry != droot) {
59 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
60 buflen -= namelen + 1; 65 buflen -= namelen + 1;
@@ -65,7 +70,9 @@ char *nfs_path(const char *base,
65 *--end = '/'; 70 *--end = '/';
66 dentry = dentry->d_parent; 71 dentry = dentry->d_parent;
67 } 72 }
68 spin_unlock(&dcache_lock); 73 rcu_read_unlock();
74 if (read_seqretry(&rename_lock, seq))
75 goto rename_retry;
69 if (*end != '/') { 76 if (*end != '/') {
70 if (--buflen < 0) 77 if (--buflen < 0)
71 goto Elong; 78 goto Elong;
@@ -82,15 +89,16 @@ char *nfs_path(const char *base,
82 memcpy(end, base, namelen); 89 memcpy(end, base, namelen);
83 return end; 90 return end;
84Elong_unlock: 91Elong_unlock:
85 spin_unlock(&dcache_lock); 92 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry;
86Elong: 95Elong:
87 return ERR_PTR(-ENAMETOOLONG); 96 return ERR_PTR(-ENAMETOOLONG);
88} 97}
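
A minimal sketch of the lockless walk adopted above, using only the documented rename_lock/RCU rules (the helper name is hypothetical):

	#include <linux/dcache.h>
	#include <linux/rcupdate.h>

	/* Count path components without taking dcache_lock: sample the
	 * rename seqlock, walk d_parent under RCU, and retry the whole
	 * walk if a concurrent rename moved anything underneath us. */
	static int count_path_components(const struct dentry *dentry)
	{
		const struct dentry *d;
		unsigned seq;
		int n;

	retry:
		n = 0;
		seq = read_seqbegin(&rename_lock);
		rcu_read_lock();
		for (d = dentry; !IS_ROOT(d); d = d->d_parent)
			n++;
		rcu_read_unlock();
		if (read_seqretry(&rename_lock, seq))
			goto retry;
		return n;
	}
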
89 98
90/* 99/*
91 * nfs_follow_mountpoint - handle crossing a mountpoint on the server 100 * nfs_d_automount - Handle crossing a mountpoint on the server
92 * @dentry - dentry of mountpoint 101 * @path - The mountpoint
93 * @nd - nameidata info
94 * 102 *
95 * When we encounter a mountpoint on the server, we want to set up 103 * When we encounter a mountpoint on the server, we want to set up
96 * a mountpoint on the client too, to prevent inode numbers from 104 * a mountpoint on the client too, to prevent inode numbers from
@@ -100,87 +108,65 @@ Elong:
100 * situation, and that different filesystems may want to use 108 * situation, and that different filesystems may want to use
101 * different security flavours. 109 * different security flavours.
102 */ 110 */
103static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 111struct vfsmount *nfs_d_automount(struct path *path)
104{ 112{
105 struct vfsmount *mnt; 113 struct vfsmount *mnt;
106 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 114 struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
107 struct dentry *parent; 115 struct dentry *parent;
108 struct nfs_fh *fh = NULL; 116 struct nfs_fh *fh = NULL;
109 struct nfs_fattr *fattr = NULL; 117 struct nfs_fattr *fattr = NULL;
110 int err; 118 int err;
111 119
112 dprintk("--> nfs_follow_mountpoint()\n"); 120 dprintk("--> nfs_d_automount()\n");
113 121
114 err = -ESTALE; 122 mnt = ERR_PTR(-ESTALE);
115 if (IS_ROOT(dentry)) 123 if (IS_ROOT(path->dentry))
116 goto out_err; 124 goto out_nofree;
117 125
118 err = -ENOMEM; 126 mnt = ERR_PTR(-ENOMEM);
119 fh = nfs_alloc_fhandle(); 127 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr(); 128 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL) 129 if (fh == NULL || fattr == NULL)
122 goto out_err; 130 goto out;
123 131
124 dprintk("%s: enter\n", __func__); 132 dprintk("%s: enter\n", __func__);
125 dput(nd->path.dentry);
126 nd->path.dentry = dget(dentry);
127 133
128 /* Look it up again */ 134 /* Look it up again to get its attributes */
129 parent = dget_parent(nd->path.dentry); 135 parent = dget_parent(path->dentry);
130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 136 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
131 &nd->path.dentry->d_name, 137 &path->dentry->d_name,
132 fh, fattr); 138 fh, fattr);
133 dput(parent); 139 dput(parent);
134 if (err != 0) 140 if (err != 0) {
135 goto out_err; 141 mnt = ERR_PTR(err);
142 goto out;
143 }
136 144
137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 146 mnt = nfs_do_refmount(path->mnt, path->dentry);
139 else 147 else
140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, 148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
141 fattr);
142 err = PTR_ERR(mnt);
143 if (IS_ERR(mnt)) 149 if (IS_ERR(mnt))
144 goto out_err; 150 goto out;
145 151
146 mntget(mnt); 152 dprintk("%s: done, success\n", __func__);
147 err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, 153 mntget(mnt); /* prevent immediate expiration */
148 &nfs_automount_list); 154 mnt_set_expiry(mnt, &nfs_automount_list);
149 if (err < 0) {
150 mntput(mnt);
151 if (err == -EBUSY)
152 goto out_follow;
153 goto out_err;
154 }
155 path_put(&nd->path);
156 nd->path.mnt = mnt;
157 nd->path.dentry = dget(mnt->mnt_root);
158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 155 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
156
159out: 157out:
160 nfs_free_fattr(fattr); 158 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh); 159 nfs_free_fhandle(fh);
162 dprintk("%s: done, returned %d\n", __func__, err); 160out_nofree:
 163 161 dprintk("<-- nfs_d_automount() = %p\n", mnt);
164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 162 return mnt;
165 return ERR_PTR(err);
166out_err:
167 path_put(&nd->path);
168 goto out;
169out_follow:
170 while (d_mountpoint(nd->path.dentry) &&
171 follow_down(&nd->path))
172 ;
173 err = 0;
174 goto out;
175} 163}
176 164
177const struct inode_operations nfs_mountpoint_inode_operations = { 165const struct inode_operations nfs_mountpoint_inode_operations = {
178 .follow_link = nfs_follow_mountpoint,
179 .getattr = nfs_getattr, 166 .getattr = nfs_getattr,
180}; 167};
181 168
182const struct inode_operations nfs_referral_inode_operations = { 169const struct inode_operations nfs_referral_inode_operations = {
183 .follow_link = nfs_follow_mountpoint,
184}; 170};
185 171
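
A minimal sketch of the expiry scheme mnt_set_expiry() feeds, matching the nfs_expire_automounts() worker visible just below (list and function names here are illustrative):

	static LIST_HEAD(example_automount_list);

	/* Each submount parked on the list by mnt_set_expiry() is a
	 * candidate; mark_mounts_for_expiry() unmounts the ones that
	 * have not been touched since the previous pass. */
	static void example_expire_worker(struct work_struct *work)
	{
		mark_mounts_for_expiry(&example_automount_list);
	}
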
186static void nfs_expire_automounts(struct work_struct *work) 172static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..792cb13a4304 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,609 +61,1008 @@
61#define NFS_readdirres_sz (1) 61#define NFS_readdirres_sz (1)
62#define NFS_statfsres_sz (1+NFS_info_sz) 62#define NFS_statfsres_sz (1+NFS_info_sz)
63 63
64
65/*
66 * While encoding arguments, set up the reply buffer in advance to
67 * receive reply data directly into the page cache.
68 */
69static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
70 unsigned int base, unsigned int len,
71 unsigned int bufsize)
72{
73 struct rpc_auth *auth = req->rq_cred->cr_auth;
74 unsigned int replen;
75
76 replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
77 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
78}
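
A worked version of the offset arithmetic above (the helper is hypothetical; the fields are the ones the function already reads):

	/* RPC_REPHDRSIZE, au_rslack and bufsize all count 4-byte XDR
	 * words, so the byte offset at which page data begins in the
	 * receive buffer is the word total shifted left by two. */
	static unsigned int reply_page_offset(const struct rpc_rqst *req,
					      unsigned int bufsize)
	{
		const struct rpc_auth *auth = req->rq_cred->cr_auth;

		return (RPC_REPHDRSIZE + auth->au_rslack + bufsize) << 2;
	}
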
79
80/*
81 * Handle decode buffer overflows out-of-line.
82 */
83static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
84{
85 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
86 "Remaining buffer length is %tu words.\n",
87 func, xdr->end - xdr->p);
88}
89
90
91/*
92 * Encode/decode NFSv2 basic data types
93 *
94 * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
95 * "NFS: Network File System Protocol Specification".
96 *
97 * Not all basic data types have their own encoding and decoding
98 * functions. For run-time efficiency, some data types are encoded
99 * or decoded inline.
100 */
101
102/*
103 * typedef opaque nfsdata<>;
104 */
105static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
106{
107 u32 recvd, count;
108 size_t hdrlen;
109 __be32 *p;
110
111 p = xdr_inline_decode(xdr, 4);
112 if (unlikely(p == NULL))
113 goto out_overflow;
114 count = be32_to_cpup(p);
115 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
116 recvd = xdr->buf->len - hdrlen;
117 if (unlikely(count > recvd))
118 goto out_cheating;
119out:
120 xdr_read_pages(xdr, count);
121 result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */
122 result->count = count;
123 return count;
124out_cheating:
125 dprintk("NFS: server cheating in read result: "
126 "count %u > recvd %u\n", count, recvd);
127 count = recvd;
128 goto out;
129out_overflow:
130 print_overflow_msg(__func__, xdr);
131 return -EIO;
132}
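
A minimal sketch of the "server cheating" guard above as a standalone helper (the name is hypothetical; the pointer arithmetic mirrors the code):

	/* Never trust the server's advertised byte count beyond what
	 * actually arrived: recvd is the reply length minus the bytes
	 * already consumed from the head iovec. */
	static u32 clamp_to_received(const struct xdr_stream *xdr, u32 count)
	{
		size_t hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
		u32 recvd = xdr->buf->len - hdrlen;

		return count > recvd ? recvd : count;
	}
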
133
134/*
135 * enum stat {
136 * NFS_OK = 0,
137 * NFSERR_PERM = 1,
138 * NFSERR_NOENT = 2,
139 * NFSERR_IO = 5,
140 * NFSERR_NXIO = 6,
141 * NFSERR_ACCES = 13,
142 * NFSERR_EXIST = 17,
143 * NFSERR_NODEV = 19,
144 * NFSERR_NOTDIR = 20,
145 * NFSERR_ISDIR = 21,
146 * NFSERR_FBIG = 27,
147 * NFSERR_NOSPC = 28,
148 * NFSERR_ROFS = 30,
149 * NFSERR_NAMETOOLONG = 63,
150 * NFSERR_NOTEMPTY = 66,
151 * NFSERR_DQUOT = 69,
152 * NFSERR_STALE = 70,
153 * NFSERR_WFLUSH = 99
154 * };
155 */
156static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
157{
158 __be32 *p;
159
160 p = xdr_inline_decode(xdr, 4);
161 if (unlikely(p == NULL))
162 goto out_overflow;
163 *status = be32_to_cpup(p);
164 return 0;
165out_overflow:
166 print_overflow_msg(__func__, xdr);
167 return -EIO;
168}
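
The same three-line pattern recurs throughout this file; a minimal sketch of the xdr_inline_decode() contract it relies on (the helper name is hypothetical):

	/* Ask the stream for four contiguous bytes; NULL means the
	 * reply ended early, which every caller maps to -EIO. */
	static int decode_u32(struct xdr_stream *xdr, u32 *value)
	{
		__be32 *p = xdr_inline_decode(xdr, 4);

		if (unlikely(p == NULL))
			return -EIO;
		*value = be32_to_cpup(p);
		return 0;
	}
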
169
64/* 170/*
65 * Common NFS XDR functions as inlines 171 * 2.3.2. ftype
172 *
173 * enum ftype {
174 * NFNON = 0,
175 * NFREG = 1,
176 * NFDIR = 2,
177 * NFBLK = 3,
178 * NFCHR = 4,
179 * NFLNK = 5
180 * };
181 *
66 */ 182 */
67static inline __be32 * 183static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
68xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
69{ 184{
70 memcpy(p, fhandle->data, NFS2_FHSIZE); 185 *type = be32_to_cpup(p++);
71 return p + XDR_QUADLEN(NFS2_FHSIZE); 186 if (unlikely(*type > NF2FIFO))
187 *type = NFBAD;
188 return p;
72} 189}
73 190
74static inline __be32 * 191/*
75xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) 192 * 2.3.3. fhandle
193 *
194 * typedef opaque fhandle[FHSIZE];
195 */
196static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
76{ 197{
77 /* NFSv2 handles have a fixed length */ 198 __be32 *p;
78 fhandle->size = NFS2_FHSIZE; 199
79 memcpy(fhandle->data, p, NFS2_FHSIZE); 200 BUG_ON(fh->size != NFS2_FHSIZE);
80 return p + XDR_QUADLEN(NFS2_FHSIZE); 201 p = xdr_reserve_space(xdr, NFS2_FHSIZE);
202 memcpy(p, fh->data, NFS2_FHSIZE);
81} 203}
82 204
83static inline __be32* 205static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
84xdr_encode_time(__be32 *p, struct timespec *timep)
85{ 206{
86 *p++ = htonl(timep->tv_sec); 207 __be32 *p;
87 /* Convert nanoseconds into microseconds */ 208
88 *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); 209 p = xdr_inline_decode(xdr, NFS2_FHSIZE);
210 if (unlikely(p == NULL))
211 goto out_overflow;
212 fh->size = NFS2_FHSIZE;
213 memcpy(fh->data, p, NFS2_FHSIZE);
214 return 0;
215out_overflow:
216 print_overflow_msg(__func__, xdr);
217 return -EIO;
218}
219
220/*
221 * 2.3.4. timeval
222 *
223 * struct timeval {
224 * unsigned int seconds;
225 * unsigned int useconds;
226 * };
227 */
228static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
229{
230 *p++ = cpu_to_be32(timep->tv_sec);
231 if (timep->tv_nsec != 0)
232 *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
233 else
234 *p++ = cpu_to_be32(0);
89 return p; 235 return p;
90} 236}
91 237
92static inline __be32* 238/*
93xdr_encode_current_server_time(__be32 *p, struct timespec *timep) 239 * Passing the invalid value useconds=1000000 is a Sun convention for
240 * "set to current server time". It's needed to make permissions checks
241 * for the "touch" program across v2 mounts to Solaris and Irix servers
242 * work correctly. See description of sattr in section 6.1 of "NFS
243 * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
244 */
245static __be32 *xdr_encode_current_server_time(__be32 *p,
246 const struct timespec *timep)
94{ 247{
95 /* 248 *p++ = cpu_to_be32(timep->tv_sec);
96 * Passing the invalid value useconds=1000000 is a 249 *p++ = cpu_to_be32(1000000);
97 * Sun convention for "set to current server time".
98 * It's needed to make permissions checks for the
99 * "touch" program across v2 mounts to Solaris and
100 * Irix boxes work correctly. See description of
101 * sattr in section 6.1 of "NFS Illustrated" by
102 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
103 */
104 *p++ = htonl(timep->tv_sec);
105 *p++ = htonl(1000000);
106 return p; 250 return p;
107} 251}
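
A hedged example of when the magic value fires: utimes(2) with a NULL times argument, which is what touch(1) issues, sets ATTR_ATIME|ATTR_MTIME without the *_SET bits, steering encode_sattr() below into this branch (the helper is illustrative):

	static void example_touch_to_now(struct iattr *attr)
	{
		/* no ATTR_ATIME_SET/ATTR_MTIME_SET: "server, use your
		 * own clock", encoded as useconds=1000000 */
		attr->ia_valid |= ATTR_ATIME | ATTR_MTIME;
	}
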
108 252
109static inline __be32* 253static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
110xdr_decode_time(__be32 *p, struct timespec *timep)
111{ 254{
112 timep->tv_sec = ntohl(*p++); 255 timep->tv_sec = be32_to_cpup(p++);
113 /* Convert microseconds into nanoseconds */ 256 timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
114 timep->tv_nsec = ntohl(*p++) * 1000;
115 return p; 257 return p;
116} 258}
117 259
118static __be32 * 260/*
119xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 261 * 2.3.5. fattr
262 *
263 * struct fattr {
264 * ftype type;
265 * unsigned int mode;
266 * unsigned int nlink;
267 * unsigned int uid;
268 * unsigned int gid;
269 * unsigned int size;
270 * unsigned int blocksize;
271 * unsigned int rdev;
272 * unsigned int blocks;
273 * unsigned int fsid;
274 * unsigned int fileid;
275 * timeval atime;
276 * timeval mtime;
277 * timeval ctime;
278 * };
279 *
280 */
281static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
120{ 282{
121 u32 rdev, type; 283 u32 rdev, type;
122 type = ntohl(*p++); 284 __be32 *p;
123 fattr->mode = ntohl(*p++); 285
124 fattr->nlink = ntohl(*p++); 286 p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
125 fattr->uid = ntohl(*p++); 287 if (unlikely(p == NULL))
126 fattr->gid = ntohl(*p++); 288 goto out_overflow;
127 fattr->size = ntohl(*p++); 289
128 fattr->du.nfs2.blocksize = ntohl(*p++);
129 rdev = ntohl(*p++);
130 fattr->du.nfs2.blocks = ntohl(*p++);
131 fattr->fsid.major = ntohl(*p++);
132 fattr->fsid.minor = 0;
133 fattr->fileid = ntohl(*p++);
134 p = xdr_decode_time(p, &fattr->atime);
135 p = xdr_decode_time(p, &fattr->mtime);
136 p = xdr_decode_time(p, &fattr->ctime);
137 fattr->valid |= NFS_ATTR_FATTR_V2; 290 fattr->valid |= NFS_ATTR_FATTR_V2;
291
292 p = xdr_decode_ftype(p, &type);
293
294 fattr->mode = be32_to_cpup(p++);
295 fattr->nlink = be32_to_cpup(p++);
296 fattr->uid = be32_to_cpup(p++);
297 fattr->gid = be32_to_cpup(p++);
298 fattr->size = be32_to_cpup(p++);
299 fattr->du.nfs2.blocksize = be32_to_cpup(p++);
300
301 rdev = be32_to_cpup(p++);
138 fattr->rdev = new_decode_dev(rdev); 302 fattr->rdev = new_decode_dev(rdev);
139 if (type == NFCHR && rdev == NFS2_FIFO_DEV) { 303 if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
140 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 304 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
141 fattr->rdev = 0; 305 fattr->rdev = 0;
142 } 306 }
143 return p; 307
308 fattr->du.nfs2.blocks = be32_to_cpup(p++);
309 fattr->fsid.major = be32_to_cpup(p++);
310 fattr->fsid.minor = 0;
311 fattr->fileid = be32_to_cpup(p++);
312
313 p = xdr_decode_time(p, &fattr->atime);
314 p = xdr_decode_time(p, &fattr->mtime);
315 xdr_decode_time(p, &fattr->ctime);
316 return 0;
317out_overflow:
318 print_overflow_msg(__func__, xdr);
319 return -EIO;
144} 320}
145 321
146static inline __be32 * 322/*
147xdr_encode_sattr(__be32 *p, struct iattr *attr) 323 * 2.3.6. sattr
148{ 324 *
149 const __be32 not_set = __constant_htonl(0xFFFFFFFF); 325 * struct sattr {
326 * unsigned int mode;
327 * unsigned int uid;
328 * unsigned int gid;
329 * unsigned int size;
330 * timeval atime;
331 * timeval mtime;
332 * };
333 */
150 334
151 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 335#define NFS2_SATTR_NOT_SET (0xffffffff)
152 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; 336
153 *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; 337static __be32 *xdr_time_not_set(__be32 *p)
154 *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set; 338{
339 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
340 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
341 return p;
342}
155 343
156 if (attr->ia_valid & ATTR_ATIME_SET) { 344static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
345{
346 __be32 *p;
347
348 p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
349
350 if (attr->ia_valid & ATTR_MODE)
351 *p++ = cpu_to_be32(attr->ia_mode);
352 else
353 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
354 if (attr->ia_valid & ATTR_UID)
355 *p++ = cpu_to_be32(attr->ia_uid);
356 else
357 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
358 if (attr->ia_valid & ATTR_GID)
359 *p++ = cpu_to_be32(attr->ia_gid);
360 else
361 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
362 if (attr->ia_valid & ATTR_SIZE)
363 *p++ = cpu_to_be32((u32)attr->ia_size);
364 else
365 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
366
367 if (attr->ia_valid & ATTR_ATIME_SET)
157 p = xdr_encode_time(p, &attr->ia_atime); 368 p = xdr_encode_time(p, &attr->ia_atime);
158 } else if (attr->ia_valid & ATTR_ATIME) { 369 else if (attr->ia_valid & ATTR_ATIME)
159 p = xdr_encode_current_server_time(p, &attr->ia_atime); 370 p = xdr_encode_current_server_time(p, &attr->ia_atime);
160 } else { 371 else
161 *p++ = not_set; 372 p = xdr_time_not_set(p);
162 *p++ = not_set; 373 if (attr->ia_valid & ATTR_MTIME_SET)
163 } 374 xdr_encode_time(p, &attr->ia_mtime);
164 375 else if (attr->ia_valid & ATTR_MTIME)
165 if (attr->ia_valid & ATTR_MTIME_SET) { 376 xdr_encode_current_server_time(p, &attr->ia_mtime);
166 p = xdr_encode_time(p, &attr->ia_mtime); 377 else
167 } else if (attr->ia_valid & ATTR_MTIME) { 378 xdr_time_not_set(p);
168 p = xdr_encode_current_server_time(p, &attr->ia_mtime);
169 } else {
170 *p++ = not_set;
171 *p++ = not_set;
172 }
173 return p;
174} 379}
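
A short usage sketch of the sentinel convention (the caller is hypothetical): a pure chmod leaves every other field untouched by sending NFS2_SATTR_NOT_SET in its place.

	static void example_encode_chmod(struct xdr_stream *xdr)
	{
		struct iattr attr = {
			.ia_valid = ATTR_MODE,
			.ia_mode  = S_IRUSR | S_IWUSR,
		};

		/* uid, gid, size and both timestamps all go out as
		 * 0xffffffff, which the server must ignore */
		encode_sattr(xdr, &attr);
	}
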
175 380
176/* 381/*
177 * NFS encode functions 382 * 2.3.7. filename
178 */ 383 *
179/* 384 * typedef string filename<MAXNAMLEN>;
180 * Encode file handle argument
181 * GETATTR, READLINK, STATFS
182 */ 385 */
183static int 386static void encode_filename(struct xdr_stream *xdr,
184nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) 387 const char *name, u32 length)
185{ 388{
186 p = xdr_encode_fhandle(p, fh); 389 __be32 *p;
187 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 390
391 BUG_ON(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length);
394}
395
396static int decode_filename_inline(struct xdr_stream *xdr,
397 const char **name, u32 *length)
398{
399 __be32 *p;
400 u32 count;
401
402 p = xdr_inline_decode(xdr, 4);
403 if (unlikely(p == NULL))
404 goto out_overflow;
405 count = be32_to_cpup(p);
406 if (count > NFS3_MAXNAMLEN)
407 goto out_nametoolong;
408 p = xdr_inline_decode(xdr, count);
409 if (unlikely(p == NULL))
410 goto out_overflow;
411 *name = (const char *)p;
412 *length = count;
188 return 0; 413 return 0;
414out_nametoolong:
415 dprintk("NFS: returned filename too long: %u\n", count);
416 return -ENAMETOOLONG;
417out_overflow:
418 print_overflow_msg(__func__, xdr);
419 return -EIO;
189} 420}
190 421
191/* 422/*
192 * Encode SETATTR arguments 423 * 2.3.8. path
424 *
425 * typedef string path<MAXPATHLEN>;
193 */ 426 */
194static int 427static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
195nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args) 428{
429 __be32 *p;
430
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length);
435}
436
437static int decode_path(struct xdr_stream *xdr)
196{ 438{
197 p = xdr_encode_fhandle(p, args->fh); 439 u32 length, recvd;
198 p = xdr_encode_sattr(p, args->sattr); 440 size_t hdrlen;
199 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 441 __be32 *p;
442
443 p = xdr_inline_decode(xdr, 4);
444 if (unlikely(p == NULL))
445 goto out_overflow;
446 length = be32_to_cpup(p);
447 if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
448 goto out_size;
449 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
450 recvd = xdr->buf->len - hdrlen;
451 if (unlikely(length > recvd))
452 goto out_cheating;
453
454 xdr_read_pages(xdr, length);
455 xdr_terminate_string(xdr->buf, length);
200 return 0; 456 return 0;
457out_size:
458 dprintk("NFS: returned pathname too long: %u\n", length);
459 return -ENAMETOOLONG;
460out_cheating:
461 dprintk("NFS: server cheating in pathname result: "
462 "length %u > received %u\n", length, recvd);
463 return -EIO;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
201} 467}
202 468
203/* 469/*
204 * Encode directory ops argument 470 * 2.3.9. attrstat
205 * LOOKUP, RMDIR 471 *
472 * union attrstat switch (stat status) {
473 * case NFS_OK:
474 * fattr attributes;
475 * default:
476 * void;
477 * };
206 */ 478 */
207static int 479static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
208nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
209{ 480{
210 p = xdr_encode_fhandle(p, args->fh); 481 enum nfs_stat status;
211 p = xdr_encode_array(p, args->name, args->len); 482 int error;
212 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 483
213 return 0; 484 error = decode_stat(xdr, &status);
485 if (unlikely(error))
486 goto out;
487 if (status != NFS_OK)
488 goto out_default;
489 error = decode_fattr(xdr, result);
490out:
491 return error;
492out_default:
493 return nfs_stat_to_errno(status);
214} 494}
215 495
216/* 496/*
217 * Encode REMOVE argument 497 * 2.3.10. diropargs
498 *
499 * struct diropargs {
500 * fhandle dir;
501 * filename name;
502 * };
218 */ 503 */
219static int 504static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
220nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) 505 const char *name, u32 length)
221{ 506{
222 p = xdr_encode_fhandle(p, args->fh); 507 encode_fhandle(xdr, fh);
223 p = xdr_encode_array(p, args->name.name, args->name.len); 508 encode_filename(xdr, name, length);
224 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
225 return 0;
226} 509}
227 510
228/* 511/*
229 * Arguments to a READ call. Since we read data directly into the page 512 * 2.3.11. diropres
230 * cache, we also set up the reply iovec here so that iov[1] points 513 *
231 * exactly to the page we want to fetch. 514 * union diropres switch (stat status) {
515 * case NFS_OK:
516 * struct {
517 * fhandle file;
518 * fattr attributes;
519 * } diropok;
520 * default:
521 * void;
522 * };
232 */ 523 */
233static int 524static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
235{ 525{
236 struct rpc_auth *auth = req->rq_cred->cr_auth; 526 int error;
237 unsigned int replen; 527
238 u32 offset = (u32)args->offset; 528 error = decode_fhandle(xdr, result->fh);
239 u32 count = args->count; 529 if (unlikely(error))
240 530 goto out;
241 p = xdr_encode_fhandle(p, args->fh); 531 error = decode_fattr(xdr, result->fattr);
242 *p++ = htonl(offset); 532out:
243 *p++ = htonl(count); 533 return error;
244 *p++ = htonl(count); 534}
245 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
246 535
247 /* Inline the page array */ 536static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
248 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 537{
249 xdr_inline_pages(&req->rq_rcv_buf, replen, 538 enum nfs_stat status;
250 args->pages, args->pgbase, count); 539 int error;
251 req->rq_rcv_buf.flags |= XDRBUF_READ; 540
252 return 0; 541 error = decode_stat(xdr, &status);
542 if (unlikely(error))
543 goto out;
544 if (status != NFS_OK)
545 goto out_default;
546 error = decode_diropok(xdr, result);
547out:
548 return error;
549out_default:
550 return nfs_stat_to_errno(status);
253} 551}
254 552
553
255/* 554/*
256 * Decode READ reply 555 * NFSv2 XDR encode functions
556 *
557 * NFSv2 argument types are defined in section 2.2 of RFC 1094:
558 * "NFS: Network File System Protocol Specification".
257 */ 559 */
258static int
259nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
260{
261 struct kvec *iov = req->rq_rcv_buf.head;
262 size_t hdrlen;
263 u32 count, recvd;
264 int status;
265
266 if ((status = ntohl(*p++)))
267 return nfs_stat_to_errno(status);
268 p = xdr_decode_fattr(p, res->fattr);
269
270 count = ntohl(*p++);
271 res->eof = 0;
272 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
273 if (iov->iov_len < hdrlen) {
274 dprintk("NFS: READ reply header overflowed:"
275 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
276 return -errno_NFSERR_IO;
277 } else if (iov->iov_len != hdrlen) {
278 dprintk("NFS: READ header is short. iovec will be shifted.\n");
279 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
280 }
281 560
282 recvd = req->rq_rcv_buf.len - hdrlen; 561static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
283 if (count > recvd) { 562 struct xdr_stream *xdr,
284 dprintk("NFS: server cheating in read reply: " 563 const struct nfs_fh *fh)
285 "count %u > recvd %u\n", count, recvd); 564{
286 count = recvd; 565 encode_fhandle(xdr, fh);
287 } 566}
288 567
289 dprintk("RPC: readres OK count %u\n", count); 568/*
290 if (count < res->count) 569 * 2.2.3. sattrargs
291 res->count = count; 570 *
571 * struct sattrargs {
572 * fhandle file;
573 * sattr attributes;
574 * };
575 */
576static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
577 struct xdr_stream *xdr,
578 const struct nfs_sattrargs *args)
579{
580 encode_fhandle(xdr, args->fh);
581 encode_sattr(xdr, args->sattr);
582}
292 583
293 return count; 584static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
585 struct xdr_stream *xdr,
586 const struct nfs_diropargs *args)
587{
588 encode_diropargs(xdr, args->fh, args->name, args->len);
294} 589}
295 590
591static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
592 struct xdr_stream *xdr,
593 const struct nfs_readlinkargs *args)
594{
595 encode_fhandle(xdr, args->fh);
596 prepare_reply_buffer(req, args->pages, args->pgbase,
597 args->pglen, NFS_readlinkres_sz);
598}
296 599
297/* 600/*
298 * Write arguments. Splice the buffer to be written into the iovec. 601 * 2.2.7. readargs
602 *
603 * struct readargs {
604 * fhandle file;
605 * unsigned offset;
606 * unsigned count;
607 * unsigned totalcount;
608 * };
299 */ 609 */
300static int 610static void encode_readargs(struct xdr_stream *xdr,
301nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 611 const struct nfs_readargs *args)
302{ 612{
303 struct xdr_buf *sndbuf = &req->rq_snd_buf; 613 u32 offset = args->offset;
304 u32 offset = (u32)args->offset;
305 u32 count = args->count; 614 u32 count = args->count;
615 __be32 *p;
306 616
307 p = xdr_encode_fhandle(p, args->fh); 617 encode_fhandle(xdr, args->fh);
308 *p++ = htonl(offset);
309 *p++ = htonl(offset);
310 *p++ = htonl(count);
311 *p++ = htonl(count);
312 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
313 618
314 /* Copy the page array */ 619 p = xdr_reserve_space(xdr, 4 + 4 + 4);
315 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 620 *p++ = cpu_to_be32(offset);
316 sndbuf->flags |= XDRBUF_WRITE; 621 *p++ = cpu_to_be32(count);
317 return 0; 622 *p = cpu_to_be32(count);
318} 623}
319 624
320/* 625static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
321 * Encode create arguments 626 struct xdr_stream *xdr,
322 * CREATE, MKDIR 627 const struct nfs_readargs *args)
323 */
324static int
325nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
326{ 628{
327 p = xdr_encode_fhandle(p, args->fh); 629 encode_readargs(xdr, args);
328 p = xdr_encode_array(p, args->name, args->len); 630 prepare_reply_buffer(req, args->pages, args->pgbase,
329 p = xdr_encode_sattr(p, args->sattr); 631 args->count, NFS_readres_sz);
330 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 632 req->rq_rcv_buf.flags |= XDRBUF_READ;
331 return 0;
332} 633}
333 634
334/* 635/*
335 * Encode RENAME arguments 636 * 2.2.9. writeargs
637 *
638 * struct writeargs {
639 * fhandle file;
640 * unsigned beginoffset;
641 * unsigned offset;
642 * unsigned totalcount;
643 * nfsdata data;
644 * };
336 */ 645 */
337static int 646static void encode_writeargs(struct xdr_stream *xdr,
338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 647 const struct nfs_writeargs *args)
339{ 648{
340 p = xdr_encode_fhandle(p, args->fromfh); 649 u32 offset = args->offset;
341 p = xdr_encode_array(p, args->fromname, args->fromlen); 650 u32 count = args->count;
342 p = xdr_encode_fhandle(p, args->tofh); 651 __be32 *p;
343 p = xdr_encode_array(p, args->toname, args->tolen); 652
344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 653 encode_fhandle(xdr, args->fh);
345 return 0; 654
655 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
656 *p++ = cpu_to_be32(offset);
657 *p++ = cpu_to_be32(offset);
658 *p++ = cpu_to_be32(count);
659
660 /* nfsdata */
661 *p = cpu_to_be32(count);
662 xdr_write_pages(xdr, args->pages, args->pgbase, count);
346} 663}
347 664
348/* 665static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
349 * Encode LINK arguments 666 struct xdr_stream *xdr,
350 */ 667 const struct nfs_writeargs *args)
351static int
352nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
353{ 668{
354 p = xdr_encode_fhandle(p, args->fromfh); 669 encode_writeargs(xdr, args);
355 p = xdr_encode_fhandle(p, args->tofh); 670 xdr->buf->flags |= XDRBUF_WRITE;
356 p = xdr_encode_array(p, args->toname, args->tolen);
357 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
358 return 0;
359} 671}
360 672
361/* 673/*
362 * Encode SYMLINK arguments 674 * 2.2.10. createargs
675 *
676 * struct createargs {
677 * diropargs where;
678 * sattr attributes;
679 * };
363 */ 680 */
364static int 681static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
365nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) 682 struct xdr_stream *xdr,
683 const struct nfs_createargs *args)
366{ 684{
367 struct xdr_buf *sndbuf = &req->rq_snd_buf; 685 encode_diropargs(xdr, args->fh, args->name, args->len);
368 size_t pad; 686 encode_sattr(xdr, args->sattr);
687}
369 688
370 p = xdr_encode_fhandle(p, args->fromfh); 689static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
371 p = xdr_encode_array(p, args->fromname, args->fromlen); 690 struct xdr_stream *xdr,
372 *p++ = htonl(args->pathlen); 691 const struct nfs_removeargs *args)
373 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); 692{
693 encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
694}
374 695
375 xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); 696/*
697 * 2.2.12. renameargs
698 *
699 * struct renameargs {
700 * diropargs from;
701 * diropargs to;
702 * };
703 */
704static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
705 struct xdr_stream *xdr,
706 const struct nfs_renameargs *args)
707{
708 const struct qstr *old = args->old_name;
709 const struct qstr *new = args->new_name;
376 710
377 /* 711 encode_diropargs(xdr, args->old_dir, old->name, old->len);
378 * xdr_encode_pages may have added a few bytes to ensure the 712 encode_diropargs(xdr, args->new_dir, new->name, new->len);
379 * pathname ends on a 4-byte boundary. Start encoding the
380 * attributes after the pad bytes.
381 */
382 pad = sndbuf->tail->iov_len;
383 if (pad > 0)
384 p++;
385 p = xdr_encode_sattr(p, args->sattr);
386 sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
387 return 0;
388} 713}
389 714
390/* 715/*
391 * Encode arguments to readdir call 716 * 2.2.13. linkargs
717 *
718 * struct linkargs {
719 * fhandle from;
720 * diropargs to;
721 * };
392 */ 722 */
393static int 723static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 724 struct xdr_stream *xdr,
725 const struct nfs_linkargs *args)
395{ 726{
396 struct rpc_auth *auth = req->rq_cred->cr_auth; 727 encode_fhandle(xdr, args->fromfh);
397 unsigned int replen; 728 encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
398 u32 count = args->count; 729}
399
400 p = xdr_encode_fhandle(p, args->fh);
401 *p++ = htonl(args->cookie);
402 *p++ = htonl(count); /* see above */
403 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
404 730
405 /* Inline the page array */ 731/*
406 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; 732 * 2.2.14. symlinkargs
407 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); 733 *
408 return 0; 734 * struct symlinkargs {
735 * diropargs from;
736 * path to;
737 * sattr attributes;
738 * };
739 */
740static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
741 struct xdr_stream *xdr,
742 const struct nfs_symlinkargs *args)
743{
744 encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
745 encode_path(xdr, args->pages, args->pathlen);
746 encode_sattr(xdr, args->sattr);
409} 747}
410 748
411/* 749/*
412 * Decode the result of a readdir call. 750 * 2.2.17. readdirargs
413 * We're not really decoding anymore, we just leave the buffer untouched 751 *
414 * and only check that it is syntactically correct. 752 * struct readdirargs {
415 * The real decoding happens in nfs_decode_entry below, called directly 753 * fhandle dir;
416 * from nfs_readdir for each entry. 754 * nfscookie cookie;
755 * unsigned count;
756 * };
417 */ 757 */
418static int 758static void encode_readdirargs(struct xdr_stream *xdr,
419nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) 759 const struct nfs_readdirargs *args)
420{ 760{
421 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 761 __be32 *p;
422 struct kvec *iov = rcvbuf->head;
423 struct page **page;
424 size_t hdrlen;
425 unsigned int pglen, recvd;
426 u32 len;
427 int status, nr = 0;
428 __be32 *end, *entry, *kaddr;
429
430 if ((status = ntohl(*p++)))
431 return nfs_stat_to_errno(status);
432
433 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
434 if (iov->iov_len < hdrlen) {
435 dprintk("NFS: READDIR reply header overflowed:"
436 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
437 return -errno_NFSERR_IO;
438 } else if (iov->iov_len != hdrlen) {
439 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
440 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
441 }
442 762
443 pglen = rcvbuf->page_len; 763 encode_fhandle(xdr, args->fh);
444 recvd = rcvbuf->len - hdrlen;
445 if (pglen > recvd)
446 pglen = recvd;
447 page = rcvbuf->pages;
448 kaddr = p = kmap_atomic(*page, KM_USER0);
449 end = (__be32 *)((char *)p + pglen);
450 entry = p;
451
452 /* Make sure the packet actually has a value_follows and EOF entry */
453 if ((entry + 1) > end)
454 goto short_pkt;
455
456 for (; *p++; nr++) {
457 if (p + 2 > end)
458 goto short_pkt;
459 p++; /* fileid */
460 len = ntohl(*p++);
461 p += XDR_QUADLEN(len) + 1; /* name plus cookie */
462 if (len > NFS2_MAXNAMLEN) {
463 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
464 len);
465 goto err_unmap;
466 }
467 if (p + 2 > end)
468 goto short_pkt;
469 entry = p;
470 }
471 764
472 /* 765 p = xdr_reserve_space(xdr, 4 + 4);
473 * Apparently some server sends responses that are a valid size, but 766 *p++ = cpu_to_be32(args->cookie);
474 * contain no entries, and have value_follows==0 and EOF==0. For 767 *p = cpu_to_be32(args->count);
475 * those, just set the EOF marker.
476 */
477 if (!nr && entry[1] == 0) {
478 dprintk("NFS: readdir reply truncated!\n");
479 entry[1] = 1;
480 }
481 out:
482 kunmap_atomic(kaddr, KM_USER0);
483 return nr;
484 short_pkt:
485 /*
486 * When we get a short packet there are 2 possibilities. We can
487 * return an error, or fix up the response to look like a valid
488 * response and return what we have so far. If there are no
489 * entries and the packet was short, then return -EIO. If there
490 * are valid entries in the response, return them and pretend that
491 * the call was successful, but incomplete. The caller can retry the
492 * readdir starting at the last cookie.
493 */
494 entry[0] = entry[1] = 0;
495 if (!nr)
496 nr = -errno_NFSERR_IO;
497 goto out;
498err_unmap:
499 nr = -errno_NFSERR_IO;
500 goto out;
501} 768}
502 769
503__be32 * 770static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
504nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 771 struct xdr_stream *xdr,
772 const struct nfs_readdirargs *args)
505{ 773{
506 if (!*p++) { 774 encode_readdirargs(xdr, args);
507 if (!*p) 775 prepare_reply_buffer(req, args->pages, 0,
508 return ERR_PTR(-EAGAIN); 776 args->count, NFS_readdirres_sz);
509 entry->eof = 1;
510 return ERR_PTR(-EBADCOOKIE);
511 }
512
513 entry->ino = ntohl(*p++);
514 entry->len = ntohl(*p++);
515 entry->name = (const char *) p;
516 p += XDR_QUADLEN(entry->len);
517 entry->prev_cookie = entry->cookie;
518 entry->cookie = ntohl(*p++);
519 entry->eof = !p[0] && p[1];
520
521 return p;
522} 777}
523 778
524/* 779/*
525 * NFS XDR decode functions 780 * NFSv2 XDR decode functions
526 */ 781 *
527/* 782 * NFSv2 result types are defined in section 2.2 of RFC 1094:
528 * Decode simple status reply 783 * "NFS: Network File System Protocol Specification".
529 */ 784 */
530static int 785
531nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) 786static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
787 void *__unused)
532{ 788{
533 int status; 789 enum nfs_stat status;
790 int error;
791
792 error = decode_stat(xdr, &status);
793 if (unlikely(error))
794 goto out;
795 if (status != NFS_OK)
796 goto out_default;
797out:
798 return error;
799out_default:
800 return nfs_stat_to_errno(status);
801}
534 802
535 if ((status = ntohl(*p++)) != 0) 803static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
536 status = nfs_stat_to_errno(status); 804 struct nfs_fattr *result)
537 return status; 805{
806 return decode_attrstat(xdr, result);
807}
808
809static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
810 struct nfs_diropok *result)
811{
812 return decode_diropres(xdr, result);
538} 813}
539 814
540/* 815/*
541 * Decode attrstat reply 816 * 2.2.6. readlinkres
542 * GETATTR, SETATTR, WRITE 817 *
818 * union readlinkres switch (stat status) {
819 * case NFS_OK:
820 * path data;
821 * default:
822 * void;
823 * };
543 */ 824 */
544static int 825static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
545nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 826 struct xdr_stream *xdr, void *__unused)
546{ 827{
547 int status; 828 enum nfs_stat status;
548 829 int error;
549 if ((status = ntohl(*p++))) 830
550 return nfs_stat_to_errno(status); 831 error = decode_stat(xdr, &status);
551 xdr_decode_fattr(p, fattr); 832 if (unlikely(error))
552 return 0; 833 goto out;
834 if (status != NFS_OK)
835 goto out_default;
836 error = decode_path(xdr);
837out:
838 return error;
839out_default:
840 return nfs_stat_to_errno(status);
553} 841}
554 842
555/* 843/*
556 * Decode diropres reply 844 * 2.2.7. readres
557 * LOOKUP, CREATE, MKDIR 845 *
846 * union readres switch (stat status) {
847 * case NFS_OK:
848 * fattr attributes;
849 * nfsdata data;
850 * default:
851 * void;
852 * };
558 */ 853 */
559static int 854static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
560nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) 855 struct nfs_readres *result)
561{ 856{
562 int status; 857 enum nfs_stat status;
858 int error;
859
860 error = decode_stat(xdr, &status);
861 if (unlikely(error))
862 goto out;
863 if (status != NFS_OK)
864 goto out_default;
865 error = decode_fattr(xdr, result->fattr);
866 if (unlikely(error))
867 goto out;
868 error = decode_nfsdata(xdr, result);
869out:
870 return error;
871out_default:
872 return nfs_stat_to_errno(status);
873}
563 874
564 if ((status = ntohl(*p++))) 875static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
565 return nfs_stat_to_errno(status); 876 struct nfs_writeres *result)
566 p = xdr_decode_fhandle(p, res->fh); 877{
567 xdr_decode_fattr(p, res->fattr); 878 /* All NFSv2 writes are "file sync" writes */
568 return 0; 879 result->verf->committed = NFS_FILE_SYNC;
880 return decode_attrstat(xdr, result->fattr);
569} 881}
570 882
571/* 883/**
572 * Encode READLINK args 884 * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
885 * the local page cache.
886 * @xdr: XDR stream where entry resides
887 * @entry: buffer to fill in with entry data
888 * @plus: boolean indicating whether this should be a readdirplus entry
889 *
890 * Returns zero if successful, otherwise a negative errno value is
891 * returned.
892 *
893 * This function is not invoked during READDIR reply decoding, but
894 * rather whenever an application invokes the getdents(2) system call
895 * on a directory already in our cache.
896 *
897 * 2.2.17. entry
898 *
899 * struct entry {
900 * unsigned fileid;
901 * filename name;
902 * nfscookie cookie;
903 * entry *nextentry;
904 * };
573 */ 905 */
574static int 906int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
575nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 907 int plus)
576{ 908{
577 struct rpc_auth *auth = req->rq_cred->cr_auth; 909 __be32 *p;
578 unsigned int replen; 910 int error;
911
912 p = xdr_inline_decode(xdr, 4);
913 if (unlikely(p == NULL))
914 goto out_overflow;
915 if (*p++ == xdr_zero) {
916 p = xdr_inline_decode(xdr, 4);
917 if (unlikely(p == NULL))
918 goto out_overflow;
919 if (*p++ == xdr_zero)
920 return -EAGAIN;
921 entry->eof = 1;
922 return -EBADCOOKIE;
923 }
924
925 p = xdr_inline_decode(xdr, 4);
926 if (unlikely(p == NULL))
927 goto out_overflow;
928 entry->ino = be32_to_cpup(p);
929
930 error = decode_filename_inline(xdr, &entry->name, &entry->len);
931 if (unlikely(error))
932 return error;
933
934 /*
935 * The type (size and byte order) of nfscookie isn't defined in
936 * RFC 1094. This implementation assumes that it's an XDR uint32.
937 */
938 entry->prev_cookie = entry->cookie;
939 p = xdr_inline_decode(xdr, 4);
940 if (unlikely(p == NULL))
941 goto out_overflow;
942 entry->cookie = be32_to_cpup(p);
579 943
580 p = xdr_encode_fhandle(p, args->fh); 944 entry->d_type = DT_UNKNOWN;
581 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
582 945
583 /* Inline the page array */
584 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
585 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
586 return 0; 946 return 0;
947
948out_overflow:
949 print_overflow_msg(__func__, xdr);
950 return -EAGAIN;
587} 951}
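
A minimal sketch of how a caller drains one cached directory page with the decoder above (the loop and helper are illustrative, not the actual nfs_readdir() plumbing):

	static int example_drain_entries(struct xdr_stream *xdr,
					 struct nfs_entry *entry)
	{
		int error;

		for (;;) {
			error = nfs2_decode_dirent(xdr, entry, 0);
			if (error == -EAGAIN)
				return 0;	/* page exhausted, not EOF */
			if (error == -EBADCOOKIE && entry->eof)
				return 0;	/* true end of directory */
			if (error)
				return error;
			/* consume entry->ino / entry->name here */
		}
	}
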
588 952
589/* 953/*
590 * Decode READLINK reply 954 * 2.2.17. readdirres
955 *
956 * union readdirres switch (stat status) {
957 * case NFS_OK:
958 * struct {
959 * entry *entries;
960 * bool eof;
961 * } readdirok;
962 * default:
963 * void;
964 * };
965 *
966 * Read the directory contents into the page cache, but don't
967 * touch them. The actual decoding is done by nfs2_decode_dirent()
968 * during subsequent nfs_readdir() calls.
591 */ 969 */
592static int 970static int decode_readdirok(struct xdr_stream *xdr)
593nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
594{ 971{
595 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 972 u32 recvd, pglen;
596 struct kvec *iov = rcvbuf->head;
597 size_t hdrlen; 973 size_t hdrlen;
598 u32 len, recvd;
599 char *kaddr;
600 int status;
601
602 if ((status = ntohl(*p++)))
603 return nfs_stat_to_errno(status);
604 /* Convert length of symlink */
605 len = ntohl(*p++);
606 if (len >= rcvbuf->page_len) {
607 dprintk("nfs: server returned giant symlink!\n");
608 return -ENAMETOOLONG;
609 }
610 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
611 if (iov->iov_len < hdrlen) {
612 dprintk("NFS: READLINK reply header overflowed:"
613 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
614 return -errno_NFSERR_IO;
615 } else if (iov->iov_len != hdrlen) {
616 dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
617 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
618 }
619 recvd = req->rq_rcv_buf.len - hdrlen;
620 if (recvd < len) {
621 dprintk("NFS: server cheating in readlink reply: "
622 "count %u > recvd %u\n", len, recvd);
623 return -EIO;
624 }
625 974
626 /* NULL terminate the string we got */ 975 pglen = xdr->buf->page_len;
627 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); 976 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
628 kaddr[len+rcvbuf->page_base] = '\0'; 977 recvd = xdr->buf->len - hdrlen;
629 kunmap_atomic(kaddr, KM_USER0); 978 if (unlikely(pglen > recvd))
630 return 0; 979 goto out_cheating;
980out:
981 xdr_read_pages(xdr, pglen);
982 return pglen;
983out_cheating:
984 dprintk("NFS: server cheating in readdir result: "
985 "pglen %u > recvd %u\n", pglen, recvd);
986 pglen = recvd;
987 goto out;
631} 988}
632 989
633/* 990static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
634 * Decode WRITE reply 991 struct xdr_stream *xdr, void *__unused)
635 */
636static int
637nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
638{ 992{
639 res->verf->committed = NFS_FILE_SYNC; 993 enum nfs_stat status;
640 return nfs_xdr_attrstat(req, p, res->fattr); 994 int error;
995
996 error = decode_stat(xdr, &status);
997 if (unlikely(error))
998 goto out;
999 if (status != NFS_OK)
1000 goto out_default;
1001 error = decode_readdirok(xdr);
1002out:
1003 return error;
1004out_default:
1005 return nfs_stat_to_errno(status);
641} 1006}
642 1007
643/* 1008/*
644 * Decode STATFS reply 1009 * 2.2.18. statfsres
1010 *
1011 * union statfsres (stat status) {
1012 * case NFS_OK:
1013 * struct {
1014 * unsigned tsize;
1015 * unsigned bsize;
1016 * unsigned blocks;
1017 * unsigned bfree;
1018 * unsigned bavail;
1019 * } info;
1020 * default:
1021 * void;
1022 * };
645 */ 1023 */
646static int 1024static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
647nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
648{ 1025{
649 int status; 1026 __be32 *p;
650 1027
651 if ((status = ntohl(*p++))) 1028 p = xdr_inline_decode(xdr, NFS_info_sz << 2);
652 return nfs_stat_to_errno(status); 1029 if (unlikely(p == NULL))
653 1030 goto out_overflow;
654 res->tsize = ntohl(*p++); 1031 result->tsize = be32_to_cpup(p++);
655 res->bsize = ntohl(*p++); 1032 result->bsize = be32_to_cpup(p++);
656 res->blocks = ntohl(*p++); 1033 result->blocks = be32_to_cpup(p++);
657 res->bfree = ntohl(*p++); 1034 result->bfree = be32_to_cpup(p++);
658 res->bavail = ntohl(*p++); 1035 result->bavail = be32_to_cpup(p);
659 return 0; 1036 return 0;
1037out_overflow:
1038 print_overflow_msg(__func__, xdr);
1039 return -EIO;
1040}
1041
1042static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
1043 struct nfs2_fsstat *result)
1044{
1045 enum nfs_stat status;
1046 int error;
1047
1048 error = decode_stat(xdr, &status);
1049 if (unlikely(error))
1050 goto out;
1051 if (status != NFS_OK)
1052 goto out_default;
1053 error = decode_info(xdr, result);
1054out:
1055 return error;
1056out_default:
1057 return nfs_stat_to_errno(status);
660} 1058}
661 1059
1060
662/* 1061/*
663 * We need to translate between nfs status return values and 1062 * We need to translate between nfs status return values and
664 * the local errno values which may not be the same. 1063 * the local errno values which may not be the same.
665 */ 1064 */
666static struct { 1065static const struct {
667 int stat; 1066 int stat;
668 int errno; 1067 int errno;
669} nfs_errtbl[] = { 1068} nfs_errtbl[] = {
@@ -703,28 +1102,30 @@ static struct {
703 { -1, -EIO } 1102 { -1, -EIO }
704}; 1103};
705 1104
706/* 1105/**
707 * Convert an NFS error code to a local one. 1106 * nfs_stat_to_errno - convert an NFS status code to a local errno
708 * This one is used jointly by NFSv2 and NFSv3. 1107 * @status: NFS status code to convert
1108 *
1109 * Returns a local errno value, or -EIO if the NFS status code is
1110 * not recognized. This function is used jointly by NFSv2 and NFSv3.
709 */ 1111 */
710int 1112int nfs_stat_to_errno(enum nfs_stat status)
711nfs_stat_to_errno(int stat)
712{ 1113{
713 int i; 1114 int i;
714 1115
715 for (i = 0; nfs_errtbl[i].stat != -1; i++) { 1116 for (i = 0; nfs_errtbl[i].stat != -1; i++) {
716 if (nfs_errtbl[i].stat == stat) 1117 if (nfs_errtbl[i].stat == (int)status)
717 return nfs_errtbl[i].errno; 1118 return nfs_errtbl[i].errno;
718 } 1119 }
719 dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 1120 dprintk("NFS: Unrecognized nfs status value: %u\n", status);
720 return nfs_errtbl[i].errno; 1121 return nfs_errtbl[i].errno;
721} 1122}
722 1123
723#define PROC(proc, argtype, restype, timer) \ 1124#define PROC(proc, argtype, restype, timer) \
724[NFSPROC_##proc] = { \ 1125[NFSPROC_##proc] = { \
725 .p_proc = NFSPROC_##proc, \ 1126 .p_proc = NFSPROC_##proc, \
726 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ 1127 .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \
727 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ 1128 .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \
728 .p_arglen = NFS_##argtype##_sz, \ 1129 .p_arglen = NFS_##argtype##_sz, \
729 .p_replen = NFS_##restype##_sz, \ 1130 .p_replen = NFS_##restype##_sz, \
730 .p_timer = timer, \ 1131 .p_timer = timer, \
@@ -732,21 +1133,21 @@ nfs_stat_to_errno(int stat)
732 .p_name = #proc, \ 1133 .p_name = #proc, \
733 } 1134 }
734struct rpc_procinfo nfs_procedures[] = { 1135struct rpc_procinfo nfs_procedures[] = {
735 PROC(GETATTR, fhandle, attrstat, 1), 1136 PROC(GETATTR, fhandle, attrstat, 1),
736 PROC(SETATTR, sattrargs, attrstat, 0), 1137 PROC(SETATTR, sattrargs, attrstat, 0),
737 PROC(LOOKUP, diropargs, diropres, 2), 1138 PROC(LOOKUP, diropargs, diropres, 2),
738 PROC(READLINK, readlinkargs, readlinkres, 3), 1139 PROC(READLINK, readlinkargs, readlinkres, 3),
739 PROC(READ, readargs, readres, 3), 1140 PROC(READ, readargs, readres, 3),
740 PROC(WRITE, writeargs, writeres, 4), 1141 PROC(WRITE, writeargs, writeres, 4),
741 PROC(CREATE, createargs, diropres, 0), 1142 PROC(CREATE, createargs, diropres, 0),
742 PROC(REMOVE, removeargs, stat, 0), 1143 PROC(REMOVE, removeargs, stat, 0),
743 PROC(RENAME, renameargs, stat, 0), 1144 PROC(RENAME, renameargs, stat, 0),
744 PROC(LINK, linkargs, stat, 0), 1145 PROC(LINK, linkargs, stat, 0),
745 PROC(SYMLINK, symlinkargs, stat, 0), 1146 PROC(SYMLINK, symlinkargs, stat, 0),
746 PROC(MKDIR, createargs, diropres, 0), 1147 PROC(MKDIR, createargs, diropres, 0),
747 PROC(RMDIR, diropargs, stat, 0), 1148 PROC(RMDIR, diropargs, stat, 0),
748 PROC(READDIR, readdirargs, readdirres, 3), 1149 PROC(READDIR, readdirargs, readdirres, 3),
749 PROC(STATFS, fhandle, statfsres, 0), 1150 PROC(STATFS, fhandle, statfsres, 0),
750}; 1151};
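
A worked expansion of one table row under the PROC() macro above (fields hidden by the hunk context are elided):

	[NFSPROC_GETATTR] = {
		.p_proc   = NFSPROC_GETATTR,
		.p_encode = (kxdreproc_t)nfs2_xdr_enc_fhandle,
		.p_decode = (kxdrdproc_t)nfs2_xdr_dec_attrstat,
		.p_arglen = NFS_fhandle_sz,
		.p_replen = NFS_attrstat_sz,
		.p_timer  = 1,
		/* ... */
		.p_name   = "GETATTR",
	},
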
751 1152
752struct rpc_version nfs_version2 = { 1153struct rpc_version nfs_version2 = {
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e2..274342771655 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
311 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 311 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
312 goto out; 312 goto out;
313 313
314 /* We are doing this here, because XDR marshalling can only 314 /* We are doing this here because XDR marshalling does not
315 return -ENOMEM. */ 315 * return any results, it BUGs. */
316 status = -ENOSPC; 316 status = -ENOSPC;
317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) 317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
318 goto out; 318 goto out;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
313 */ 313 */
314static int 314static int
315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nameidata *nd) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
438 return 1; 438 return 1;
439} 439}
440 440
441static void
442nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
443{
444 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
445}
446
447static int
448nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
449 struct inode *new_dir)
450{
451 struct nfs_renameres *res;
452
453 if (nfs3_async_handle_jukebox(task, old_dir))
454 return 0;
455 res = task->tk_msg.rpc_resp;
456
457 nfs_post_op_update_inode(old_dir, res->old_fattr);
458 nfs_post_op_update_inode(new_dir, res->new_fattr);
459 return 1;
460}
461
441static int 462static int
442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 463nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
443 struct inode *new_dir, struct qstr *new_name) 464 struct inode *new_dir, struct qstr *new_name)
444{ 465{
445 struct nfs3_renameargs arg = { 466 struct nfs_renameargs arg = {
446 .fromfh = NFS_FH(old_dir), 467 .old_dir = NFS_FH(old_dir),
447 .fromname = old_name->name, 468 .old_name = old_name,
448 .fromlen = old_name->len, 469 .new_dir = NFS_FH(new_dir),
449 .tofh = NFS_FH(new_dir), 470 .new_name = new_name,
450 .toname = new_name->name,
451 .tolen = new_name->len
452 }; 471 };
453 struct nfs3_renameres res; 472 struct nfs_renameres res;
454 struct rpc_message msg = { 473 struct rpc_message msg = {
455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 474 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
456 .rpc_argp = &arg, 475 .rpc_argp = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460 479
461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 480 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
462 481
463 res.fromattr = nfs_alloc_fattr(); 482 res.old_fattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr(); 483 res.new_fattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL) 484 if (res.old_fattr == NULL || res.new_fattr == NULL)
466 goto out; 485 goto out;
467 486
468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 487 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
469 nfs_post_op_update_inode(old_dir, res.fromattr); 488 nfs_post_op_update_inode(old_dir, res.old_fattr);
470 nfs_post_op_update_inode(new_dir, res.toattr); 489 nfs_post_op_update_inode(new_dir, res.new_fattr);
471out: 490out:
472 nfs_free_fattr(res.toattr); 491 nfs_free_fattr(res.old_fattr);
473 nfs_free_fattr(res.fromattr); 492 nfs_free_fattr(res.new_fattr);
474 dprintk("NFS reply rename: %d\n", status); 493 dprintk("NFS reply rename: %d\n", status);
475 return status; 494 return status;
476} 495}
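The hunk above drops the NFSv3-private nfs3_renameargs/nfs3_renameres pair in favour of protocol-generic types shared with the other NFS versions. Reconstructed purely from the fields this function touches (the authoritative definitions live in include/linux/nfs_xdr.h and may carry extra members), the new pair looks roughly like:

    struct nfs_renameargs {
            const struct nfs_fh     *old_dir;
            const struct nfs_fh     *new_dir;
            const struct qstr       *old_name;
            const struct qstr       *new_name;
    };

    struct nfs_renameres {
            struct nfs_fattr        *old_fattr;     /* post-op attrs, source dir */
            struct nfs_fattr        *new_fattr;     /* post-op attrs, target dir */
    };

Sharing one argument type across versions is what lets the rename_setup/rename_done hooks added above plug into a version-independent async rename path.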
@@ -611,7 +630,7 @@ out:
611 */ 630 */
612static int 631static int
613nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 632nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
614 u64 cookie, struct page *page, unsigned int count, int plus) 633 u64 cookie, struct page **pages, unsigned int count, int plus)
615{ 634{
616 struct inode *dir = dentry->d_inode; 635 struct inode *dir = dentry->d_inode;
617 __be32 *verf = NFS_COOKIEVERF(dir); 636 __be32 *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
621 .verf = {verf[0], verf[1]}, 640 .verf = {verf[0], verf[1]},
622 .plus = plus, 641 .plus = plus,
623 .count = count, 642 .count = count,
624 .pages = &page 643 .pages = pages
625 }; 644 };
626 struct nfs3_readdirres res = { 645 struct nfs3_readdirres res = {
627 .verf = verf, 646 .verf = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
652 671
653 nfs_free_fattr(res.dir_attr); 672 nfs_free_fattr(res.dir_attr);
654out: 673out:
655 dprintk("NFS reply readdir: %d\n", status); 674 dprintk("NFS reply readdir%s: %d\n",
675 plus? "plus" : "", status);
656 return status; 676 return status;
657} 677}
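The signature change above (struct page *page becomes struct page **pages) lets one READDIR call fill more than a single page: the args now alias the caller's whole page array instead of wrapping one page in a local temporary. Schematically, callers go from

    /* before: */ nfs3_proc_readdir(dentry, cred, cookie, page,  count, plus);
    /* after:  */ nfs3_proc_readdir(dentry, cred, cookie, pages, count, plus);

with count still expressed in bytes of reply buffer, not in pages.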
658 678
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
722 dprintk("NFS call fsstat\n"); 742 dprintk("NFS call fsstat\n");
723 nfs_fattr_init(stat->fattr); 743 nfs_fattr_init(stat->fattr);
724 status = rpc_call_sync(server->client, &msg, 0); 744 status = rpc_call_sync(server->client, &msg, 0);
725 dprintk("NFS reply statfs: %d\n", status); 745 dprintk("NFS reply fsstat: %d\n", status);
726 return status; 746 return status;
727} 747}
728 748
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
844 .unlink_setup = nfs3_proc_unlink_setup, 864 .unlink_setup = nfs3_proc_unlink_setup,
845 .unlink_done = nfs3_proc_unlink_done, 865 .unlink_done = nfs3_proc_unlink_done,
846 .rename = nfs3_proc_rename, 866 .rename = nfs3_proc_rename,
867 .rename_setup = nfs3_proc_rename_setup,
868 .rename_done = nfs3_proc_rename_done,
847 .link = nfs3_proc_link, 869 .link = nfs3_proc_link,
848 .symlink = nfs3_proc_symlink, 870 .symlink = nfs3_proc_symlink,
849 .mkdir = nfs3_proc_mkdir, 871 .mkdir = nfs3_proc_mkdir,
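With .rename_setup and .rename_done wired into nfs_v3_clientops, generic client code can drive a rename without knowing which protocol version backs the inode. A minimal control-flow sketch, assuming the usual NFS_PROTO() accessor and eliding the actual rpc_task submission:

    static int sketch_async_rename(struct inode *old_dir, struct inode *new_dir,
                                   struct rpc_message *msg, struct rpc_task *task)
    {
            const struct nfs_rpc_ops *ops = NFS_PROTO(old_dir);

            ops->rename_setup(msg, old_dir);   /* v3: selects NFS3PROC_RENAME */
            /* ... run the task to completion ... */
            if (ops->rename_done(task, old_dir, new_dir) == 0)
                    return -EAGAIN;            /* v3: jukebox, call was requeued */
            return 0;
    }

This mirrors how the unlink_setup/unlink_done pair directly above it is already consumed.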
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..183c6b123d0f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
37#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) 37#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
38#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) 38#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
39#define NFS3_fattr_sz (21) 39#define NFS3_fattr_sz (21)
40#define NFS3_wcc_attr_sz (6) 40#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2)
41#define NFS3_wcc_attr_sz (6)
41#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) 42#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz)
42#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) 43#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz)
43#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) 44#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
44#define NFS3_fsstat_sz
45#define NFS3_fsinfo_sz
46#define NFS3_pathconf_sz
47#define NFS3_entry_sz (NFS3_filename_sz+3)
48
49#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
50#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) 45#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz)
51#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) 46
47#define NFS3_getattrargs_sz (NFS3_fh_sz)
48#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
49#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz)
52#define NFS3_accessargs_sz (NFS3_fh_sz+1) 50#define NFS3_accessargs_sz (NFS3_fh_sz+1)
53#define NFS3_readlinkargs_sz (NFS3_fh_sz) 51#define NFS3_readlinkargs_sz (NFS3_fh_sz)
54#define NFS3_readargs_sz (NFS3_fh_sz+3) 52#define NFS3_readargs_sz (NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
57#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) 55#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
58#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) 56#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz)
59#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) 57#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz)
58#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz)
60#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) 59#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz)
61#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) 60#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz)
62#define NFS3_readdirargs_sz (NFS3_fh_sz+2) 61#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3)
62#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
63#define NFS3_commitargs_sz (NFS3_fh_sz+3) 63#define NFS3_commitargs_sz (NFS3_fh_sz+3)
64 64
65#define NFS3_attrstat_sz (1+NFS3_fattr_sz) 65#define NFS3_getattrres_sz (1+NFS3_fattr_sz)
66#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) 66#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz)
67#define NFS3_removeres_sz (NFS3_wccstat_sz) 67#define NFS3_removeres_sz (NFS3_setattrres_sz)
68#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) 68#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
69#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) 69#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
70#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) 70#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1)
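All of these _sz macros count 32-bit XDR words, not bytes. A standalone arithmetic check for the two readdir request sizes introduced above (constants per RFC 1813: NFS3_FHSIZE is 64 bytes, NFS3_COOKIEVERFSIZE is 8, and the file defines NFS3_fh_sz as one length word plus 16 data words):

    #include <stdio.h>

    #define XDR_QUADLEN(n)          (((n) + 3) >> 2)
    #define NFS3_fh_sz              (1 + XDR_QUADLEN(64))        /* 17 words */
    #define NFS3_cookieverf_sz      XDR_QUADLEN(8)               /*  2 words */
    /* +3 = cookie3 (2 words) + count (1); readdirplus adds a dircount word */
    #define NFS3_readdirargs_sz     (NFS3_fh_sz + NFS3_cookieverf_sz + 3)
    #define NFS3_readdirplusargs_sz (NFS3_fh_sz + NFS3_cookieverf_sz + 4)

    int main(void)
    {
            printf("READDIR3args:     %d words, %d bytes\n",
                   NFS3_readdirargs_sz, NFS3_readdirargs_sz << 2);         /* 22, 88 */
            printf("READDIRPLUS3args: %d words, %d bytes\n",
                   NFS3_readdirplusargs_sz, NFS3_readdirplusargs_sz << 2); /* 23, 92 */
            return 0;
    }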
@@ -101,1074 +101,2364 @@ static const umode_t nfs_type2fmt[] = {
101}; 101};
102 102
103/* 103/*
104 * Common NFS XDR functions as inlines 104 * While encoding arguments, set up the reply buffer in advance to
105 * receive reply data directly into the page cache.
105 */ 106 */
106static inline __be32 * 107static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
107xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) 108 unsigned int base, unsigned int len,
109 unsigned int bufsize)
108{ 110{
109 return xdr_encode_array(p, fh->data, fh->size); 111 struct rpc_auth *auth = req->rq_cred->cr_auth;
112 unsigned int replen;
113
114 replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
115 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
110} 116}
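Note the units in prepare_reply_buffer(): bufsize and the _sz macros are in 32-bit words, so replen << 2 converts the word count into the byte offset at which xdr_inline_pages() splices the caller's pages into the receive buffer. Schematically (au_rslack varies with the RPC auth flavor in use):

    /*
     * page data starts at byte offset
     *     4 * (RPC_REPHDRSIZE       RPC reply header
     *          + auth->au_rslack    credential/verifier slack
     *          + bufsize)           fixed-length part of the NFS reply
     *
     * so the fixed reply fields land in the head iovec, and the bulk
     * payload (READ data, READDIR entries, symlink text) is received
     * directly into the page cache.
     */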
111 117
112static inline __be32 * 118/*
113xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) 119 * Handle decode buffer overflows out-of-line.
120 */
121static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
114{ 122{
115 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { 123 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
116 memcpy(fh->data, p, fh->size); 124 "Remaining buffer length is %tu words.\n",
117 return p + XDR_QUADLEN(fh->size); 125 func, xdr->end - xdr->p);
118 }
119 return NULL;
120} 126}
121 127
128
122/* 129/*
123 * Encode/decode time. 130 * Encode/decode NFSv3 basic data types
131 *
132 * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
133 * "NFS Version 3 Protocol Specification".
134 *
135 * Not all basic data types have their own encoding and decoding
136 * functions. For run-time efficiency, some data types are encoded
137 * or decoded inline.
124 */ 138 */
125static inline __be32 * 139
126xdr_encode_time3(__be32 *p, struct timespec *timep) 140static void encode_uint32(struct xdr_stream *xdr, u32 value)
127{ 141{
128 *p++ = htonl(timep->tv_sec); 142 __be32 *p = xdr_reserve_space(xdr, 4);
129 *p++ = htonl(timep->tv_nsec); 143 *p = cpu_to_be32(value);
130 return p;
131} 144}
132 145
133static inline __be32 * 146static int decode_uint32(struct xdr_stream *xdr, u32 *value)
134xdr_decode_time3(__be32 *p, struct timespec *timep)
135{ 147{
136 timep->tv_sec = ntohl(*p++); 148 __be32 *p;
137 timep->tv_nsec = ntohl(*p++); 149
138 return p; 150 p = xdr_inline_decode(xdr, 4);
151 if (unlikely(p == NULL))
152 goto out_overflow;
153 *value = be32_to_cpup(p);
154 return 0;
155out_overflow:
156 print_overflow_msg(__func__, xdr);
157 return -EIO;
139} 158}
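Every decoder in this file follows the same defensive pattern: ask xdr_inline_decode() for exactly the bytes it needs, treat NULL as a truncated reply, then convert from big-endian wire order. A self-contained user-space analogue of decode_uint32() (demo types only, not the kernel API):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* Toy stream with the same failure contract as the kernel helper:
     * short data yields an error instead of reading past the end. */
    struct demo_stream {
            const unsigned char *p;
            const unsigned char *end;
    };

    static int demo_decode_uint32(struct demo_stream *xdr, uint32_t *value)
    {
            uint32_t raw;

            if (xdr->end - xdr->p < 4)
                    return -1;              /* stands in for -EIO */
            memcpy(&raw, xdr->p, 4);        /* may be unaligned */
            xdr->p += 4;
            *value = ntohl(raw);            /* XDR is big-endian */
            return 0;
    }

    int main(void)
    {
            const unsigned char wire[] = { 0x00, 0x00, 0x00, 0x2a };
            struct demo_stream xdr = { wire, wire + sizeof(wire) };
            uint32_t v;

            if (demo_decode_uint32(&xdr, &v) == 0)
                    printf("decoded %u\n", v);      /* prints 42 */
            return 0;
    }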
140 159
141static __be32 * 160static int decode_uint64(struct xdr_stream *xdr, u64 *value)
142xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
143{ 161{
144 unsigned int type, major, minor; 162 __be32 *p;
145 umode_t fmode;
146 163
147 type = ntohl(*p++); 164 p = xdr_inline_decode(xdr, 8);
148 if (type > NF3FIFO) 165 if (unlikely(p == NULL))
149 type = NF3NON; 166 goto out_overflow;
150 fmode = nfs_type2fmt[type]; 167 xdr_decode_hyper(p, value);
151 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 168 return 0;
152 fattr->nlink = ntohl(*p++); 169out_overflow:
153 fattr->uid = ntohl(*p++); 170 print_overflow_msg(__func__, xdr);
154 fattr->gid = ntohl(*p++); 171 return -EIO;
155 p = xdr_decode_hyper(p, &fattr->size); 172}
156 p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
157
158 /* Turn remote device info into Linux-specific dev_t */
159 major = ntohl(*p++);
160 minor = ntohl(*p++);
161 fattr->rdev = MKDEV(major, minor);
162 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
163 fattr->rdev = 0;
164 173
165 p = xdr_decode_hyper(p, &fattr->fsid.major); 174/*
166 fattr->fsid.minor = 0; 175 * fileid3
167 p = xdr_decode_hyper(p, &fattr->fileid); 176 *
168 p = xdr_decode_time3(p, &fattr->atime); 177 * typedef uint64 fileid3;
169 p = xdr_decode_time3(p, &fattr->mtime); 178 */
170 p = xdr_decode_time3(p, &fattr->ctime); 179static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
180{
181 return xdr_decode_hyper(p, fileid);
182}
171 183
172 /* Update the mode bits */ 184static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
173 fattr->valid |= NFS_ATTR_FATTR_V3; 185{
174 return p; 186 return decode_uint64(xdr, fileid);
175} 187}
176 188
177static inline __be32 * 189/*
178xdr_encode_sattr(__be32 *p, struct iattr *attr) 190 * filename3
191 *
192 * typedef string filename3<>;
193 */
194static void encode_filename3(struct xdr_stream *xdr,
195 const char *name, u32 length)
179{ 196{
180 if (attr->ia_valid & ATTR_MODE) { 197 __be32 *p;
181 *p++ = xdr_one; 198
182 *p++ = htonl(attr->ia_mode & S_IALLUGO); 199 BUG_ON(length > NFS3_MAXNAMLEN);
183 } else { 200 p = xdr_reserve_space(xdr, 4 + length);
184 *p++ = xdr_zero; 201 xdr_encode_opaque(p, name, length);
185 }
186 if (attr->ia_valid & ATTR_UID) {
187 *p++ = xdr_one;
188 *p++ = htonl(attr->ia_uid);
189 } else {
190 *p++ = xdr_zero;
191 }
192 if (attr->ia_valid & ATTR_GID) {
193 *p++ = xdr_one;
194 *p++ = htonl(attr->ia_gid);
195 } else {
196 *p++ = xdr_zero;
197 }
198 if (attr->ia_valid & ATTR_SIZE) {
199 *p++ = xdr_one;
200 p = xdr_encode_hyper(p, (__u64) attr->ia_size);
201 } else {
202 *p++ = xdr_zero;
203 }
204 if (attr->ia_valid & ATTR_ATIME_SET) {
205 *p++ = xdr_two;
206 p = xdr_encode_time3(p, &attr->ia_atime);
207 } else if (attr->ia_valid & ATTR_ATIME) {
208 *p++ = xdr_one;
209 } else {
210 *p++ = xdr_zero;
211 }
212 if (attr->ia_valid & ATTR_MTIME_SET) {
213 *p++ = xdr_two;
214 p = xdr_encode_time3(p, &attr->ia_mtime);
215 } else if (attr->ia_valid & ATTR_MTIME) {
216 *p++ = xdr_one;
217 } else {
218 *p++ = xdr_zero;
219 }
220 return p;
221} 202}
222 203
223static inline __be32 * 204static int decode_inline_filename3(struct xdr_stream *xdr,
224xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) 205 const char **name, u32 *length)
225{ 206{
226 p = xdr_decode_hyper(p, &fattr->pre_size); 207 __be32 *p;
227 p = xdr_decode_time3(p, &fattr->pre_mtime); 208 u32 count;
228 p = xdr_decode_time3(p, &fattr->pre_ctime); 209
229 fattr->valid |= NFS_ATTR_FATTR_PRESIZE 210 p = xdr_inline_decode(xdr, 4);
230 | NFS_ATTR_FATTR_PREMTIME 211 if (unlikely(p == NULL))
231 | NFS_ATTR_FATTR_PRECTIME; 212 goto out_overflow;
232 return p; 213 count = be32_to_cpup(p);
214 if (count > NFS3_MAXNAMLEN)
215 goto out_nametoolong;
216 p = xdr_inline_decode(xdr, count);
217 if (unlikely(p == NULL))
218 goto out_overflow;
219 *name = (const char *)p;
220 *length = count;
221 return 0;
222
223out_nametoolong:
224 dprintk("NFS: returned filename too long: %u\n", count);
225 return -ENAMETOOLONG;
226out_overflow:
227 print_overflow_msg(__func__, xdr);
228 return -EIO;
233} 229}
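Both helpers rely on the standard XDR opaque/string layout (RFC 4506, section 4.10): a 4-byte big-endian length, the bytes themselves, then zero padding to the next 4-byte boundary. The name "abc", for instance, occupies 8 bytes on the wire:

    /*
     *     00 00 00 03  61 62 63 00
     *     \_ length _/ \_ 'a' 'b' 'c' + one pad byte _/
     */

encode_filename3() can reserve only 4 + length bytes because xdr_reserve_space() rounds every reservation up to a word boundary, which also covers the padding that xdr_encode_opaque() writes.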
234 230
235static inline __be32 * 231/*
236xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) 232 * nfspath3
233 *
234 * typedef string nfspath3<>;
235 */
236static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
237 const u32 length)
237{ 238{
238 if (*p++) 239 BUG_ON(length > NFS3_MAXPATHLEN);
239 p = xdr_decode_fattr(p, fattr); 240 encode_uint32(xdr, length);
240 return p; 241 xdr_write_pages(xdr, pages, 0, length);
241} 242}
242 243
243static inline __be32 * 244static int decode_nfspath3(struct xdr_stream *xdr)
244xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
245{ 245{
246 if (*p++) 246 u32 recvd, count;
247 return xdr_decode_wcc_attr(p, fattr); 247 size_t hdrlen;
248 return p; 248 __be32 *p;
249
250 p = xdr_inline_decode(xdr, 4);
251 if (unlikely(p == NULL))
252 goto out_overflow;
253 count = be32_to_cpup(p);
254 if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
255 goto out_nametoolong;
256 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
257 recvd = xdr->buf->len - hdrlen;
258 if (unlikely(count > recvd))
259 goto out_cheating;
260
261 xdr_read_pages(xdr, count);
262 xdr_terminate_string(xdr->buf, count);
263 return 0;
264
265out_nametoolong:
266 dprintk("NFS: returned pathname too long: %u\n", count);
267 return -ENAMETOOLONG;
268out_cheating:
269 dprintk("NFS: server cheating in pathname result: "
270 "count %u > recvd %u\n", count, recvd);
271 return -EIO;
272out_overflow:
273 print_overflow_msg(__func__, xdr);
274 return -EIO;
249} 275}
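decode_nfspath3() distinguishes three failure modes that are worth keeping apart when debugging, all visible in its labels above:

    /*
     *     out_overflow:    the length word itself ran past the reply
     *     out_nametoolong: count > NFS3_MAXPATHLEN (or beyond page space)
     *     out_cheating:    server claimed more bytes than it actually sent
     *
     * Only the first prints via print_overflow_msg(); the three cases
     * surface to the caller as -EIO, -ENAMETOOLONG, and -EIO.
     */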
250 276
277/*
278 * cookie3
279 *
280 * typedef uint64 cookie3
281 */
282static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
283{
284 return xdr_encode_hyper(p, cookie);
285}
251 286
252static inline __be32 * 287static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
253xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
254{ 288{
255 p = xdr_decode_pre_op_attr(p, fattr); 289 return decode_uint64(xdr, cookie);
256 return xdr_decode_post_op_attr(p, fattr);
257} 290}
258 291
259/* 292/*
260 * NFS encode functions 293 * cookieverf3
294 *
295 * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
261 */ 296 */
297static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
298{
299 memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
300 return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
301}
302
303static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
304{
305 __be32 *p;
306
307 p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310 memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
311 return 0;
312out_overflow:
313 print_overflow_msg(__func__, xdr);
314 return -EIO;
315}
262 316
263/* 317/*
264 * Encode file handle argument 318 * createverf3
319 *
320 * typedef opaque createverf3[NFS3_CREATEVERFSIZE];
265 */ 321 */
266static int 322static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
267nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
268{ 323{
269 p = xdr_encode_fhandle(p, fh); 324 __be32 *p;
270 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 325
326 p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
327 memcpy(p, verifier, NFS3_CREATEVERFSIZE);
328}
329
330static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
331{
332 __be32 *p;
333
334 p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
335 if (unlikely(p == NULL))
336 goto out_overflow;
337 memcpy(verifier, p, NFS3_WRITEVERFSIZE);
271 return 0; 338 return 0;
339out_overflow:
340 print_overflow_msg(__func__, xdr);
341 return -EIO;
272} 342}
273 343
274/* 344/*
275 * Encode SETATTR arguments 345 * size3
346 *
347 * typedef uint64 size3;
276 */ 348 */
277static int 349static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
278nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) 350{
279{ 351 return xdr_decode_hyper(p, size);
280 p = xdr_encode_fhandle(p, args->fh);
281 p = xdr_encode_sattr(p, args->sattr);
282 *p++ = htonl(args->guard);
283 if (args->guard)
284 p = xdr_encode_time3(p, &args->guardtime);
285 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
286 return 0;
287} 352}
288 353
289/* 354/*
290 * Encode directory ops argument 355 * nfsstat3
356 *
357 * enum nfsstat3 {
358 * NFS3_OK = 0,
359 * ...
360 * }
291 */ 361 */
292static int 362#define NFS3_OK NFS_OK
293nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) 363
364static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
294{ 365{
295 p = xdr_encode_fhandle(p, args->fh); 366 __be32 *p;
296 p = xdr_encode_array(p, args->name, args->len); 367
297 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 368 p = xdr_inline_decode(xdr, 4);
369 if (unlikely(p == NULL))
370 goto out_overflow;
371 *status = be32_to_cpup(p);
298 return 0; 372 return 0;
373out_overflow:
374 print_overflow_msg(__func__, xdr);
375 return -EIO;
299} 376}
300 377
301/* 378/*
302 * Encode REMOVE argument 379 * ftype3
380 *
381 * enum ftype3 {
382 * NF3REG = 1,
383 * NF3DIR = 2,
384 * NF3BLK = 3,
385 * NF3CHR = 4,
386 * NF3LNK = 5,
387 * NF3SOCK = 6,
388 * NF3FIFO = 7
389 * };
303 */ 390 */
304static int 391static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
305nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
306{ 392{
307 p = xdr_encode_fhandle(p, args->fh); 393 BUG_ON(type > NF3FIFO);
308 p = xdr_encode_array(p, args->name.name, args->name.len); 394 encode_uint32(xdr, type);
309 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 395}
310 return 0; 396
397static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
398{
399 u32 type;
400
401 type = be32_to_cpup(p++);
402 if (type > NF3FIFO)
403 type = NF3NON;
404 *mode = nfs_type2fmt[type];
405 return p;
311} 406}
312 407
313/* 408/*
314 * Encode access() argument 409 * specdata3
410 *
411 * struct specdata3 {
412 * uint32 specdata1;
413 * uint32 specdata2;
414 * };
315 */ 415 */
316static int 416static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
317nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
318{ 417{
319 p = xdr_encode_fhandle(p, args->fh); 418 __be32 *p;
320 *p++ = htonl(args->access); 419
321 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 420 p = xdr_reserve_space(xdr, 8);
322 return 0; 421 *p++ = cpu_to_be32(MAJOR(rdev));
422 *p = cpu_to_be32(MINOR(rdev));
423}
424
425static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
426{
427 unsigned int major, minor;
428
429 major = be32_to_cpup(p++);
430 minor = be32_to_cpup(p++);
431 *rdev = MKDEV(major, minor);
432 if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
433 *rdev = 0;
434 return p;
323} 435}
324 436
325/* 437/*
326 * Arguments to a READ call. Since we read data directly into the page 438 * nfs_fh3
327 * cache, we also set up the reply iovec here so that iov[1] points 439 *
328 * exactly to the page we want to fetch. 440 * struct nfs_fh3 {
441 * opaque data<NFS3_FHSIZE>;
442 * };
329 */ 443 */
330static int 444static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
332{ 445{
333 struct rpc_auth *auth = req->rq_cred->cr_auth; 446 __be32 *p;
334 unsigned int replen;
335 u32 count = args->count;
336 447
337 p = xdr_encode_fhandle(p, args->fh); 448 BUG_ON(fh->size > NFS3_FHSIZE);
338 p = xdr_encode_hyper(p, args->offset); 449 p = xdr_reserve_space(xdr, 4 + fh->size);
339 *p++ = htonl(count); 450 xdr_encode_opaque(p, fh->data, fh->size);
340 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 451}
341 452
342 /* Inline the page array */ 453static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
343 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; 454{
344 xdr_inline_pages(&req->rq_rcv_buf, replen, 455 u32 length;
345 args->pages, args->pgbase, count); 456 __be32 *p;
346 req->rq_rcv_buf.flags |= XDRBUF_READ; 457
458 p = xdr_inline_decode(xdr, 4);
459 if (unlikely(p == NULL))
460 goto out_overflow;
461 length = be32_to_cpup(p++);
462 if (unlikely(length > NFS3_FHSIZE))
463 goto out_toobig;
464 p = xdr_inline_decode(xdr, length);
465 if (unlikely(p == NULL))
466 goto out_overflow;
467 fh->size = length;
468 memcpy(fh->data, p, length);
347 return 0; 469 return 0;
470out_toobig:
471 dprintk("NFS: file handle size (%u) too big\n", length);
472 return -E2BIG;
473out_overflow:
474 print_overflow_msg(__func__, xdr);
475 return -EIO;
476}
477
478static void zero_nfs_fh3(struct nfs_fh *fh)
479{
480 memset(fh, 0, sizeof(*fh));
348} 481}
349 482
350/* 483/*
351 * Write arguments. Splice the buffer to be written into the iovec. 484 * nfstime3
485 *
486 * struct nfstime3 {
487 * uint32 seconds;
488 * uint32 nseconds;
489 * };
352 */ 490 */
353static int 491static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
354nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
355{ 492{
356 struct xdr_buf *sndbuf = &req->rq_snd_buf; 493 *p++ = cpu_to_be32(timep->tv_sec);
357 u32 count = args->count; 494 *p++ = cpu_to_be32(timep->tv_nsec);
495 return p;
496}
358 497
359 p = xdr_encode_fhandle(p, args->fh); 498static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
360 p = xdr_encode_hyper(p, args->offset); 499{
361 *p++ = htonl(count); 500 timep->tv_sec = be32_to_cpup(p++);
362 *p++ = htonl(args->stable); 501 timep->tv_nsec = be32_to_cpup(p++);
363 *p++ = htonl(count); 502 return p;
364 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
365
366 /* Copy the page array */
367 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
368 sndbuf->flags |= XDRBUF_WRITE;
369 return 0;
370} 503}
371 504
372/* 505/*
373 * Encode CREATE arguments 506 * sattr3
507 *
508 * enum time_how {
509 * DONT_CHANGE = 0,
510 * SET_TO_SERVER_TIME = 1,
511 * SET_TO_CLIENT_TIME = 2
512 * };
513 *
514 * union set_mode3 switch (bool set_it) {
515 * case TRUE:
516 * mode3 mode;
517 * default:
518 * void;
519 * };
520 *
521 * union set_uid3 switch (bool set_it) {
522 * case TRUE:
523 * uid3 uid;
524 * default:
525 * void;
526 * };
527 *
528 * union set_gid3 switch (bool set_it) {
529 * case TRUE:
530 * gid3 gid;
531 * default:
532 * void;
533 * };
534 *
535 * union set_size3 switch (bool set_it) {
536 * case TRUE:
537 * size3 size;
538 * default:
539 * void;
540 * };
541 *
542 * union set_atime switch (time_how set_it) {
543 * case SET_TO_CLIENT_TIME:
544 * nfstime3 atime;
545 * default:
546 * void;
547 * };
548 *
549 * union set_mtime switch (time_how set_it) {
550 * case SET_TO_CLIENT_TIME:
551 * nfstime3 mtime;
552 * default:
553 * void;
554 * };
555 *
556 * struct sattr3 {
557 * set_mode3 mode;
558 * set_uid3 uid;
559 * set_gid3 gid;
560 * set_size3 size;
561 * set_atime atime;
562 * set_mtime mtime;
563 * };
374 */ 564 */
375static int 565static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
376nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
377{ 566{
378 p = xdr_encode_fhandle(p, args->fh); 567 u32 nbytes;
379 p = xdr_encode_array(p, args->name, args->len); 568 __be32 *p;
380 569
381 *p++ = htonl(args->createmode); 570 /*
382 if (args->createmode == NFS3_CREATE_EXCLUSIVE) { 571 * In order to make only a single xdr_reserve_space() call,
383 *p++ = args->verifier[0]; 572 * pre-compute the total number of bytes to be reserved.
384 *p++ = args->verifier[1]; 573 * Six boolean values, one for each set_foo field, are always
574 * present in the encoded result, so start there.
575 */
576 nbytes = 6 * 4;
577 if (attr->ia_valid & ATTR_MODE)
578 nbytes += 4;
579 if (attr->ia_valid & ATTR_UID)
580 nbytes += 4;
581 if (attr->ia_valid & ATTR_GID)
582 nbytes += 4;
583 if (attr->ia_valid & ATTR_SIZE)
584 nbytes += 8;
585 if (attr->ia_valid & ATTR_ATIME_SET)
586 nbytes += 8;
587 if (attr->ia_valid & ATTR_MTIME_SET)
588 nbytes += 8;
589 p = xdr_reserve_space(xdr, nbytes);
590
591 if (attr->ia_valid & ATTR_MODE) {
592 *p++ = xdr_one;
593 *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
385 } else 594 } else
386 p = xdr_encode_sattr(p, args->sattr); 595 *p++ = xdr_zero;
387 596
388 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 597 if (attr->ia_valid & ATTR_UID) {
389 return 0; 598 *p++ = xdr_one;
599 *p++ = cpu_to_be32(attr->ia_uid);
600 } else
601 *p++ = xdr_zero;
602
603 if (attr->ia_valid & ATTR_GID) {
604 *p++ = xdr_one;
605 *p++ = cpu_to_be32(attr->ia_gid);
606 } else
607 *p++ = xdr_zero;
608
609 if (attr->ia_valid & ATTR_SIZE) {
610 *p++ = xdr_one;
611 p = xdr_encode_hyper(p, (u64)attr->ia_size);
612 } else
613 *p++ = xdr_zero;
614
615 if (attr->ia_valid & ATTR_ATIME_SET) {
616 *p++ = xdr_two;
617 p = xdr_encode_nfstime3(p, &attr->ia_atime);
618 } else if (attr->ia_valid & ATTR_ATIME) {
619 *p++ = xdr_one;
620 } else
621 *p++ = xdr_zero;
622
623 if (attr->ia_valid & ATTR_MTIME_SET) {
624 *p++ = xdr_two;
625 xdr_encode_nfstime3(p, &attr->ia_mtime);
626 } else if (attr->ia_valid & ATTR_MTIME) {
627 *p = xdr_one;
628 } else
629 *p = xdr_zero;
390} 630}
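A worked example of the reservation arithmetic above: a plain chmod sets only ATTR_MODE, so nbytes = 6 * 4 (the six discriminant words) + 4 (the mode word) = 28 bytes, i.e. 7 XDR words, and the encoded sattr3 comes out as

    /*
     *     1 mode 0 0 0 0 0
     *     ^ TRUE + mode3, then five FALSE arms
     *       (uid, gid, size, atime, mtime)
     */

A truncate would instead add 8 bytes for the size3 hyper, and setting explicit timestamps adds 8 bytes per nfstime3.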
391 631
392/* 632/*
393 * Encode MKDIR arguments 633 * fattr3
634 *
635 * struct fattr3 {
636 * ftype3 type;
637 * mode3 mode;
638 * uint32 nlink;
639 * uid3 uid;
640 * gid3 gid;
641 * size3 size;
642 * size3 used;
643 * specdata3 rdev;
644 * uint64 fsid;
645 * fileid3 fileid;
646 * nfstime3 atime;
647 * nfstime3 mtime;
648 * nfstime3 ctime;
649 * };
394 */ 650 */
395static int 651static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
396nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
397{ 652{
398 p = xdr_encode_fhandle(p, args->fh); 653 umode_t fmode;
399 p = xdr_encode_array(p, args->name, args->len); 654 __be32 *p;
400 p = xdr_encode_sattr(p, args->sattr); 655
401 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 656 p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
657 if (unlikely(p == NULL))
658 goto out_overflow;
659
660 p = xdr_decode_ftype3(p, &fmode);
661
662 fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
663 fattr->nlink = be32_to_cpup(p++);
664 fattr->uid = be32_to_cpup(p++);
665 fattr->gid = be32_to_cpup(p++);
666
667 p = xdr_decode_size3(p, &fattr->size);
668 p = xdr_decode_size3(p, &fattr->du.nfs3.used);
669 p = xdr_decode_specdata3(p, &fattr->rdev);
670
671 p = xdr_decode_hyper(p, &fattr->fsid.major);
672 fattr->fsid.minor = 0;
673
674 p = xdr_decode_fileid3(p, &fattr->fileid);
675 p = xdr_decode_nfstime3(p, &fattr->atime);
676 p = xdr_decode_nfstime3(p, &fattr->mtime);
677 xdr_decode_nfstime3(p, &fattr->ctime);
678
679 fattr->valid |= NFS_ATTR_FATTR_V3;
402 return 0; 680 return 0;
681out_overflow:
682 print_overflow_msg(__func__, xdr);
683 return -EIO;
403} 684}
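fattr3 is a fixed-size structure, which is why a single 84-byte xdr_inline_decode() (NFS3_fattr_sz = 21 words, matching the macro at the top of the file) can cover the whole thing:

    /*
     *     type + mode + nlink + uid + gid        5 words
     *     size + used (two uint64s)              4 words
     *     rdev (specdata3)                       2 words
     *     fsid (uint64)                          2 words
     *     fileid                                 2 words
     *     atime + mtime + ctime (3 x nfstime3)   6 words
     *                                           21 words = 84 bytes
     */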
404 685
405/* 686/*
406 * Encode SYMLINK arguments 687 * post_op_attr
688 *
689 * union post_op_attr switch (bool attributes_follow) {
690 * case TRUE:
691 * fattr3 attributes;
692 * case FALSE:
693 * void;
694 * };
407 */ 695 */
408static int 696static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
409nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
410{ 697{
411 p = xdr_encode_fhandle(p, args->fromfh); 698 __be32 *p;
412 p = xdr_encode_array(p, args->fromname, args->fromlen);
413 p = xdr_encode_sattr(p, args->sattr);
414 *p++ = htonl(args->pathlen);
415 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
416 699
417 /* Copy the page */ 700 p = xdr_inline_decode(xdr, 4);
418 xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); 701 if (unlikely(p == NULL))
702 goto out_overflow;
703 if (*p != xdr_zero)
704 return decode_fattr3(xdr, fattr);
419 return 0; 705 return 0;
706out_overflow:
707 print_overflow_msg(__func__, xdr);
708 return -EIO;
420} 709}
421 710
422/* 711/*
423 * Encode MKNOD arguments 712 * wcc_attr
713 * struct wcc_attr {
714 * size3 size;
715 * nfstime3 mtime;
716 * nfstime3 ctime;
717 * };
424 */ 718 */
425static int 719static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
426nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) 720{
427{ 721 __be32 *p;
428 p = xdr_encode_fhandle(p, args->fh); 722
429 p = xdr_encode_array(p, args->name, args->len); 723 p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
430 *p++ = htonl(args->type); 724 if (unlikely(p == NULL))
431 p = xdr_encode_sattr(p, args->sattr); 725 goto out_overflow;
432 if (args->type == NF3CHR || args->type == NF3BLK) { 726
433 *p++ = htonl(MAJOR(args->rdev)); 727 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
434 *p++ = htonl(MINOR(args->rdev)); 728 | NFS_ATTR_FATTR_PREMTIME
435 } 729 | NFS_ATTR_FATTR_PRECTIME;
730
731 p = xdr_decode_size3(p, &fattr->pre_size);
732 p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
733 xdr_decode_nfstime3(p, &fattr->pre_ctime);
436 734
437 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
438 return 0; 735 return 0;
736out_overflow:
737 print_overflow_msg(__func__, xdr);
738 return -EIO;
439} 739}
440 740
441/* 741/*
442 * Encode RENAME arguments 742 * pre_op_attr
743 * union pre_op_attr switch (bool attributes_follow) {
744 * case TRUE:
745 * wcc_attr attributes;
746 * case FALSE:
747 * void;
748 * };
749 *
750 * wcc_data
751 *
752 * struct wcc_data {
753 * pre_op_attr before;
754 * post_op_attr after;
755 * };
443 */ 756 */
444static int 757static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
445nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) 758{
446{ 759 __be32 *p;
447 p = xdr_encode_fhandle(p, args->fromfh); 760
448 p = xdr_encode_array(p, args->fromname, args->fromlen); 761 p = xdr_inline_decode(xdr, 4);
449 p = xdr_encode_fhandle(p, args->tofh); 762 if (unlikely(p == NULL))
450 p = xdr_encode_array(p, args->toname, args->tolen); 763 goto out_overflow;
451 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 764 if (*p != xdr_zero)
765 return decode_wcc_attr(xdr, fattr);
452 return 0; 766 return 0;
767out_overflow:
768 print_overflow_msg(__func__, xdr);
769 return -EIO;
453} 770}
454 771
455/* 772static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
456 * Encode LINK arguments
457 */
458static int
459nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
460{ 773{
461 p = xdr_encode_fhandle(p, args->fromfh); 774 int error;
462 p = xdr_encode_fhandle(p, args->tofh); 775
463 p = xdr_encode_array(p, args->toname, args->tolen); 776 error = decode_pre_op_attr(xdr, fattr);
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 777 if (unlikely(error))
465 return 0; 778 goto out;
779 error = decode_post_op_attr(xdr, fattr);
780out:
781 return error;
466} 782}
467 783
468/* 784/*
469 * Encode arguments to readdir call 785 * post_op_fh3
786 *
787 * union post_op_fh3 switch (bool handle_follows) {
788 * case TRUE:
789 * nfs_fh3 handle;
790 * case FALSE:
791 * void;
792 * };
470 */ 793 */
471static int 794static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
473{ 795{
474 struct rpc_auth *auth = req->rq_cred->cr_auth; 796 __be32 *p = xdr_inline_decode(xdr, 4);
475 unsigned int replen; 797 if (unlikely(p == NULL))
476 u32 count = args->count; 798 goto out_overflow;
477 799 if (*p != xdr_zero)
478 p = xdr_encode_fhandle(p, args->fh); 800 return decode_nfs_fh3(xdr, fh);
479 p = xdr_encode_hyper(p, args->cookie); 801 zero_nfs_fh3(fh);
480 *p++ = args->verf[0];
481 *p++ = args->verf[1];
482 if (args->plus) {
483 /* readdirplus: need dircount + buffer size.
484 * We just make sure we make dircount big enough */
485 *p++ = htonl(count >> 3);
486 }
487 *p++ = htonl(count);
488 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
489
490 /* Inline the page array */
491 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
492 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
493 return 0; 802 return 0;
803out_overflow:
804 print_overflow_msg(__func__, xdr);
805 return -EIO;
494} 806}
495 807
496/* 808/*
497 * Decode the result of a readdir call. 809 * diropargs3
498 * We just check for syntactical correctness. 810 *
811 * struct diropargs3 {
812 * nfs_fh3 dir;
813 * filename3 name;
814 * };
499 */ 815 */
500static int 816static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
501nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) 817 const char *name, u32 length)
502{ 818{
503 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 819 encode_nfs_fh3(xdr, fh);
504 struct kvec *iov = rcvbuf->head; 820 encode_filename3(xdr, name, length);
505 struct page **page; 821}
506 size_t hdrlen;
507 u32 len, recvd, pglen;
508 int status, nr = 0;
509 __be32 *entry, *end, *kaddr;
510
511 status = ntohl(*p++);
512 /* Decode post_op_attrs */
513 p = xdr_decode_post_op_attr(p, res->dir_attr);
514 if (status)
515 return nfs_stat_to_errno(status);
516 /* Decode verifier cookie */
517 if (res->verf) {
518 res->verf[0] = *p++;
519 res->verf[1] = *p++;
520 } else {
521 p += 2;
522 }
523
524 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
525 if (iov->iov_len < hdrlen) {
526 dprintk("NFS: READDIR reply header overflowed:"
527 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
528 return -errno_NFSERR_IO;
529 } else if (iov->iov_len != hdrlen) {
530 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
531 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
532 }
533 822
534 pglen = rcvbuf->page_len;
535 recvd = rcvbuf->len - hdrlen;
536 if (pglen > recvd)
537 pglen = recvd;
538 page = rcvbuf->pages;
539 kaddr = p = kmap_atomic(*page, KM_USER0);
540 end = (__be32 *)((char *)p + pglen);
541 entry = p;
542
543 /* Make sure the packet actually has a value_follows and EOF entry */
544 if ((entry + 1) > end)
545 goto short_pkt;
546
547 for (; *p++; nr++) {
548 if (p + 3 > end)
549 goto short_pkt;
550 p += 2; /* inode # */
551 len = ntohl(*p++); /* string length */
552 p += XDR_QUADLEN(len) + 2; /* name + cookie */
553 if (len > NFS3_MAXNAMLEN) {
554 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
555 len);
556 goto err_unmap;
557 }
558
559 if (res->plus) {
560 /* post_op_attr */
561 if (p + 2 > end)
562 goto short_pkt;
563 if (*p++) {
564 p += 21;
565 if (p + 1 > end)
566 goto short_pkt;
567 }
568 /* post_op_fh3 */
569 if (*p++) {
570 if (p + 1 > end)
571 goto short_pkt;
572 len = ntohl(*p++);
573 if (len > NFS3_FHSIZE) {
574 dprintk("NFS: giant filehandle in "
575 "readdir (len 0x%x)!\n", len);
576 goto err_unmap;
577 }
578 p += XDR_QUADLEN(len);
579 }
580 }
581 823
582 if (p + 2 > end) 824/*
583 goto short_pkt; 825 * NFSv3 XDR encode functions
584 entry = p; 826 *
585 } 827 * NFSv3 argument types are defined in section 3.3 of RFC 1813:
828 * "NFS Version 3 Protocol Specification".
829 */
586 830
587 /* 831/*
588 * Apparently some server sends responses that are a valid size, but 832 * 3.3.1 GETATTR3args
589 * contain no entries, and have value_follows==0 and EOF==0. For 833 *
590 * those, just set the EOF marker. 834 * struct GETATTR3args {
591 */ 835 * nfs_fh3 object;
592 if (!nr && entry[1] == 0) { 836 * };
593 dprintk("NFS: readdir reply truncated!\n"); 837 */
594 entry[1] = 1; 838static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
595 } 839 struct xdr_stream *xdr,
596 out: 840 const struct nfs_fh *fh)
597 kunmap_atomic(kaddr, KM_USER0); 841{
598 return nr; 842 encode_nfs_fh3(xdr, fh);
599 short_pkt:
600 /*
601 * When we get a short packet there are 2 possibilities. We can
602 * return an error, or fix up the response to look like a valid
603 * response and return what we have so far. If there are no
604 * entries and the packet was short, then return -EIO. If there
605 * are valid entries in the response, return them and pretend that
606 * the call was successful, but incomplete. The caller can retry the
607 * readdir starting at the last cookie.
608 */
609 entry[0] = entry[1] = 0;
610 if (!nr)
611 nr = -errno_NFSERR_IO;
612 goto out;
613err_unmap:
614 nr = -errno_NFSERR_IO;
615 goto out;
616} 843}
617 844
618__be32 * 845/*
619nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 846 * 3.3.2 SETATTR3args
847 *
848 * union sattrguard3 switch (bool check) {
849 * case TRUE:
850 * nfstime3 obj_ctime;
851 * case FALSE:
852 * void;
853 * };
854 *
855 * struct SETATTR3args {
856 * nfs_fh3 object;
857 * sattr3 new_attributes;
858 * sattrguard3 guard;
859 * };
860 */
861static void encode_sattrguard3(struct xdr_stream *xdr,
862 const struct nfs3_sattrargs *args)
620{ 863{
621 struct nfs_entry old = *entry; 864 __be32 *p;
622
623 if (!*p++) {
624 if (!*p)
625 return ERR_PTR(-EAGAIN);
626 entry->eof = 1;
627 return ERR_PTR(-EBADCOOKIE);
628 }
629
630 p = xdr_decode_hyper(p, &entry->ino);
631 entry->len = ntohl(*p++);
632 entry->name = (const char *) p;
633 p += XDR_QUADLEN(entry->len);
634 entry->prev_cookie = entry->cookie;
635 p = xdr_decode_hyper(p, &entry->cookie);
636 865
637 if (plus) { 866 if (args->guard) {
638 entry->fattr->valid = 0; 867 p = xdr_reserve_space(xdr, 4 + 8);
639 p = xdr_decode_post_op_attr(p, entry->fattr); 868 *p++ = xdr_one;
640 /* In fact, a post_op_fh3: */ 869 xdr_encode_nfstime3(p, &args->guardtime);
641 if (*p++) { 870 } else {
642 p = xdr_decode_fhandle(p, entry->fh); 871 p = xdr_reserve_space(xdr, 4);
643 /* Ugh -- server reply was truncated */ 872 *p = xdr_zero;
644 if (p == NULL) {
645 dprintk("NFS: FH truncated\n");
646 *entry = old;
647 return ERR_PTR(-EAGAIN);
648 }
649 } else
650 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
651 } 873 }
874}
652 875
653 entry->eof = !p[0] && p[1]; 876static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
654 return p; 877 struct xdr_stream *xdr,
878 const struct nfs3_sattrargs *args)
879{
880 encode_nfs_fh3(xdr, args->fh);
881 encode_sattr3(xdr, args->sattr);
882 encode_sattrguard3(xdr, args);
655} 883}
656 884
657/* 885/*
658 * Encode COMMIT arguments 886 * 3.3.3 LOOKUP3args
887 *
888 * struct LOOKUP3args {
889 * diropargs3 what;
890 * };
659 */ 891 */
660static int 892static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
661nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 893 struct xdr_stream *xdr,
894 const struct nfs3_diropargs *args)
662{ 895{
663 p = xdr_encode_fhandle(p, args->fh); 896 encode_diropargs3(xdr, args->fh, args->name, args->len);
664 p = xdr_encode_hyper(p, args->offset);
665 *p++ = htonl(args->count);
666 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
667 return 0;
668} 897}
669 898
670#ifdef CONFIG_NFS_V3_ACL
671/* 899/*
672 * Encode GETACL arguments 900 * 3.3.4 ACCESS3args
901 *
902 * struct ACCESS3args {
903 * nfs_fh3 object;
904 * uint32 access;
905 * };
673 */ 906 */
674static int 907static void encode_access3args(struct xdr_stream *xdr,
675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 908 const struct nfs3_accessargs *args)
676 struct nfs3_getaclargs *args)
677{ 909{
678 struct rpc_auth *auth = req->rq_cred->cr_auth; 910 encode_nfs_fh3(xdr, args->fh);
679 unsigned int replen; 911 encode_uint32(xdr, args->access);
912}
680 913
681 p = xdr_encode_fhandle(p, args->fh); 914static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
682 *p++ = htonl(args->mask); 915 struct xdr_stream *xdr,
683 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 916 const struct nfs3_accessargs *args)
917{
918 encode_access3args(xdr, args);
919}
684 920
685 if (args->mask & (NFS_ACL | NFS_DFACL)) { 921/*
686 /* Inline the page array */ 922 * 3.3.5 READLINK3args
687 replen = (RPC_REPHDRSIZE + auth->au_rslack + 923 *
688 ACL3_getaclres_sz) << 2; 924 * struct READLINK3args {
689 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, 925 * nfs_fh3 symlink;
690 NFSACL_MAXPAGES << PAGE_SHIFT); 926 * };
691 } 927 */
692 return 0; 928static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
929 struct xdr_stream *xdr,
930 const struct nfs3_readlinkargs *args)
931{
932 encode_nfs_fh3(xdr, args->fh);
933 prepare_reply_buffer(req, args->pages, args->pgbase,
934 args->pglen, NFS3_readlinkres_sz);
693} 935}
694 936
695/* 937/*
696 * Encode SETACL arguments 938 * 3.3.6 READ3args
939 *
940 * struct READ3args {
941 * nfs_fh3 file;
942 * offset3 offset;
943 * count3 count;
944 * };
697 */ 945 */
698static int 946static void encode_read3args(struct xdr_stream *xdr,
699nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, 947 const struct nfs_readargs *args)
700 struct nfs3_setaclargs *args)
701{ 948{
702 struct xdr_buf *buf = &req->rq_snd_buf; 949 __be32 *p;
703 unsigned int base;
704 int err;
705 950
706 p = xdr_encode_fhandle(p, NFS_FH(args->inode)); 951 encode_nfs_fh3(xdr, args->fh);
707 *p++ = htonl(args->mask);
708 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
709 base = req->rq_slen;
710 952
711 if (args->npages != 0) 953 p = xdr_reserve_space(xdr, 8 + 4);
712 xdr_encode_pages(buf, args->pages, 0, args->len); 954 p = xdr_encode_hyper(p, args->offset);
713 else 955 *p = cpu_to_be32(args->count);
714 req->rq_slen = xdr_adjust_iovec(req->rq_svec, 956}
715 p + XDR_QUADLEN(args->len));
716 957
717 err = nfsacl_encode(buf, base, args->inode, 958static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
718 (args->mask & NFS_ACL) ? 959 struct xdr_stream *xdr,
719 args->acl_access : NULL, 1, 0); 960 const struct nfs_readargs *args)
720 if (err > 0) 961{
721 err = nfsacl_encode(buf, base + err, args->inode, 962 encode_read3args(xdr, args);
722 (args->mask & NFS_DFACL) ? 963 prepare_reply_buffer(req, args->pages, args->pgbase,
723 args->acl_default : NULL, 1, 964 args->count, NFS3_readres_sz);
724 NFS_ACL_DEFAULT); 965 req->rq_rcv_buf.flags |= XDRBUF_READ;
725 return (err > 0) ? 0 : err;
726} 966}
727#endif /* CONFIG_NFS_V3_ACL */
728 967
729/* 968/*
730 * NFS XDR decode functions 969 * 3.3.7 WRITE3args
970 *
971 * enum stable_how {
972 * UNSTABLE = 0,
973 * DATA_SYNC = 1,
974 * FILE_SYNC = 2
975 * };
976 *
977 * struct WRITE3args {
978 * nfs_fh3 file;
979 * offset3 offset;
980 * count3 count;
981 * stable_how stable;
982 * opaque data<>;
983 * };
731 */ 984 */
985static void encode_write3args(struct xdr_stream *xdr,
986 const struct nfs_writeargs *args)
987{
988 __be32 *p;
989
990 encode_nfs_fh3(xdr, args->fh);
991
992 p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
993 p = xdr_encode_hyper(p, args->offset);
994 *p++ = cpu_to_be32(args->count);
995 *p++ = cpu_to_be32(args->stable);
996 *p = cpu_to_be32(args->count);
997 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
998}
999
1000static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
1001 struct xdr_stream *xdr,
1002 const struct nfs_writeargs *args)
1003{
1004 encode_write3args(xdr, args);
1005 xdr->buf->flags |= XDRBUF_WRITE;
1006}
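As RFC 1813 requires, a WRITE3 request carries the byte count twice: once as WRITE3args.count and once as the length word of the opaque data<> array. The payload itself is spliced from args->pages by xdr_write_pages() without copying:

    /*
     *     offset   8 bytes
     *     count    4 bytes   <- WRITE3args.count
     *     stable   4 bytes
     *     length   4 bytes   <- opaque data<> length, same value
     *     data     count bytes, straight from the page cache
     */

The XDRBUF_WRITE flag then tells the transport that the send buffer carries page data, mirroring XDRBUF_READ on the receive side in nfs3_xdr_enc_read3args().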
732 1007
733/* 1008/*
734 * Decode attrstat reply. 1009 * 3.3.8 CREATE3args
1010 *
1011 * enum createmode3 {
1012 * UNCHECKED = 0,
1013 * GUARDED = 1,
1014 * EXCLUSIVE = 2
1015 * };
1016 *
1017 * union createhow3 switch (createmode3 mode) {
1018 * case UNCHECKED:
1019 * case GUARDED:
1020 * sattr3 obj_attributes;
1021 * case EXCLUSIVE:
1022 * createverf3 verf;
1023 * };
1024 *
1025 * struct CREATE3args {
1026 * diropargs3 where;
1027 * createhow3 how;
1028 * };
735 */ 1029 */
736static int 1030static void encode_createhow3(struct xdr_stream *xdr,
737nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1031 const struct nfs3_createargs *args)
738{ 1032{
739 int status; 1033 encode_uint32(xdr, args->createmode);
1034 switch (args->createmode) {
1035 case NFS3_CREATE_UNCHECKED:
1036 case NFS3_CREATE_GUARDED:
1037 encode_sattr3(xdr, args->sattr);
1038 break;
1039 case NFS3_CREATE_EXCLUSIVE:
1040 encode_createverf3(xdr, args->verifier);
1041 break;
1042 default:
1043 BUG();
1044 }
1045}
740 1046
741 if ((status = ntohl(*p++))) 1047static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
742 return nfs_stat_to_errno(status); 1048 struct xdr_stream *xdr,
743 xdr_decode_fattr(p, fattr); 1049 const struct nfs3_createargs *args)
744 return 0; 1050{
1051 encode_diropargs3(xdr, args->fh, args->name, args->len);
1052 encode_createhow3(xdr, args);
745} 1053}
746 1054
747/* 1055/*
748 * Decode status+wcc_data reply 1056 * 3.3.9 MKDIR3args
749 * SATTR, REMOVE, RMDIR 1057 *
1058 * struct MKDIR3args {
1059 * diropargs3 where;
1060 * sattr3 attributes;
1061 * };
750 */ 1062 */
751static int 1063static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
752nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1064 struct xdr_stream *xdr,
1065 const struct nfs3_mkdirargs *args)
753{ 1066{
754 int status; 1067 encode_diropargs3(xdr, args->fh, args->name, args->len);
1068 encode_sattr3(xdr, args->sattr);
1069}
755 1070
756 if ((status = ntohl(*p++))) 1071/*
757 status = nfs_stat_to_errno(status); 1072 * 3.3.10 SYMLINK3args
758 xdr_decode_wcc_data(p, fattr); 1073 *
759 return status; 1074 * struct symlinkdata3 {
1075 * sattr3 symlink_attributes;
1076 * nfspath3 symlink_data;
1077 * };
1078 *
1079 * struct SYMLINK3args {
1080 * diropargs3 where;
1081 * symlinkdata3 symlink;
1082 * };
1083 */
1084static void encode_symlinkdata3(struct xdr_stream *xdr,
1085 const struct nfs3_symlinkargs *args)
1086{
1087 encode_sattr3(xdr, args->sattr);
1088 encode_nfspath3(xdr, args->pages, args->pathlen);
760} 1089}
761 1090
762static int 1091static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 1092 struct xdr_stream *xdr,
1093 const struct nfs3_symlinkargs *args)
764{ 1094{
765 return nfs3_xdr_wccstat(req, p, res->dir_attr); 1095 encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
1096 encode_symlinkdata3(xdr, args);
766} 1097}
767 1098
768/* 1099/*
769 * Decode LOOKUP reply 1100 * 3.3.11 MKNOD3args
1101 *
1102 * struct devicedata3 {
1103 * sattr3 dev_attributes;
1104 * specdata3 spec;
1105 * };
1106 *
1107 * union mknoddata3 switch (ftype3 type) {
1108 * case NF3CHR:
1109 * case NF3BLK:
1110 * devicedata3 device;
1111 * case NF3SOCK:
1112 * case NF3FIFO:
1113 * sattr3 pipe_attributes;
1114 * default:
1115 * void;
1116 * };
1117 *
1118 * struct MKNOD3args {
1119 * diropargs3 where;
1120 * mknoddata3 what;
1121 * };
770 */ 1122 */
771static int 1123static void encode_devicedata3(struct xdr_stream *xdr,
772nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1124 const struct nfs3_mknodargs *args)
773{ 1125{
774 int status; 1126 encode_sattr3(xdr, args->sattr);
1127 encode_specdata3(xdr, args->rdev);
1128}
775 1129
776 if ((status = ntohl(*p++))) { 1130static void encode_mknoddata3(struct xdr_stream *xdr,
777 status = nfs_stat_to_errno(status); 1131 const struct nfs3_mknodargs *args)
778 } else { 1132{
779 if (!(p = xdr_decode_fhandle(p, res->fh))) 1133 encode_ftype3(xdr, args->type);
780 return -errno_NFSERR_IO; 1134 switch (args->type) {
781 p = xdr_decode_post_op_attr(p, res->fattr); 1135 case NF3CHR:
1136 case NF3BLK:
1137 encode_devicedata3(xdr, args);
1138 break;
1139 case NF3SOCK:
1140 case NF3FIFO:
1141 encode_sattr3(xdr, args->sattr);
1142 break;
1143 case NF3REG:
1144 case NF3DIR:
1145 break;
1146 default:
1147 BUG();
782 } 1148 }
783 xdr_decode_post_op_attr(p, res->dir_attr); 1149}
784 return status; 1150
1151static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
1152 struct xdr_stream *xdr,
1153 const struct nfs3_mknodargs *args)
1154{
1155 encode_diropargs3(xdr, args->fh, args->name, args->len);
1156 encode_mknoddata3(xdr, args);
785} 1157}
786 1158
787/* 1159/*
788 * Decode ACCESS reply 1160 * 3.3.12 REMOVE3args
1161 *
1162 * struct REMOVE3args {
1163 * diropargs3 object;
1164 * };
789 */ 1165 */
790static int 1166static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
791nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) 1167 struct xdr_stream *xdr,
1168 const struct nfs_removeargs *args)
792{ 1169{
793 int status = ntohl(*p++); 1170 encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
1171}
794 1172
795 p = xdr_decode_post_op_attr(p, res->fattr); 1173/*
796 if (status) 1174 * 3.3.14 RENAME3args
797 return nfs_stat_to_errno(status); 1175 *
798 res->access = ntohl(*p++); 1176 * struct RENAME3args {
799 return 0; 1177 * diropargs3 from;
1178 * diropargs3 to;
1179 * };
1180 */
1181static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
1182 struct xdr_stream *xdr,
1183 const struct nfs_renameargs *args)
1184{
1185 const struct qstr *old = args->old_name;
1186 const struct qstr *new = args->new_name;
1187
1188 encode_diropargs3(xdr, args->old_dir, old->name, old->len);
1189 encode_diropargs3(xdr, args->new_dir, new->name, new->len);
800} 1190}
801 1191
802static int 1192/*
803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 1193 * 3.3.15 LINK3args
1194 *
1195 * struct LINK3args {
1196 * nfs_fh3 file;
1197 * diropargs3 link;
1198 * };
1199 */
1200static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
1201 struct xdr_stream *xdr,
1202 const struct nfs3_linkargs *args)
804{ 1203{
805 struct rpc_auth *auth = req->rq_cred->cr_auth; 1204 encode_nfs_fh3(xdr, args->fromfh);
806 unsigned int replen; 1205 encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
1206}
807 1207
808 p = xdr_encode_fhandle(p, args->fh); 1208/*
809 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1209 * 3.3.16 READDIR3args
1210 *
1211 * struct READDIR3args {
1212 * nfs_fh3 dir;
1213 * cookie3 cookie;
1214 * cookieverf3 cookieverf;
1215 * count3 count;
1216 * };
1217 */
1218static void encode_readdir3args(struct xdr_stream *xdr,
1219 const struct nfs3_readdirargs *args)
1220{
1221 __be32 *p;
810 1222
811 /* Inline the page array */ 1223 encode_nfs_fh3(xdr, args->fh);
812 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; 1224
813 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1225 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
814 return 0; 1226 p = xdr_encode_cookie3(p, args->cookie);
1227 p = xdr_encode_cookieverf3(p, args->verf);
1228 *p = cpu_to_be32(args->count);
1229}
1230
1231static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
1232 struct xdr_stream *xdr,
1233 const struct nfs3_readdirargs *args)
1234{
1235 encode_readdir3args(xdr, args);
1236 prepare_reply_buffer(req, args->pages, 0,
1237 args->count, NFS3_readdirres_sz);
815} 1238}
816 1239
817/* 1240/*
818 * Decode READLINK reply 1241 * 3.3.17 READDIRPLUS3args
1242 *
1243 * struct READDIRPLUS3args {
1244 * nfs_fh3 dir;
1245 * cookie3 cookie;
1246 * cookieverf3 cookieverf;
1247 * count3 dircount;
1248 * count3 maxcount;
1249 * };
819 */ 1250 */
820static int 1251static void encode_readdirplus3args(struct xdr_stream *xdr,
821nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1252 const struct nfs3_readdirargs *args)
822{ 1253{
823 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1254 __be32 *p;
824 struct kvec *iov = rcvbuf->head;
825 size_t hdrlen;
826 u32 len, recvd;
827 char *kaddr;
828 int status;
829 1255
830 status = ntohl(*p++); 1256 encode_nfs_fh3(xdr, args->fh);
831 p = xdr_decode_post_op_attr(p, fattr);
832 1257
833 if (status != 0) 1258 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
834 return nfs_stat_to_errno(status); 1259 p = xdr_encode_cookie3(p, args->cookie);
1260 p = xdr_encode_cookieverf3(p, args->verf);
835 1261
836 /* Convert length of symlink */ 1262 /*
837 len = ntohl(*p++); 1263 * readdirplus: need dircount + buffer size.
838 if (len >= rcvbuf->page_len) { 1264 * We just make sure we make dircount big enough
839 dprintk("nfs: server returned giant symlink!\n"); 1265 */
840 return -ENAMETOOLONG; 1266 *p++ = cpu_to_be32(args->count >> 3);
841 }
842 1267
843 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1268 *p = cpu_to_be32(args->count);
844 if (iov->iov_len < hdrlen) { 1269}
845 dprintk("NFS: READLINK reply header overflowed:"
846 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
847 return -errno_NFSERR_IO;
848 } else if (iov->iov_len != hdrlen) {
849 dprintk("NFS: READLINK header is short. "
850 "iovec will be shifted.\n");
851 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
852 }
853 recvd = req->rq_rcv_buf.len - hdrlen;
854 if (recvd < len) {
855 dprintk("NFS: server cheating in readlink reply: "
856 "count %u > recvd %u\n", len, recvd);
857 return -EIO;
858 }
859 1270
860 /* NULL terminate the string we got */ 1271static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
861 kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0); 1272 struct xdr_stream *xdr,
862 kaddr[len+rcvbuf->page_base] = '\0'; 1273 const struct nfs3_readdirargs *args)
863 kunmap_atomic(kaddr, KM_USER0); 1274{
864 return 0; 1275 encode_readdirplus3args(xdr, args);
1276 prepare_reply_buffer(req, args->pages, 0,
1277 args->count, NFS3_readdirres_sz);
865} 1278}
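Putting the dircount >> 3 heuristic in numbers: with a hypothetical 32 KiB readdir buffer, maxcount bounds the entire READDIRPLUS3 reply while dircount budgets just the name/cookie portion, so the client simply assumes attributes and file handles dominate roughly 8:1 rather than tracking the exact split:

    /*
     *     args->count = 32768
     *     dircount    = 32768 >> 3 = 4096   (directory information only)
     *     maxcount    = 32768               (whole reply, attrs included)
     */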
866 1279
867/* 1280/*
868 * Decode READ reply 1281 * 3.3.21 COMMIT3args
1282 *
1283 * struct COMMIT3args {
1284 * nfs_fh3 file;
1285 * offset3 offset;
1286 * count3 count;
1287 * };
869 */ 1288 */
870static int 1289static void encode_commit3args(struct xdr_stream *xdr,
871nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 1290 const struct nfs_writeargs *args)
872{ 1291{
873 struct kvec *iov = req->rq_rcv_buf.head; 1292 __be32 *p;
874 size_t hdrlen;
875 u32 count, ocount, recvd;
876 int status;
877 1293
878 status = ntohl(*p++); 1294 encode_nfs_fh3(xdr, args->fh);
879 p = xdr_decode_post_op_attr(p, res->fattr);
880 1295
881 if (status != 0) 1296 p = xdr_reserve_space(xdr, 8 + 4);
882 return nfs_stat_to_errno(status); 1297 p = xdr_encode_hyper(p, args->offset);
1298 *p = cpu_to_be32(args->count);
1299}
883 1300
884 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant 1301static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
885 * in that it puts the count both in the res struct and in the 1302 struct xdr_stream *xdr,
886 * opaque data count. */ 1303 const struct nfs_writeargs *args)
887 count = ntohl(*p++); 1304{
888 res->eof = ntohl(*p++); 1305 encode_commit3args(xdr, args);
889 ocount = ntohl(*p++); 1306}
890 1307
891 if (ocount != count) { 1308#ifdef CONFIG_NFS_V3_ACL
892 dprintk("NFS: READ count doesn't match RPC opaque count.\n");
893 return -errno_NFSERR_IO;
894 }
895 1309
896 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1310static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
897 if (iov->iov_len < hdrlen) { 1311 struct xdr_stream *xdr,
898 dprintk("NFS: READ reply header overflowed:" 1312 const struct nfs3_getaclargs *args)
899 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1313{
900 return -errno_NFSERR_IO; 1314 encode_nfs_fh3(xdr, args->fh);
901 } else if (iov->iov_len != hdrlen) { 1315 encode_uint32(xdr, args->mask);
902 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 1316 if (args->mask & (NFS_ACL | NFS_DFACL))
903 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); 1317 prepare_reply_buffer(req, args->pages, 0,
904 } 1318 NFSACL_MAXPAGES << PAGE_SHIFT,
1319 ACL3_getaclres_sz);
1320}
905 1321
906 recvd = req->rq_rcv_buf.len - hdrlen; 1322static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
907 if (count > recvd) { 1323 struct xdr_stream *xdr,
908 dprintk("NFS: server cheating in read reply: " 1324 const struct nfs3_setaclargs *args)
909 "count %u > recvd %u\n", count, recvd); 1325{
910 count = recvd; 1326 unsigned int base;
911 res->eof = 0; 1327 int error;
912 }
913 1328
914 if (count < res->count) 1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
915 res->count = count; 1330 encode_uint32(xdr, args->mask);
916 1331
917 return count; 1332 base = req->rq_slen;
1333 if (args->npages != 0)
1334 xdr_write_pages(xdr, args->pages, 0, args->len);
1335 else
1336 xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
1337
1338 error = nfsacl_encode(xdr->buf, base, args->inode,
1339 (args->mask & NFS_ACL) ?
1340 args->acl_access : NULL, 1, 0);
1341 BUG_ON(error < 0);
1342 error = nfsacl_encode(xdr->buf, base + error, args->inode,
1343 (args->mask & NFS_DFACL) ?
1344 args->acl_default : NULL, 1,
1345 NFS_ACL_DEFAULT);
1346 BUG_ON(error < 0);
918} 1347}
919 1348
1349#endif /* CONFIG_NFS_V3_ACL */
1350
920/* 1351/*
921 * Decode WRITE response 1352 * NFSv3 XDR decode functions
1353 *
1354 * NFSv3 result types are defined in section 3.3 of RFC 1813:
1355 * "NFS Version 3 Protocol Specification".
922 */ 1356 */
923static int
924nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
925{
926 int status;
927 1357
928 status = ntohl(*p++); 1358/*
929 p = xdr_decode_wcc_data(p, res->fattr); 1359 * 3.3.1 GETATTR3res
1360 *
1361 * struct GETATTR3resok {
1362 * fattr3 obj_attributes;
1363 * };
1364 *
1365 * union GETATTR3res switch (nfsstat3 status) {
1366 * case NFS3_OK:
1367 * GETATTR3resok resok;
1368 * default:
1369 * void;
1370 * };
1371 */
1372static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
1373 struct xdr_stream *xdr,
1374 struct nfs_fattr *result)
1375{
1376 enum nfs_stat status;
1377 int error;
1378
1379 error = decode_nfsstat3(xdr, &status);
1380 if (unlikely(error))
1381 goto out;
1382 if (status != NFS3_OK)
1383 goto out_default;
1384 error = decode_fattr3(xdr, result);
1385out:
1386 return error;
1387out_default:
1388 return nfs_stat_to_errno(status);
1389}
930 1390
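
Every nfs3_xdr_dec_*3res routine that follows has the same skeleton: decode the nfsstat3 discriminant, decode any fields present in both arms of the result union, then either continue into the resok body or map the status through nfs_stat_to_errno(). A stand-alone sketch of that control flow; decode_u32() is a stand-in for the kernel's decode_nfsstat3()/decode_uint32() helpers, and -5 stands in for -EIO:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    struct stream { const uint8_t *p, *end; };

    /* Fails like the kernel helpers do when the buffer is exhausted. */
    static int decode_u32(struct stream *s, uint32_t *out)
    {
        uint32_t be;
        if (s->end - s->p < 4)
            return -5;                    /* -EIO */
        memcpy(&be, s->p, 4);
        s->p += 4;
        *out = ntohl(be);
        return 0;
    }

    int main(void)
    {
        uint8_t buf[8] = { 0, 0, 0, 0,  0, 0, 0, 42 };  /* status=NFS3_OK, payload=42 */
        struct stream s = { buf, buf + sizeof(buf) };
        uint32_t status, payload;
        int error = decode_u32(&s, &status);
        if (error)
            return 1;
        if (status != 0)                  /* != NFS3_OK: map to errno instead */
            return 1;
        error = decode_u32(&s, &payload);
        printf("error=%d payload=%u\n", error, payload);
        return error ? 1 : 0;
    }
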
931 if (status != 0) 1391/*
932 return nfs_stat_to_errno(status); 1392 * 3.3.2 SETATTR3res
1393 *
1394 * struct SETATTR3resok {
1395 * wcc_data obj_wcc;
1396 * };
1397 *
1398 * struct SETATTR3resfail {
1399 * wcc_data obj_wcc;
1400 * };
1401 *
1402 * union SETATTR3res switch (nfsstat3 status) {
1403 * case NFS3_OK:
1404 * SETATTR3resok resok;
1405 * default:
1406 * SETATTR3resfail resfail;
1407 * };
1408 */
1409static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
1410 struct xdr_stream *xdr,
1411 struct nfs_fattr *result)
1412{
1413 enum nfs_stat status;
1414 int error;
1415
1416 error = decode_nfsstat3(xdr, &status);
1417 if (unlikely(error))
1418 goto out;
1419 error = decode_wcc_data(xdr, result);
1420 if (unlikely(error))
1421 goto out;
1422 if (status != NFS3_OK)
1423 goto out_status;
1424out:
1425 return error;
1426out_status:
1427 return nfs_stat_to_errno(status);
1428}
933 1429
934 res->count = ntohl(*p++); 1430/*
935 res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); 1431 * 3.3.3 LOOKUP3res
936 res->verf->verifier[0] = *p++; 1432 *
937 res->verf->verifier[1] = *p++; 1433 * struct LOOKUP3resok {
1434 * nfs_fh3 object;
1435 * post_op_attr obj_attributes;
1436 * post_op_attr dir_attributes;
1437 * };
1438 *
1439 * struct LOOKUP3resfail {
1440 * post_op_attr dir_attributes;
1441 * };
1442 *
1443 * union LOOKUP3res switch (nfsstat3 status) {
1444 * case NFS3_OK:
1445 * LOOKUP3resok resok;
1446 * default:
1447 * LOOKUP3resfail resfail;
1448 * };
1449 */
1450static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
1451 struct xdr_stream *xdr,
1452 struct nfs3_diropres *result)
1453{
1454 enum nfs_stat status;
1455 int error;
1456
1457 error = decode_nfsstat3(xdr, &status);
1458 if (unlikely(error))
1459 goto out;
1460 if (status != NFS3_OK)
1461 goto out_default;
1462 error = decode_nfs_fh3(xdr, result->fh);
1463 if (unlikely(error))
1464 goto out;
1465 error = decode_post_op_attr(xdr, result->fattr);
1466 if (unlikely(error))
1467 goto out;
1468 error = decode_post_op_attr(xdr, result->dir_attr);
1469out:
1470 return error;
1471out_default:
1472 error = decode_post_op_attr(xdr, result->dir_attr);
1473 if (unlikely(error))
1474 goto out;
1475 return nfs_stat_to_errno(status);
1476}
938 1477
939 return res->count; 1478/*
1479 * 3.3.4 ACCESS3res
1480 *
1481 * struct ACCESS3resok {
1482 * post_op_attr obj_attributes;
1483 * uint32 access;
1484 * };
1485 *
1486 * struct ACCESS3resfail {
1487 * post_op_attr obj_attributes;
1488 * };
1489 *
1490 * union ACCESS3res switch (nfsstat3 status) {
1491 * case NFS3_OK:
1492 * ACCESS3resok resok;
1493 * default:
1494 * ACCESS3resfail resfail;
1495 * };
1496 */
1497static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
1498 struct xdr_stream *xdr,
1499 struct nfs3_accessres *result)
1500{
1501 enum nfs_stat status;
1502 int error;
1503
1504 error = decode_nfsstat3(xdr, &status);
1505 if (unlikely(error))
1506 goto out;
1507 error = decode_post_op_attr(xdr, result->fattr);
1508 if (unlikely(error))
1509 goto out;
1510 if (status != NFS3_OK)
1511 goto out_default;
1512 error = decode_uint32(xdr, &result->access);
1513out:
1514 return error;
1515out_default:
1516 return nfs_stat_to_errno(status);
940} 1517}
941 1518
942/* 1519/*
943 * Decode a CREATE response 1520 * 3.3.5 READLINK3res
1521 *
1522 * struct READLINK3resok {
1523 * post_op_attr symlink_attributes;
1524 * nfspath3 data;
1525 * };
1526 *
1527 * struct READLINK3resfail {
1528 * post_op_attr symlink_attributes;
1529 * };
1530 *
1531 * union READLINK3res switch (nfsstat3 status) {
1532 * case NFS3_OK:
1533 * READLINK3resok resok;
1534 * default:
1535 * READLINK3resfail resfail;
1536 * };
944 */ 1537 */
945static int 1538static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
946nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1539 struct xdr_stream *xdr,
947{ 1540 struct nfs_fattr *result)
948 int status; 1541{
949 1542 enum nfs_stat status;
950 status = ntohl(*p++); 1543 int error;
951 if (status == 0) { 1544
952 if (*p++) { 1545 error = decode_nfsstat3(xdr, &status);
953 if (!(p = xdr_decode_fhandle(p, res->fh))) 1546 if (unlikely(error))
954 return -errno_NFSERR_IO; 1547 goto out;
955 p = xdr_decode_post_op_attr(p, res->fattr); 1548 error = decode_post_op_attr(xdr, result);
956 } else { 1549 if (unlikely(error))
957 memset(res->fh, 0, sizeof(*res->fh)); 1550 goto out;
958 /* Do decode post_op_attr but set it to NULL */ 1551 if (status != NFS3_OK)
959 p = xdr_decode_post_op_attr(p, res->fattr); 1552 goto out_default;
960 res->fattr->valid = 0; 1553 error = decode_nfspath3(xdr);
961 } 1554out:
962 } else { 1555 return error;
963 status = nfs_stat_to_errno(status); 1556out_default:
964 } 1557 return nfs_stat_to_errno(status);
965 p = xdr_decode_wcc_data(p, res->dir_attr);
966 return status;
967} 1558}
968 1559
969/* 1560/*
970 * Decode RENAME reply 1561 * 3.3.6 READ3res
1562 *
1563 * struct READ3resok {
1564 * post_op_attr file_attributes;
1565 * count3 count;
1566 * bool eof;
1567 * opaque data<>;
1568 * };
1569 *
1570 * struct READ3resfail {
1571 * post_op_attr file_attributes;
1572 * };
1573 *
1574 * union READ3res switch (nfsstat3 status) {
1575 * case NFS3_OK:
1576 * READ3resok resok;
1577 * default:
1578 * READ3resfail resfail;
1579 * };
971 */ 1580 */
972static int 1581static int decode_read3resok(struct xdr_stream *xdr,
973nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) 1582 struct nfs_readres *result)
974{ 1583{
975 int status; 1584 u32 eof, count, ocount, recvd;
1585 size_t hdrlen;
1586 __be32 *p;
1587
1588 p = xdr_inline_decode(xdr, 4 + 4 + 4);
1589 if (unlikely(p == NULL))
1590 goto out_overflow;
1591 count = be32_to_cpup(p++);
1592 eof = be32_to_cpup(p++);
1593 ocount = be32_to_cpup(p++);
1594 if (unlikely(ocount != count))
1595 goto out_mismatch;
1596 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1597 recvd = xdr->buf->len - hdrlen;
1598 if (unlikely(count > recvd))
1599 goto out_cheating;
1600
1601out:
1602 xdr_read_pages(xdr, count);
1603 result->eof = eof;
1604 result->count = count;
1605 return count;
1606out_mismatch:
1607 dprintk("NFS: READ count doesn't match length of opaque: "
1608 "count %u != ocount %u\n", count, ocount);
1609 return -EIO;
1610out_cheating:
1611 dprintk("NFS: server cheating in read result: "
1612 "count %u > recvd %u\n", count, recvd);
1613 count = recvd;
1614 eof = 0;
1615 goto out;
1616out_overflow:
1617 print_overflow_msg(__func__, xdr);
1618 return -EIO;
1619}
976 1620
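
decode_read3resok() distinguishes two server bugs: an opaque length that disagrees with the count field is a hard -EIO, while a count larger than what was actually received is clamped (with EOF cleared) so a truncated reply still yields a short read. The clamp in isolation, as a sketch; recvd stands for xdr->buf->len minus the header already consumed:

    #include <stdio.h>

    static int check_read(unsigned int count, unsigned int ocount,
                          unsigned int recvd, unsigned int *eof)
    {
        if (ocount != count)
            return -5;        /* -EIO: opaque length mismatch */
        if (count > recvd) {  /* server claimed more than it sent */
            count = recvd;
            *eof = 0;         /* can't trust EOF on a truncated reply */
        }
        return (int)count;
    }

    int main(void)
    {
        unsigned int eof = 1;
        printf("clamped count=%d eof=%u\n",
               check_read(4096, 4096, 1024, &eof), eof);
        return 0;
    }
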
977 if ((status = ntohl(*p++)) != 0) 1621static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
978 status = nfs_stat_to_errno(status); 1622 struct nfs_readres *result)
979 p = xdr_decode_wcc_data(p, res->fromattr); 1623{
980 p = xdr_decode_wcc_data(p, res->toattr); 1624 enum nfs_stat status;
981 return status; 1625 int error;
1626
1627 error = decode_nfsstat3(xdr, &status);
1628 if (unlikely(error))
1629 goto out;
1630 error = decode_post_op_attr(xdr, result->fattr);
1631 if (unlikely(error))
1632 goto out;
1633 if (status != NFS3_OK)
1634 goto out_status;
1635 error = decode_read3resok(xdr, result);
1636out:
1637 return error;
1638out_status:
1639 return nfs_stat_to_errno(status);
982} 1640}
983 1641
984/* 1642/*
985 * Decode LINK reply 1643 * 3.3.7 WRITE3res
1644 *
1645 * enum stable_how {
1646 * UNSTABLE = 0,
1647 * DATA_SYNC = 1,
1648 * FILE_SYNC = 2
1649 * };
1650 *
1651 * struct WRITE3resok {
1652 * wcc_data file_wcc;
1653 * count3 count;
1654 * stable_how committed;
1655 * writeverf3 verf;
1656 * };
1657 *
1658 * struct WRITE3resfail {
1659 * wcc_data file_wcc;
1660 * };
1661 *
1662 * union WRITE3res switch (nfsstat3 status) {
1663 * case NFS3_OK:
1664 * WRITE3resok resok;
1665 * default:
1666 * WRITE3resfail resfail;
1667 * };
986 */ 1668 */
987static int 1669static int decode_write3resok(struct xdr_stream *xdr,
988nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) 1670 struct nfs_writeres *result)
989{ 1671{
990 int status; 1672 __be32 *p;
1673
1674 p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
1675 if (unlikely(p == NULL))
1676 goto out_overflow;
1677 result->count = be32_to_cpup(p++);
1678 result->verf->committed = be32_to_cpup(p++);
1679 if (unlikely(result->verf->committed > NFS_FILE_SYNC))
1680 goto out_badvalue;
1681 memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
1682 return result->count;
1683out_badvalue:
1684 dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
1685 return -EIO;
1686out_overflow:
1687 print_overflow_msg(__func__, xdr);
1688 return -EIO;
1689}
991 1690
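
decode_write3resok() also range-checks the committed field against the stable_how enum, so later COMMIT logic never sees an out-of-range value. The same check reduced to a sketch:

    #include <stdio.h>

    enum stable_how { UNSTABLE = 0, DATA_SYNC = 1, FILE_SYNC = 2 };

    /* Reject any committed value the protocol doesn't define. */
    static int check_committed(unsigned int committed)
    {
        if (committed > FILE_SYNC)
            return -5;  /* -EIO, as in the out_badvalue path above */
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", check_committed(FILE_SYNC), check_committed(7));
        return 0;
    }
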
992 if ((status = ntohl(*p++)) != 0) 1691static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
993 status = nfs_stat_to_errno(status); 1692 struct nfs_writeres *result)
994 p = xdr_decode_post_op_attr(p, res->fattr); 1693{
995 p = xdr_decode_wcc_data(p, res->dir_attr); 1694 enum nfs_stat status;
996 return status; 1695 int error;
1696
1697 error = decode_nfsstat3(xdr, &status);
1698 if (unlikely(error))
1699 goto out;
1700 error = decode_wcc_data(xdr, result->fattr);
1701 if (unlikely(error))
1702 goto out;
1703 if (status != NFS3_OK)
1704 goto out_status;
1705 error = decode_write3resok(xdr, result);
1706out:
1707 return error;
1708out_status:
1709 return nfs_stat_to_errno(status);
997} 1710}
998 1711
999/* 1712/*
1000 * Decode FSSTAT reply 1713 * 3.3.8 CREATE3res
1714 *
1715 * struct CREATE3resok {
1716 * post_op_fh3 obj;
1717 * post_op_attr obj_attributes;
1718 * wcc_data dir_wcc;
1719 * };
1720 *
1721 * struct CREATE3resfail {
1722 * wcc_data dir_wcc;
1723 * };
1724 *
1725 * union CREATE3res switch (nfsstat3 status) {
1726 * case NFS3_OK:
1727 * CREATE3resok resok;
1728 * default:
1729 * CREATE3resfail resfail;
1730 * };
1001 */ 1731 */
1002static int 1732static int decode_create3resok(struct xdr_stream *xdr,
1003nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) 1733 struct nfs3_diropres *result)
1004{ 1734{
1005 int status; 1735 int error;
1006 1736
1007 status = ntohl(*p++); 1737 error = decode_post_op_fh3(xdr, result->fh);
1738 if (unlikely(error))
1739 goto out;
1740 error = decode_post_op_attr(xdr, result->fattr);
1741 if (unlikely(error))
1742 goto out;
1743 /* The server isn't required to return a file handle.
1744 * If it didn't, force the client to perform a LOOKUP
1745 * to determine the correct file handle and attribute
1746 * values for the new object. */
1747 if (result->fh->size == 0)
1748 result->fattr->valid = 0;
1749 error = decode_wcc_data(xdr, result->dir_attr);
1750out:
1751 return error;
1752}
1008 1753
1009 p = xdr_decode_post_op_attr(p, res->fattr); 1754static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
1010 if (status != 0) 1755 struct xdr_stream *xdr,
1011 return nfs_stat_to_errno(status); 1756 struct nfs3_diropres *result)
1757{
1758 enum nfs_stat status;
1759 int error;
1760
1761 error = decode_nfsstat3(xdr, &status);
1762 if (unlikely(error))
1763 goto out;
1764 if (status != NFS3_OK)
1765 goto out_default;
1766 error = decode_create3resok(xdr, result);
1767out:
1768 return error;
1769out_default:
1770 error = decode_wcc_data(xdr, result->dir_attr);
1771 if (unlikely(error))
1772 goto out;
1773 return nfs_stat_to_errno(status);
1774}
1012 1775
1013 p = xdr_decode_hyper(p, &res->tbytes); 1776/*
1014 p = xdr_decode_hyper(p, &res->fbytes); 1777 * 3.3.12 REMOVE3res
1015 p = xdr_decode_hyper(p, &res->abytes); 1778 *
1016 p = xdr_decode_hyper(p, &res->tfiles); 1779 * struct REMOVE3resok {
1017 p = xdr_decode_hyper(p, &res->ffiles); 1780 * wcc_data dir_wcc;
1018 p = xdr_decode_hyper(p, &res->afiles); 1781 * };
1782 *
1783 * struct REMOVE3resfail {
1784 * wcc_data dir_wcc;
1785 * };
1786 *
1787 * union REMOVE3res switch (nfsstat3 status) {
1788 * case NFS3_OK:
1789 * REMOVE3resok resok;
1790 * default:
1791 * REMOVE3resfail resfail;
1792 * };
1793 */
1794static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
1795 struct xdr_stream *xdr,
1796 struct nfs_removeres *result)
1797{
1798 enum nfs_stat status;
1799 int error;
1800
1801 error = decode_nfsstat3(xdr, &status);
1802 if (unlikely(error))
1803 goto out;
1804 error = decode_wcc_data(xdr, result->dir_attr);
1805 if (unlikely(error))
1806 goto out;
1807 if (status != NFS3_OK)
1808 goto out_status;
1809out:
1810 return error;
1811out_status:
1812 return nfs_stat_to_errno(status);
1813}
1019 1814
1020 /* ignore invarsec */ 1815/*
1021 return 0; 1816 * 3.3.14 RENAME3res
1817 *
1818 * struct RENAME3resok {
1819 * wcc_data fromdir_wcc;
1820 * wcc_data todir_wcc;
1821 * };
1822 *
1823 * struct RENAME3resfail {
1824 * wcc_data fromdir_wcc;
1825 * wcc_data todir_wcc;
1826 * };
1827 *
1828 * union RENAME3res switch (nfsstat3 status) {
1829 * case NFS3_OK:
1830 * RENAME3resok resok;
1831 * default:
1832 * RENAME3resfail resfail;
1833 * };
1834 */
1835static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
1836 struct xdr_stream *xdr,
1837 struct nfs_renameres *result)
1838{
1839 enum nfs_stat status;
1840 int error;
1841
1842 error = decode_nfsstat3(xdr, &status);
1843 if (unlikely(error))
1844 goto out;
1845 error = decode_wcc_data(xdr, result->old_fattr);
1846 if (unlikely(error))
1847 goto out;
1848 error = decode_wcc_data(xdr, result->new_fattr);
1849 if (unlikely(error))
1850 goto out;
1851 if (status != NFS3_OK)
1852 goto out_status;
1853out:
1854 return error;
1855out_status:
1856 return nfs_stat_to_errno(status);
1022} 1857}
1023 1858
1024/* 1859/*
1025 * Decode FSINFO reply 1860 * 3.3.15 LINK3res
1861 *
1862 * struct LINK3resok {
1863 * post_op_attr file_attributes;
1864 * wcc_data linkdir_wcc;
1865 * };
1866 *
1867 * struct LINK3resfail {
1868 * post_op_attr file_attributes;
1869 * wcc_data linkdir_wcc;
1870 * };
1871 *
1872 * union LINK3res switch (nfsstat3 status) {
1873 * case NFS3_OK:
1874 * LINK3resok resok;
1875 * default:
1876 * LINK3resfail resfail;
1877 * };
1026 */ 1878 */
1027static int 1879static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1028nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) 1880 struct nfs3_linkres *result)
1029{ 1881{
1030 int status; 1882 enum nfs_stat status;
1883 int error;
1884
1885 error = decode_nfsstat3(xdr, &status);
1886 if (unlikely(error))
1887 goto out;
1888 error = decode_post_op_attr(xdr, result->fattr);
1889 if (unlikely(error))
1890 goto out;
1891 error = decode_wcc_data(xdr, result->dir_attr);
1892 if (unlikely(error))
1893 goto out;
1894 if (status != NFS3_OK)
1895 goto out_status;
1896out:
1897 return error;
1898out_status:
1899 return nfs_stat_to_errno(status);
1900}
1901
1902/**
1903 * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
1904 * the local page cache
1905 * @xdr: XDR stream where entry resides
1906 * @entry: buffer to fill in with entry data
1907 * @plus: boolean indicating whether this should be a readdirplus entry
1908 *
1909 * Returns zero if successful, otherwise a negative errno value is
1910 * returned.
1911 *
1912 * This function is not invoked during READDIR reply decoding, but
1913 * rather whenever an application invokes the getdents(2) system call
1914 * on a directory already in our cache.
1915 *
1916 * 3.3.16 entry3
1917 *
1918 * struct entry3 {
1919 * fileid3 fileid;
1920 * filename3 name;
1921 * cookie3 cookie;
1922 * fhandle3 filehandle;
1923 * post_op_attr3 attributes;
1924 * entry3 *nextentry;
1925 * };
1926 *
1927 * 3.3.17 entryplus3
1928 * struct entryplus3 {
1929 * fileid3 fileid;
1930 * filename3 name;
1931 * cookie3 cookie;
1932 * post_op_attr name_attributes;
1933 * post_op_fh3 name_handle;
1934 * entryplus3 *nextentry;
1935 * };
1936 */
1937int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
1938 int plus)
1939{
1940 struct nfs_entry old = *entry;
1941 __be32 *p;
1942 int error;
1943
1944 p = xdr_inline_decode(xdr, 4);
1945 if (unlikely(p == NULL))
1946 goto out_overflow;
1947 if (*p == xdr_zero) {
1948 p = xdr_inline_decode(xdr, 4);
1949 if (unlikely(p == NULL))
1950 goto out_overflow;
1951 if (*p == xdr_zero)
1952 return -EAGAIN;
1953 entry->eof = 1;
1954 return -EBADCOOKIE;
1955 }
1031 1956
1032 status = ntohl(*p++); 1957 error = decode_fileid3(xdr, &entry->ino);
1958 if (unlikely(error))
1959 return error;
1033 1960
1034 p = xdr_decode_post_op_attr(p, res->fattr); 1961 error = decode_inline_filename3(xdr, &entry->name, &entry->len);
1035 if (status != 0) 1962 if (unlikely(error))
1036 return nfs_stat_to_errno(status); 1963 return error;
1037 1964
1038 res->rtmax = ntohl(*p++); 1965 entry->prev_cookie = entry->cookie;
1039 res->rtpref = ntohl(*p++); 1966 error = decode_cookie3(xdr, &entry->cookie);
1040 res->rtmult = ntohl(*p++); 1967 if (unlikely(error))
1041 res->wtmax = ntohl(*p++); 1968 return error;
1042 res->wtpref = ntohl(*p++); 1969
1043 res->wtmult = ntohl(*p++); 1970 entry->d_type = DT_UNKNOWN;
1044 res->dtpref = ntohl(*p++); 1971
1045 p = xdr_decode_hyper(p, &res->maxfilesize); 1972 if (plus) {
1973 entry->fattr->valid = 0;
1974 error = decode_post_op_attr(xdr, entry->fattr);
1975 if (unlikely(error))
1976 return error;
1977 if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
1978 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
1979
1980 /* In fact, a post_op_fh3: */
1981 p = xdr_inline_decode(xdr, 4);
1982 if (unlikely(p == NULL))
1983 goto out_overflow;
1984 if (*p != xdr_zero) {
1985 error = decode_nfs_fh3(xdr, entry->fh);
1986 if (unlikely(error)) {
1987 if (error == -E2BIG)
1988 goto out_truncated;
1989 return error;
1990 }
1991 } else
1992 zero_nfs_fh3(entry->fh);
1993 }
1046 1994
1047 /* ignore time_delta and properties */
1048 res->lease_time = 0;
1049 return 0; 1995 return 0;
1996
1997out_overflow:
1998 print_overflow_msg(__func__, xdr);
1999 return -EAGAIN;
2000out_truncated:
2001 dprintk("NFS: directory entry contains invalid file handle\n");
2002 *entry = old;
2003 return -EAGAIN;
1050} 2004}
1051 2005
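
The entry decoder above walks an XDR linked list: every entry is preceded by a one-word "value follows" boolean, and once that word reads zero the dirlist's eof boolean comes next. A user-space sketch of that framing, with host-endian words and bare fileids standing in for full entries:

    #include <stdint.h>
    #include <stdio.h>

    /* Each element: [1][fileid] ... ; terminator: [0][eof]. */
    int main(void)
    {
        uint32_t stream[] = { 1, 101, 1, 102, 0, 1 };  /* two entries, then eof=1 */
        size_t i = 0;

        while (stream[i] != 0) {            /* "value follows" discriminant */
            uint32_t fileid = stream[i + 1];
            printf("entry fileid=%u\n", fileid);
            i += 2;
        }
        printf("eof=%u\n", stream[i + 1]);  /* boolean after the terminator */
        return 0;
    }
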
1052/* 2006/*
1053 * Decode PATHCONF reply 2007 * 3.3.16 READDIR3res
2008 *
2009 * struct dirlist3 {
2010 * entry3 *entries;
2011 * bool eof;
2012 * };
2013 *
2014 * struct READDIR3resok {
2015 * post_op_attr dir_attributes;
2016 * cookieverf3 cookieverf;
2017 * dirlist3 reply;
2018 * };
2019 *
2020 * struct READDIR3resfail {
2021 * post_op_attr dir_attributes;
2022 * };
2023 *
2024 * union READDIR3res switch (nfsstat3 status) {
2025 * case NFS3_OK:
2026 * READDIR3resok resok;
2027 * default:
2028 * READDIR3resfail resfail;
2029 * };
2030 *
2031 * Read the directory contents into the page cache, but otherwise
 2032 * don't touch them. The actual decoding is done by nfs3_decode_dirent()
2033 * during subsequent nfs_readdir() calls.
1054 */ 2034 */
1055static int 2035static int decode_dirlist3(struct xdr_stream *xdr)
1056nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
1057{ 2036{
1058 int status; 2037 u32 recvd, pglen;
2038 size_t hdrlen;
1059 2039
1060 status = ntohl(*p++); 2040 pglen = xdr->buf->page_len;
2041 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
2042 recvd = xdr->buf->len - hdrlen;
2043 if (unlikely(pglen > recvd))
2044 goto out_cheating;
2045out:
2046 xdr_read_pages(xdr, pglen);
2047 return pglen;
2048out_cheating:
2049 dprintk("NFS: server cheating in readdir result: "
2050 "pglen %u > recvd %u\n", pglen, recvd);
2051 pglen = recvd;
2052 goto out;
2053}
1061 2054
1062 p = xdr_decode_post_op_attr(p, res->fattr); 2055static int decode_readdir3resok(struct xdr_stream *xdr,
1063 if (status != 0) 2056 struct nfs3_readdirres *result)
1064 return nfs_stat_to_errno(status); 2057{
1065 res->max_link = ntohl(*p++); 2058 int error;
1066 res->max_namelen = ntohl(*p++); 2059
2060 error = decode_post_op_attr(xdr, result->dir_attr);
2061 if (unlikely(error))
2062 goto out;
2063 /* XXX: do we need to check if result->verf != NULL ? */
2064 error = decode_cookieverf3(xdr, result->verf);
2065 if (unlikely(error))
2066 goto out;
2067 error = decode_dirlist3(xdr);
2068out:
2069 return error;
2070}
1067 2071
1068 /* ignore remaining fields */ 2072static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
1069 return 0; 2073 struct xdr_stream *xdr,
2074 struct nfs3_readdirres *result)
2075{
2076 enum nfs_stat status;
2077 int error;
2078
2079 error = decode_nfsstat3(xdr, &status);
2080 if (unlikely(error))
2081 goto out;
2082 if (status != NFS3_OK)
2083 goto out_default;
2084 error = decode_readdir3resok(xdr, result);
2085out:
2086 return error;
2087out_default:
2088 error = decode_post_op_attr(xdr, result->dir_attr);
2089 if (unlikely(error))
2090 goto out;
2091 return nfs_stat_to_errno(status);
1070} 2092}
1071 2093
1072/* 2094/*
1073 * Decode COMMIT reply 2095 * 3.3.18 FSSTAT3res
2096 *
2097 * struct FSSTAT3resok {
2098 * post_op_attr obj_attributes;
2099 * size3 tbytes;
2100 * size3 fbytes;
2101 * size3 abytes;
2102 * size3 tfiles;
2103 * size3 ffiles;
2104 * size3 afiles;
2105 * uint32 invarsec;
2106 * };
2107 *
2108 * struct FSSTAT3resfail {
2109 * post_op_attr obj_attributes;
2110 * };
2111 *
2112 * union FSSTAT3res switch (nfsstat3 status) {
2113 * case NFS3_OK:
2114 * FSSTAT3resok resok;
2115 * default:
2116 * FSSTAT3resfail resfail;
2117 * };
1074 */ 2118 */
1075static int 2119static int decode_fsstat3resok(struct xdr_stream *xdr,
1076nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) 2120 struct nfs_fsstat *result)
1077{ 2121{
1078 int status; 2122 __be32 *p;
2123
2124 p = xdr_inline_decode(xdr, 8 * 6 + 4);
2125 if (unlikely(p == NULL))
2126 goto out_overflow;
2127 p = xdr_decode_size3(p, &result->tbytes);
2128 p = xdr_decode_size3(p, &result->fbytes);
2129 p = xdr_decode_size3(p, &result->abytes);
2130 p = xdr_decode_size3(p, &result->tfiles);
2131 p = xdr_decode_size3(p, &result->ffiles);
2132 xdr_decode_size3(p, &result->afiles);
2133 /* ignore invarsec */
2134 return 0;
2135out_overflow:
2136 print_overflow_msg(__func__, xdr);
2137 return -EIO;
2138}
1079 2139
1080 status = ntohl(*p++); 2140static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
1081 p = xdr_decode_wcc_data(p, res->fattr); 2141 struct xdr_stream *xdr,
1082 if (status != 0) 2142 struct nfs_fsstat *result)
1083 return nfs_stat_to_errno(status); 2143{
2144 enum nfs_stat status;
2145 int error;
2146
2147 error = decode_nfsstat3(xdr, &status);
2148 if (unlikely(error))
2149 goto out;
2150 error = decode_post_op_attr(xdr, result->fattr);
2151 if (unlikely(error))
2152 goto out;
2153 if (status != NFS3_OK)
2154 goto out_status;
2155 error = decode_fsstat3resok(xdr, result);
2156out:
2157 return error;
2158out_status:
2159 return nfs_stat_to_errno(status);
2160}
1084 2161
1085 res->verf->verifier[0] = *p++; 2162/*
1086 res->verf->verifier[1] = *p++; 2163 * 3.3.19 FSINFO3res
2164 *
2165 * struct FSINFO3resok {
2166 * post_op_attr obj_attributes;
2167 * uint32 rtmax;
2168 * uint32 rtpref;
2169 * uint32 rtmult;
2170 * uint32 wtmax;
2171 * uint32 wtpref;
2172 * uint32 wtmult;
2173 * uint32 dtpref;
2174 * size3 maxfilesize;
2175 * nfstime3 time_delta;
2176 * uint32 properties;
2177 * };
2178 *
2179 * struct FSINFO3resfail {
2180 * post_op_attr obj_attributes;
2181 * };
2182 *
2183 * union FSINFO3res switch (nfsstat3 status) {
2184 * case NFS3_OK:
2185 * FSINFO3resok resok;
2186 * default:
2187 * FSINFO3resfail resfail;
2188 * };
2189 */
2190static int decode_fsinfo3resok(struct xdr_stream *xdr,
2191 struct nfs_fsinfo *result)
2192{
2193 __be32 *p;
2194
2195 p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
2196 if (unlikely(p == NULL))
2197 goto out_overflow;
2198 result->rtmax = be32_to_cpup(p++);
2199 result->rtpref = be32_to_cpup(p++);
2200 result->rtmult = be32_to_cpup(p++);
2201 result->wtmax = be32_to_cpup(p++);
2202 result->wtpref = be32_to_cpup(p++);
2203 result->wtmult = be32_to_cpup(p++);
2204 result->dtpref = be32_to_cpup(p++);
2205 p = xdr_decode_size3(p, &result->maxfilesize);
2206 xdr_decode_nfstime3(p, &result->time_delta);
2207
2208 /* ignore properties */
2209 result->lease_time = 0;
1087 return 0; 2210 return 0;
2211out_overflow:
2212 print_overflow_msg(__func__, xdr);
2213 return -EIO;
2214}
2215
2216static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
2217 struct xdr_stream *xdr,
2218 struct nfs_fsinfo *result)
2219{
2220 enum nfs_stat status;
2221 int error;
2222
2223 error = decode_nfsstat3(xdr, &status);
2224 if (unlikely(error))
2225 goto out;
2226 error = decode_post_op_attr(xdr, result->fattr);
2227 if (unlikely(error))
2228 goto out;
2229 if (status != NFS3_OK)
2230 goto out_status;
2231 error = decode_fsinfo3resok(xdr, result);
2232out:
2233 return error;
2234out_status:
2235 return nfs_stat_to_errno(status);
1088} 2236}
1089 2237
1090#ifdef CONFIG_NFS_V3_ACL
1091/* 2238/*
1092 * Decode GETACL reply 2239 * 3.3.20 PATHCONF3res
2240 *
2241 * struct PATHCONF3resok {
2242 * post_op_attr obj_attributes;
2243 * uint32 linkmax;
2244 * uint32 name_max;
2245 * bool no_trunc;
2246 * bool chown_restricted;
2247 * bool case_insensitive;
2248 * bool case_preserving;
2249 * };
2250 *
2251 * struct PATHCONF3resfail {
2252 * post_op_attr obj_attributes;
2253 * };
2254 *
2255 * union PATHCONF3res switch (nfsstat3 status) {
2256 * case NFS3_OK:
2257 * PATHCONF3resok resok;
2258 * default:
2259 * PATHCONF3resfail resfail;
2260 * };
1093 */ 2261 */
1094static int 2262static int decode_pathconf3resok(struct xdr_stream *xdr,
1095nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, 2263 struct nfs_pathconf *result)
1096 struct nfs3_getaclres *res)
1097{ 2264{
1098 struct xdr_buf *buf = &req->rq_rcv_buf; 2265 __be32 *p;
1099 int status = ntohl(*p++);
1100 struct posix_acl **acl;
1101 unsigned int *aclcnt;
1102 int err, base;
1103
1104 if (status != 0)
1105 return nfs_stat_to_errno(status);
1106 p = xdr_decode_post_op_attr(p, res->fattr);
1107 res->mask = ntohl(*p++);
1108 if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
1109 return -EINVAL;
1110 base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base;
1111 2266
1112 acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; 2267 p = xdr_inline_decode(xdr, 4 * 6);
1113 aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; 2268 if (unlikely(p == NULL))
1114 err = nfsacl_decode(buf, base, aclcnt, acl); 2269 goto out_overflow;
2270 result->max_link = be32_to_cpup(p++);
2271 result->max_namelen = be32_to_cpup(p);
2272 /* ignore remaining fields */
2273 return 0;
2274out_overflow:
2275 print_overflow_msg(__func__, xdr);
2276 return -EIO;
2277}
1115 2278
1116 acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; 2279static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
1117 aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; 2280 struct xdr_stream *xdr,
1118 if (err > 0) 2281 struct nfs_pathconf *result)
1119 err = nfsacl_decode(buf, base + err, aclcnt, acl); 2282{
1120 return (err > 0) ? 0 : err; 2283 enum nfs_stat status;
2284 int error;
2285
2286 error = decode_nfsstat3(xdr, &status);
2287 if (unlikely(error))
2288 goto out;
2289 error = decode_post_op_attr(xdr, result->fattr);
2290 if (unlikely(error))
2291 goto out;
2292 if (status != NFS3_OK)
2293 goto out_status;
2294 error = decode_pathconf3resok(xdr, result);
2295out:
2296 return error;
2297out_status:
2298 return nfs_stat_to_errno(status);
1121} 2299}
1122 2300
1123/* 2301/*
1124 * Decode setacl reply. 2302 * 3.3.21 COMMIT3res
2303 *
2304 * struct COMMIT3resok {
2305 * wcc_data file_wcc;
2306 * writeverf3 verf;
2307 * };
2308 *
2309 * struct COMMIT3resfail {
2310 * wcc_data file_wcc;
2311 * };
2312 *
2313 * union COMMIT3res switch (nfsstat3 status) {
2314 * case NFS3_OK:
2315 * COMMIT3resok resok;
2316 * default:
2317 * COMMIT3resfail resfail;
2318 * };
1125 */ 2319 */
1126static int 2320static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
1127nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 2321 struct xdr_stream *xdr,
2322 struct nfs_writeres *result)
1128{ 2323{
1129 int status = ntohl(*p++); 2324 enum nfs_stat status;
2325 int error;
2326
2327 error = decode_nfsstat3(xdr, &status);
2328 if (unlikely(error))
2329 goto out;
2330 error = decode_wcc_data(xdr, result->fattr);
2331 if (unlikely(error))
2332 goto out;
2333 if (status != NFS3_OK)
2334 goto out_status;
2335 error = decode_writeverf3(xdr, result->verf->verifier);
2336out:
2337 return error;
2338out_status:
2339 return nfs_stat_to_errno(status);
2340}
1130 2341
1131 if (status) 2342#ifdef CONFIG_NFS_V3_ACL
1132 return nfs_stat_to_errno(status); 2343
1133 xdr_decode_post_op_attr(p, fattr); 2344static inline int decode_getacl3resok(struct xdr_stream *xdr,
1134 return 0; 2345 struct nfs3_getaclres *result)
2346{
2347 struct posix_acl **acl;
2348 unsigned int *aclcnt;
2349 size_t hdrlen;
2350 int error;
2351
2352 error = decode_post_op_attr(xdr, result->fattr);
2353 if (unlikely(error))
2354 goto out;
2355 error = decode_uint32(xdr, &result->mask);
2356 if (unlikely(error))
2357 goto out;
2358 error = -EINVAL;
2359 if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
2360 goto out;
2361
2362 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
2363
2364 acl = NULL;
2365 if (result->mask & NFS_ACL)
2366 acl = &result->acl_access;
2367 aclcnt = NULL;
2368 if (result->mask & NFS_ACLCNT)
2369 aclcnt = &result->acl_access_count;
2370 error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
2371 if (unlikely(error <= 0))
2372 goto out;
2373
2374 acl = NULL;
2375 if (result->mask & NFS_DFACL)
2376 acl = &result->acl_default;
2377 aclcnt = NULL;
2378 if (result->mask & NFS_DFACLCNT)
2379 aclcnt = &result->acl_default_count;
2380 error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
2381 if (unlikely(error <= 0))
2382 return error;
2383 error = 0;
2384out:
2385 return error;
2386}
2387
2388static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
2389 struct xdr_stream *xdr,
2390 struct nfs3_getaclres *result)
2391{
2392 enum nfs_stat status;
2393 int error;
2394
2395 error = decode_nfsstat3(xdr, &status);
2396 if (unlikely(error))
2397 goto out;
2398 if (status != NFS3_OK)
2399 goto out_default;
2400 error = decode_getacl3resok(xdr, result);
2401out:
2402 return error;
2403out_default:
2404 return nfs_stat_to_errno(status);
1135} 2405}
2406
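
decode_getacl3resok() chains two nfsacl_decode() passes: the first returns the number of bytes the access ACL occupied, and that length becomes the offset of the default ACL. A sketch of the offset chaining; fake_acl_decode() and its fixed lengths are stand-ins for the real nfsacl_decode():

    #include <stdio.h>

    /* Stand-in: pretends an ACL occupies acl_len bytes and returns
     * the length consumed (> 0) on success. */
    static int fake_acl_decode(unsigned int base, unsigned int acl_len)
    {
        printf("decoding ACL at offset %u\n", base);
        return (int)acl_len;
    }

    int main(void)
    {
        unsigned int hdrlen = 84;             /* bytes already consumed */
        int n = fake_acl_decode(hdrlen, 40);  /* access ACL */
        if (n <= 0)
            return 1;
        n = fake_acl_decode(hdrlen + n, 28);  /* default ACL starts right after */
        return n <= 0;
    }
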
2407static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
2408 struct xdr_stream *xdr,
2409 struct nfs_fattr *result)
2410{
2411 enum nfs_stat status;
2412 int error;
2413
2414 error = decode_nfsstat3(xdr, &status);
2415 if (unlikely(error))
2416 goto out;
2417 if (status != NFS3_OK)
2418 goto out_default;
2419 error = decode_post_op_attr(xdr, result);
2420out:
2421 return error;
2422out_default:
2423 return nfs_stat_to_errno(status);
2424}
2425
1136#endif /* CONFIG_NFS_V3_ACL */ 2426#endif /* CONFIG_NFS_V3_ACL */
1137 2427
1138#define PROC(proc, argtype, restype, timer) \ 2428#define PROC(proc, argtype, restype, timer) \
1139[NFS3PROC_##proc] = { \ 2429[NFS3PROC_##proc] = { \
1140 .p_proc = NFS3PROC_##proc, \ 2430 .p_proc = NFS3PROC_##proc, \
1141 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 2431 .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \
1142 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 2432 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \
1143 .p_arglen = NFS3_##argtype##_sz, \ 2433 .p_arglen = NFS3_##argtype##args_sz, \
1144 .p_replen = NFS3_##restype##_sz, \ 2434 .p_replen = NFS3_##restype##res_sz, \
1145 .p_timer = timer, \ 2435 .p_timer = timer, \
1146 .p_statidx = NFS3PROC_##proc, \ 2436 .p_statidx = NFS3PROC_##proc, \
1147 .p_name = #proc, \ 2437 .p_name = #proc, \
1148 } 2438 }
1149 2439
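
The rewritten PROC() macro token-pastes the argtype/restype names into both the handler symbols and the size constants, so PROC(GETATTR, getattr, getattr, 1) now picks up nfs3_xdr_enc_getattr3args and nfs3_xdr_dec_getattr3res. The pasting reduced to a compilable sketch with two stub handlers:

    #include <stdio.h>

    static void enc_getattr3args(void) { puts("encode GETATTR args"); }
    static void dec_getattr3res(void)  { puts("decode GETATTR result"); }

    /* Same ##-pasting idea as the kernel's PROC() macro, reduced to
     * two function pointers. */
    #define PROC(argtype, restype) { enc_##argtype##3args, dec_##restype##3res }

    struct procinfo { void (*enc)(void); void (*dec)(void); };

    int main(void)
    {
        struct procinfo p = PROC(getattr, getattr);
        p.enc();
        p.dec();
        return 0;
    }
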
1150struct rpc_procinfo nfs3_procedures[] = { 2440struct rpc_procinfo nfs3_procedures[] = {
1151 PROC(GETATTR, fhandle, attrstat, 1), 2441 PROC(GETATTR, getattr, getattr, 1),
1152 PROC(SETATTR, sattrargs, wccstat, 0), 2442 PROC(SETATTR, setattr, setattr, 0),
1153 PROC(LOOKUP, diropargs, lookupres, 2), 2443 PROC(LOOKUP, lookup, lookup, 2),
1154 PROC(ACCESS, accessargs, accessres, 1), 2444 PROC(ACCESS, access, access, 1),
1155 PROC(READLINK, readlinkargs, readlinkres, 3), 2445 PROC(READLINK, readlink, readlink, 3),
1156 PROC(READ, readargs, readres, 3), 2446 PROC(READ, read, read, 3),
1157 PROC(WRITE, writeargs, writeres, 4), 2447 PROC(WRITE, write, write, 4),
1158 PROC(CREATE, createargs, createres, 0), 2448 PROC(CREATE, create, create, 0),
1159 PROC(MKDIR, mkdirargs, createres, 0), 2449 PROC(MKDIR, mkdir, create, 0),
1160 PROC(SYMLINK, symlinkargs, createres, 0), 2450 PROC(SYMLINK, symlink, create, 0),
1161 PROC(MKNOD, mknodargs, createres, 0), 2451 PROC(MKNOD, mknod, create, 0),
1162 PROC(REMOVE, removeargs, removeres, 0), 2452 PROC(REMOVE, remove, remove, 0),
1163 PROC(RMDIR, diropargs, wccstat, 0), 2453 PROC(RMDIR, lookup, setattr, 0),
1164 PROC(RENAME, renameargs, renameres, 0), 2454 PROC(RENAME, rename, rename, 0),
1165 PROC(LINK, linkargs, linkres, 0), 2455 PROC(LINK, link, link, 0),
1166 PROC(READDIR, readdirargs, readdirres, 3), 2456 PROC(READDIR, readdir, readdir, 3),
1167 PROC(READDIRPLUS, readdirargs, readdirres, 3), 2457 PROC(READDIRPLUS, readdirplus, readdir, 3),
1168 PROC(FSSTAT, fhandle, fsstatres, 0), 2458 PROC(FSSTAT, getattr, fsstat, 0),
1169 PROC(FSINFO, fhandle, fsinfores, 0), 2459 PROC(FSINFO, getattr, fsinfo, 0),
1170 PROC(PATHCONF, fhandle, pathconfres, 0), 2460 PROC(PATHCONF, getattr, pathconf, 0),
1171 PROC(COMMIT, commitargs, commitres, 5), 2461 PROC(COMMIT, commit, commit, 5),
1172}; 2462};
1173 2463
1174struct rpc_version nfs_version3 = { 2464struct rpc_version nfs_version3 = {
@@ -1181,8 +2471,8 @@ struct rpc_version nfs_version3 = {
1181static struct rpc_procinfo nfs3_acl_procedures[] = { 2471static struct rpc_procinfo nfs3_acl_procedures[] = {
1182 [ACLPROC3_GETACL] = { 2472 [ACLPROC3_GETACL] = {
1183 .p_proc = ACLPROC3_GETACL, 2473 .p_proc = ACLPROC3_GETACL,
1184 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 2474 .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
1185 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 2475 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
1186 .p_arglen = ACL3_getaclargs_sz, 2476 .p_arglen = ACL3_getaclargs_sz,
1187 .p_replen = ACL3_getaclres_sz, 2477 .p_replen = ACL3_getaclres_sz,
1188 .p_timer = 1, 2478 .p_timer = 1,
@@ -1190,8 +2480,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1190 }, 2480 },
1191 [ACLPROC3_SETACL] = { 2481 [ACLPROC3_SETACL] = {
1192 .p_proc = ACLPROC3_SETACL, 2482 .p_proc = ACLPROC3_SETACL,
1193 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 2483 .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
1194 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 2484 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
1195 .p_arglen = ACL3_setaclargs_sz, 2485 .p_arglen = ACL3_setaclargs_sz,
1196 .p_replen = ACL3_setaclres_sz, 2486 .p_replen = ACL3_setaclres_sz,
1197 .p_timer = 0, 2487 .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..7a7474073148 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_LAYOUTRECALL,
47 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
49}; 50};
@@ -109,7 +110,7 @@ struct nfs_unique_id {
109struct nfs4_state_owner { 110struct nfs4_state_owner {
110 struct nfs_unique_id so_owner_id; 111 struct nfs_unique_id so_owner_id;
111 struct nfs_server *so_server; 112 struct nfs_server *so_server;
112 struct rb_node so_client_node; 113 struct rb_node so_server_node;
113 114
114 struct rpc_cred *so_cred; /* Associated cred */ 115 struct rpc_cred *so_cred; /* Associated cred */
115 116
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
227extern const struct dentry_operations nfs4_dentry_operations; 228extern const struct dentry_operations nfs4_dentry_operations;
228extern const struct inode_operations nfs4_dir_inode_operations; 229extern const struct inode_operations nfs4_dir_inode_operations;
229 230
230/* inode.c */
231extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
232extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
233extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
234
235
236/* nfs4proc.c */ 231/* nfs4proc.c */
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,13 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 236extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 237extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 238extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 239extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
245extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
246extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 240extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 241extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
249 struct nfs4_fs_locations *fs_locations, struct page *page); 242 struct nfs4_fs_locations *fs_locations, struct page *page);
250extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 243extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
244extern const struct xattr_handler *nfs4_xattr_handlers[];
251 245
252#if defined(CONFIG_NFS_V4_1) 246#if defined(CONFIG_NFS_V4_1)
253static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 247static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -333,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
333extern const nfs4_stateid zero_stateid; 327extern const nfs4_stateid zero_stateid;
334 328
335/* nfs4xdr.c */ 329/* nfs4xdr.c */
336extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
337extern struct rpc_procinfo nfs4_procedures[]; 330extern struct rpc_procinfo nfs4_procedures[];
338 331
339struct nfs4_mount_data; 332struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..23f930caf1e2
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
1/*
2 * Module for the pnfs nfs4 file layout driver.
3 * Defines all I/O and Policy interface operations, plus code
4 * to register itself with the pNFS client.
5 *
6 * Copyright (c) 2002
7 * The Regents of the University of Michigan
8 * All Rights Reserved
9 *
10 * Dean Hildebrand <dhildebz@umich.edu>
11 *
12 * Permission is granted to use, copy, create derivative works, and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the University of Michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. If
17 * the above copyright notice or any other identification of the
18 * University of Michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * This software is provided as is, without representation or warranty
22 * of any kind either express or implied, including without limitation
23 * the implied warranties of merchantability, fitness for a particular
24 * purpose, or noninfringement. The Regents of the University of
25 * Michigan shall not be liable for any damages, including special,
26 * indirect, incidental, or consequential damages, with respect to any
27 * claim arising out of or in connection with the use of the software,
28 * even if it has been or is hereafter advised of the possibility of
29 * such damages.
30 */
31
32#include <linux/nfs_fs.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42
43static int
44filelayout_set_layoutdriver(struct nfs_server *nfss)
45{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
47 nfs4_fl_free_deviceid_callback);
48 if (status) {
49 printk(KERN_WARNING "%s: deviceid cache could not be "
50 "initialized\n", __func__);
51 return status;
52 }
53 dprintk("%s: deviceid cache has been initialized successfully\n",
54 __func__);
55 return 0;
56}
57
58/* Clear out the layout by destroying its device list */
59static int
60filelayout_clear_layoutdriver(struct nfs_server *nfss)
61{
62 dprintk("--> %s\n", __func__);
63
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0;
67}
68
69/*
70 * filelayout_check_layout()
71 *
72 * Make sure layout segment parameters are sane WRT the device.
73 * At this point no generic layer initialization of the lseg has occurred,
74 * and nothing has been added to the layout_hdr cache.
75 *
76 */
77static int
78filelayout_check_layout(struct pnfs_layout_hdr *lo,
79 struct nfs4_filelayout_segment *fl,
80 struct nfs4_layoutget_res *lgr,
81 struct nfs4_deviceid *id)
82{
83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
86
87 dprintk("--> %s\n", __func__);
88
89 if (fl->pattern_offset > lgr->range.offset) {
90 dprintk("%s pattern_offset %lld to large\n",
91 __func__, fl->pattern_offset);
92 goto out;
93 }
94
95 if (fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n",
97 __func__, fl->stripe_unit);
98 goto out;
99 }
100
101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL)
106 goto out;
107 }
108 fl->dsaddr = dsaddr;
109
110 if (fl->first_stripe_index < 0 ||
111 fl->first_stripe_index >= dsaddr->stripe_count) {
112 dprintk("%s Bad first_stripe_index %d\n",
113 __func__, fl->first_stripe_index);
114 goto out_put;
115 }
116
117 if ((fl->stripe_type == STRIPE_SPARSE &&
118 fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
119 (fl->stripe_type == STRIPE_DENSE &&
120 fl->num_fh != dsaddr->stripe_count)) {
121 dprintk("%s num_fh %u not valid for given packing\n",
122 __func__, fl->num_fh);
123 goto out_put;
124 }
125
126 if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
127 dprintk("%s Stripe unit (%u) not aligned with rsize %u "
128 "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
129 nfss->wsize);
130 }
131
132 status = 0;
133out:
134 dprintk("--> %s returns %d\n", __func__, status);
135 return status;
136out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
138 goto out;
139}
140
141static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
142{
143 int i;
144
145 for (i = 0; i < fl->num_fh; i++) {
146 if (!fl->fh_array[i])
147 break;
148 kfree(fl->fh_array[i]);
149 }
150 kfree(fl->fh_array);
151 fl->fh_array = NULL;
152}
153
154static void
155_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
156{
157 filelayout_free_fh_array(fl);
158 kfree(fl);
159}
160
161static int
162filelayout_decode_layout(struct pnfs_layout_hdr *flo,
163 struct nfs4_filelayout_segment *fl,
164 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id)
166{
167 uint32_t *p = (uint32_t *)lgr->layout.buf;
168 uint32_t nfl_util;
169 int i;
170
171 dprintk("%s: set_layout_map Begin\n", __func__);
172
173 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id);
176
177 nfl_util = be32_to_cpup(p++);
178 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
179 fl->commit_through_mds = 1;
180 if (nfl_util & NFL4_UFLG_DENSE)
181 fl->stripe_type = STRIPE_DENSE;
182 else
183 fl->stripe_type = STRIPE_SPARSE;
184 fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
185
186 fl->first_stripe_index = be32_to_cpup(p++);
187 p = xdr_decode_hyper(p, &fl->pattern_offset);
188 fl->num_fh = be32_to_cpup(p++);
189
190 dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset);
193
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL);
196 if (!fl->fh_array)
197 return -ENOMEM;
198
199 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) {
203 filelayout_free_fh_array(fl);
204 return -ENOMEM;
205 }
206 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
208 printk(KERN_ERR "Too big fh %d received %d\n",
209 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl);
211 return -EIO;
212 }
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size);
217 }
218
219 return 0;
220}
221
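
filelayout_decode_layout() above unpacks three fields from the single nfl_util word: the commit-through-MDS and dense/sparse flag bits, plus the stripe unit held in the bits above NFL4_UFLG_MASK. A sketch of the unpacking; the flag values follow RFC 5661's layout utilization word:

    #include <stdio.h>
    #include <stdint.h>

    #define NFL4_UFLG_MASK            0x0000003F
    #define NFL4_UFLG_DENSE           0x00000001
    #define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002

    int main(void)
    {
        uint32_t nfl_util = 0x00010002;  /* 64 KB stripe unit, commit-through-MDS */

        printf("commit_through_mds=%d dense=%d stripe_unit=%u\n",
               !!(nfl_util & NFL4_UFLG_COMMIT_THRU_MDS),
               !!(nfl_util & NFL4_UFLG_DENSE),
               nfl_util & ~NFL4_UFLG_MASK);
        return 0;
    }
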
222static struct pnfs_layout_segment *
223filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
224 struct nfs4_layoutget_res *lgr)
225{
226 struct nfs4_filelayout_segment *fl;
227 int rc;
228 struct nfs4_deviceid id;
229
230 dprintk("--> %s\n", __func__);
231 fl = kzalloc(sizeof(*fl), GFP_KERNEL);
232 if (!fl)
233 return NULL;
234
235 rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
236 if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
237 _filelayout_free_lseg(fl);
238 return NULL;
239 }
240 return &fl->generic_hdr;
241}
242
243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248
249 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl);
253}
254
255static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver,
260 .clear_layoutdriver = filelayout_clear_layoutdriver,
261 .alloc_lseg = filelayout_alloc_lseg,
262 .free_lseg = filelayout_free_lseg,
263};
264
265static int __init nfs4filelayout_init(void)
266{
267 printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
268 __func__);
269 return pnfs_register_layoutdriver(&filelayout_type);
270}
271
272static void __exit nfs4filelayout_exit(void)
273{
274 printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
275 __func__);
276 pnfs_unregister_layoutdriver(&filelayout_type);
277}
278
279module_init(nfs4filelayout_init);
280module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
1/*
2 * NFSv4 file layout driver data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_NFS4FILELAYOUT_H
31#define FS_NFS_NFS4FILELAYOUT_H
32
33#include "pnfs.h"
34
35/*
 36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures.
40 */
41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
43
44enum stripetype4 {
45 STRIPE_SPARSE = 1,
46 STRIPE_DENSE = 2
47};
48
49/* Individual ip address */
50struct nfs4_pnfs_ds {
51 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
52 u32 ds_ip_addr;
53 u32 ds_port;
54 struct nfs_client *ds_clp;
55 atomic_t ds_count;
56};
57
58struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid;
60 u32 stripe_count;
61 u8 *stripe_indices;
62 u32 ds_num;
63 struct nfs4_pnfs_ds *ds_list[1];
64};
65
66struct nfs4_filelayout_segment {
67 struct pnfs_layout_segment generic_hdr;
68 u32 stripe_type;
69 u32 commit_through_mds;
70 u32 stripe_unit;
71 u32 first_stripe_index;
72 u64 pattern_offset;
73 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
74 unsigned int num_fh;
75 struct nfs_fh **fh_array;
76};
77
78static inline struct nfs4_filelayout_segment *
79FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_filelayout_segment,
83 generic_hdr);
84}
85
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
87extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id);
89extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
91struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93
94#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..f5c9b125e8cc
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,453 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39/*
40 * Data server cache
41 *
42 * Data servers can be mapped to different device ids.
43 * nfs4_pnfs_ds reference counting
44 * - set to 1 on allocation
45 * - incremented when a device id maps a data server already in the cache.
46 * - decremented when deviceid is removed from the cache.
47 */
48DEFINE_SPINLOCK(nfs4_ds_cache_lock);
49static LIST_HEAD(nfs4_data_server_cache);
50
51/* Debug routines */
52void
53print_ds(struct nfs4_pnfs_ds *ds)
54{
55 if (ds == NULL) {
56 printk("%s NULL device\n", __func__);
57 return;
58 }
59 printk(" ip_addr %x port %hu\n"
60 " ref count %d\n"
61 " client %p\n"
62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66}
67
68void
69print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
70{
71 int i;
72
73 ifdebug(FACILITY) {
74 printk("%s dsaddr->ds_num %d\n", __func__,
75 dsaddr->ds_num);
76 for (i = 0; i < dsaddr->ds_num; i++)
77 print_ds(dsaddr->ds_list[i]);
78 }
79}
80
81void print_deviceid(struct nfs4_deviceid *id)
82{
83 u32 *p = (u32 *)id;
84
85 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
86 p[0], p[1], p[2], p[3]);
87}
88
89/* nfs4_ds_cache_lock is held */
90static struct nfs4_pnfs_ds *
91_data_server_lookup_locked(u32 ip_addr, u32 port)
92{
93 struct nfs4_pnfs_ds *ds;
94
95 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
96 ntohl(ip_addr), ntohs(port));
97
98 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
99 if (ds->ds_ip_addr == ip_addr &&
100 ds->ds_port == port) {
101 return ds;
102 }
103 }
104 return NULL;
105}
106
107static void
108destroy_ds(struct nfs4_pnfs_ds *ds)
109{
110 dprintk("--> %s\n", __func__);
111 ifdebug(FACILITY)
112 print_ds(ds);
113
114 if (ds->ds_clp)
115 nfs_put_client(ds->ds_clp);
116 kfree(ds);
117}
118
119static void
120nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
121{
122 struct nfs4_pnfs_ds *ds;
123 int i;
124
125 print_deviceid(&dsaddr->deviceid.de_id);
126
127 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i];
129 if (ds != NULL) {
130 if (atomic_dec_and_lock(&ds->ds_count,
131 &nfs4_ds_cache_lock)) {
132 list_del_init(&ds->ds_node);
133 spin_unlock(&nfs4_ds_cache_lock);
134 destroy_ds(ds);
135 }
136 }
137 }
138 kfree(dsaddr->stripe_indices);
139 kfree(dsaddr);
140}
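
The atomic_dec_and_lock() call above drops one reference and returns with the cache lock held only when the count reaches zero, so the entry can be unlinked and freed without racing a concurrent lookup. A userspace stand-in for that contract (pthread/stdatomic sketch, not a kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* fast path: the count stays above zero, no lock needed */
	while (old > 1)
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return false;

	/* slow path: we may hit zero, so decide while holding the lock */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return true;            /* reached zero: lock stays held */
	pthread_mutex_unlock(lock);
	return false;
}
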
141
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{
154 struct nfs4_pnfs_ds *tmp_ds, *ds;
155
156 ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
157 if (!ds)
158 goto out;
159
160 spin_lock(&nfs4_ds_cache_lock);
161 tmp_ds = _data_server_lookup_locked(ip_addr, port);
162 if (tmp_ds == NULL) {
163 ds->ds_ip_addr = ip_addr;
164 ds->ds_port = port;
165 atomic_set(&ds->ds_count, 1);
166 INIT_LIST_HEAD(&ds->ds_node);
167 ds->ds_clp = NULL;
168 list_add(&ds->ds_node, &nfs4_data_server_cache);
169 dprintk("%s add new data server ip 0x%x\n", __func__,
170 ds->ds_ip_addr);
171 } else {
172 kfree(ds);
173 atomic_inc(&tmp_ds->ds_count);
174 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
175 __func__, tmp_ds->ds_ip_addr,
176 atomic_read(&tmp_ds->ds_count));
177 ds = tmp_ds;
178 }
179 spin_unlock(&nfs4_ds_cache_lock);
180out:
181 return ds;
182}
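
nfs4_pnfs_ds_add() allocates its candidate entry before taking nfs4_ds_cache_lock because kzalloc(GFP_KERNEL) may sleep while a spinlock is held only atomically. A userspace sketch of the same allocate-then-check-under-the-lock shape, with made-up names:

#include <pthread.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	unsigned int key;
	unsigned int refcount;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache_head;

static struct entry *cache_add(unsigned int key)
{
	struct entry *e, *cand = calloc(1, sizeof(*cand)); /* before the lock */

	if (!cand)
		return NULL;
	pthread_mutex_lock(&cache_lock);
	for (e = cache_head; e; e = e->next)
		if (e->key == key)
			break;
	if (e) {
		e->refcount++;          /* hit: share the cached entry */
		free(cand);             /* our candidate lost the race */
	} else {
		cand->key = key;
		cand->refcount = 1;
		cand->next = cache_head;
		cache_head = cand;
		e = cand;
	}
	pthread_mutex_unlock(&cache_lock);
	return e;
}
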
183
184/*
185 * Currently we support only IPv4 and a single multipath address.
186 */
187static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode)
189{
190 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf;
192 const char *ipend, *pstr;
193 u32 ip_addr, port;
194 int nlen, rlen, i;
195 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp;
197
198 /* r_netid */
199 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202
203 /* r_addr */
204 rlen = be32_to_cpup(p++);
205 r_addr = p;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208
209 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err;
213 }
214
215 /* allow up to an ipv6 address length plus the port suffix */
216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s: Invalid address, length %d\n", __func__,
218 rlen);
219 goto out_err;
220 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL);
    if (!buf)	/* kmalloc can fail; don't dereference NULL */
        goto out_err;
222 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen);
224
225 /* replace the port dots with dashes for the in4_pton() delimiter */
226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.');
228 if (!res) {
229 dprintk("%s: Failed finding expected dots in port\n",
230 __func__);
231 goto out_free;
232 }
233 *res = '-';
234 }
235
236 /* Currently only support ipv4 address */
237 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
238 dprintk("%s: Only ipv4 addresses supported\n", __func__);
239 goto out_free;
240 }
241
242 /* port */
243 pstr = ipend;
244 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
245 port = htons((tmp[0] << 8) | (tmp[1]));
246
247 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
248 dprintk("%s: Decoded address and port %s\n", __func__, buf);
249out_free:
250 kfree(buf);
251out_err:
252 return ds;
253}
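
For reference, the r_addr string decoded above follows the RFC 5661 universal-address convention "h1.h2.h3.h4.p1.p2", where the port is p1 * 256 + p2; the code turns the last two dots into dashes only so in4_pton() stops at the end of the IPv4 part. A self-contained userspace sketch of the same parse (parse_uaddr is my name, not a kernel function):

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static int parse_uaddr(const char *uaddr, struct in_addr *ip,
		       unsigned short *port)
{
	char buf[64];
	char *dot;
	unsigned int p1, p2;

	if (strlen(uaddr) >= sizeof(buf))
		return -1;
	strcpy(buf, uaddr);

	/* peel off ".p2" then ".p1" from the tail */
	dot = strrchr(buf, '.');
	if (!dot || sscanf(dot, ".%u", &p2) != 1 || p2 > 255)
		return -1;
	*dot = '\0';
	dot = strrchr(buf, '.');
	if (!dot || sscanf(dot, ".%u", &p1) != 1 || p1 > 255)
		return -1;
	*dot = '\0';

	if (inet_pton(AF_INET, buf, ip) != 1)
		return -1;
	*port = (unsigned short)((p1 << 8) | p2);   /* host byte order */
	return 0;
}

/* parse_uaddr("192.168.1.10.8.1", &ip, &port) yields port 2049 (NFS) */
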
254
255/* Decode opaque device data and return the result */
256static struct nfs4_file_layout_dsaddr*
257decode_device(struct inode *ino, struct pnfs_device *pdev)
258{
259 int i, dummy;
260 u32 cnt, num;
261 u8 *indexp;
262 __be32 *p = (__be32 *)pdev->area, *indicesp;
263 struct nfs4_file_layout_dsaddr *dsaddr;
264
265 /* Get the stripe count (number of stripe index) */
266 cnt = be32_to_cpup(p++);
267 dprintk("%s stripe count %d\n", __func__, cnt);
268 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
269 printk(KERN_WARNING "%s: stripe count %d greater than "
270 "supported maximum %d\n", __func__,
271 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
272 goto out_err;
273 }
274
275 /* Check the multipath list count */
276 indicesp = p;
277 p += XDR_QUADLEN(cnt << 2);
278 num = be32_to_cpup(p++);
279 dprintk("%s ds_num %u\n", __func__, num);
280 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
281 printk(KERN_WARNING "%s: multipath count %d greater than "
282 "supported maximum %d\n", __func__,
283 num, NFS4_PNFS_MAX_MULTI_CNT);
284 goto out_err;
285 }
286 dsaddr = kzalloc(sizeof(*dsaddr) +
287 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
288 GFP_KERNEL);
289 if (!dsaddr)
290 goto out_err;
291
292 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
293 if (!dsaddr->stripe_indices)
294 goto out_err_free;
295
296 dsaddr->stripe_count = cnt;
297 dsaddr->ds_num = num;
298
299 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
300
301 /* Go back and read stripe indices */
302 p = indicesp;
303 indexp = &dsaddr->stripe_indices[0];
304 for (i = 0; i < dsaddr->stripe_count; i++) {
305 *indexp = be32_to_cpup(p++);
306 if (*indexp >= num)
307 goto out_err_free;
308 indexp++;
309 }
310 /* Skip already read multipath list count */
311 p++;
312
313 for (i = 0; i < dsaddr->ds_num; i++) {
314 int j;
315
316 dummy = be32_to_cpup(p++); /* multipath count */
317 if (dummy > 1) {
318 printk(KERN_WARNING
319 "%s: Multipath count %d not supported, "
320 "skipping all greater than 1\n", __func__,
321 dummy);
322 }
323 for (j = 0; j < dummy; j++) {
324 if (j == 0) {
325 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
326 if (dsaddr->ds_list[i] == NULL)
327 goto out_err_free;
328 } else {
329 u32 len;
330 /* skip extra multipath */
331 len = be32_to_cpup(p++);
332 p += XDR_QUADLEN(len);
333 len = be32_to_cpup(p++);
334 p += XDR_QUADLEN(len);
335 continue;
336 }
337 }
338 }
339 return dsaddr;
340
341out_err_free:
342 nfs4_fl_free_deviceid(dsaddr);
343out_err:
344 dprintk("%s ERROR: returning NULL\n", __func__);
345 return NULL;
346}
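
My reading of the opaque GETDEVICEINFO blob that decode_device() walks, plus the XDR padding rule behind the XDR_QUADLEN() skips (a sketch, not the authoritative XDR definition):

/*
 * u32 stripe_count
 * u32 stripe_indices[stripe_count]       each value must be < ds_num
 * u32 ds_num
 * repeated ds_num times:
 *     u32 multipath_count
 *     repeated multipath_count times:
 *         opaque r_netid<>, opaque r_addr<>   (XDR counted strings)
 *
 * XDR pads opaque data to a 4-byte boundary, so skipping n bytes means
 * advancing (n + 3) / 4 32-bit words -- what XDR_QUADLEN() computes:
 */
#include <stdint.h>

static inline uint32_t xdr_quadlen(uint32_t nbytes)
{
	return (nbytes + 3) >> 2;  /* xdr_quadlen(3) == 1, xdr_quadlen(5) == 2 */
}
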
347
348/*
349 * Decode the opaque device specified in 'dev'
350 * and add it to the list of available devices.
351 * If the deviceid is already cached, pnfs_add_deviceid will return
352 * a pointer to the cached struct and throw away the new one.
353 */
354static struct nfs4_file_layout_dsaddr*
355decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
356{
357 struct nfs4_file_layout_dsaddr *dsaddr;
358 struct pnfs_deviceid_node *d;
359
360 dsaddr = decode_device(inode, dev);
361 if (!dsaddr) {
362 printk(KERN_WARNING "%s: Could not decode or add device\n",
363 __func__);
364 return NULL;
365 }
366
367 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
368 &dsaddr->deviceid);
369
370 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
371}
372
373/*
374 * Retrieve the information for dev_id, add it to the list
375 * of available devices, and return it.
376 */
377struct nfs4_file_layout_dsaddr *
378get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
379{
380 struct pnfs_device *pdev = NULL;
381 u32 max_resp_sz;
382 int max_pages;
383 struct page **pages = NULL;
384 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
385 int rc, i;
386 struct nfs_server *server = NFS_SERVER(inode);
387
388 /*
389 * Use the session max response size as the basis for setting
390 * GETDEVICEINFO's maxcount
391 */
392 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
393 max_pages = max_resp_sz >> PAGE_SHIFT;
394 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
395 __func__, inode, max_resp_sz, max_pages);
396
397 pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
398 if (pdev == NULL)
399 return NULL;
400
401 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
402 if (pages == NULL) {
403 kfree(pdev);
404 return NULL;
405 }
406 for (i = 0; i < max_pages; i++) {
407 pages[i] = alloc_page(GFP_KERNEL);
408 if (!pages[i])
409 goto out_free;
410 }
411
412 /* set pdev->area */
413 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
414 if (!pdev->area)
415 goto out_free;
416
417 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
418 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
419 pdev->pages = pages;
420 pdev->pgbase = 0;
421 pdev->pglen = PAGE_SIZE * max_pages;
422 pdev->mincount = 0;
423
424 rc = nfs4_proc_getdeviceinfo(server, pdev);
425 dprintk("%s getdeviceinfo returns %d\n", __func__, rc);
426 if (rc)
427 goto out_free;
428
429 /*
430 * Found new device, need to decode it and then add it to the
431 * list of known devices for this mountpoint.
432 */
433 dsaddr = decode_and_add_device(inode, pdev);
434out_free:
435 if (pdev->area != NULL)
436 vunmap(pdev->area);
437 for (i = 0; i < max_pages; i++)
    if (pages[i])	/* alloc_page() may have failed partway through */
438 __free_page(pages[i]);
439 kfree(pages);
440 kfree(pdev);
441 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
442 return dsaddr;
443}
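
The sizing logic at the top of get_device_info() just rounds the session's maximum response size down to whole pages. A worked example with assumed numbers (4 KiB pages and a made-up max_resp_sz):

#include <stdio.h>

int main(void)
{
	unsigned int max_resp_sz = 1049620; /* assumed CREATE_SESSION value */
	unsigned int page_shift = 12;       /* assumed 4 KiB pages */
	unsigned int max_pages = max_resp_sz >> page_shift;

	/* prints: max_pages=256 pglen=1048576 */
	printf("max_pages=%u pglen=%u\n", max_pages, max_pages << page_shift);
	return 0;
}
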
444
445struct nfs4_file_layout_dsaddr *
446nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
447{
448 struct pnfs_deviceid_node *d;
449
450 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
451 return (d == NULL) ? NULL :
452 container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
453}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..78936a8f40ab 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,12 +49,15 @@
49#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h>
53#include <linux/utsname.h>
52 54
53#include "nfs4_fs.h" 55#include "nfs4_fs.h"
54#include "delegation.h" 56#include "delegation.h"
55#include "internal.h" 57#include "internal.h"
56#include "iostat.h" 58#include "iostat.h"
57#include "callback.h" 59#include "callback.h"
60#include "pnfs.h"
58 61
59#define NFSDBG_FACILITY NFSDBG_PROC 62#define NFSDBG_FACILITY NFSDBG_PROC
60 63
@@ -129,7 +132,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
129 | FATTR4_WORD0_MAXREAD 132 | FATTR4_WORD0_MAXREAD
130 | FATTR4_WORD0_MAXWRITE 133 | FATTR4_WORD0_MAXWRITE
131 | FATTR4_WORD0_LEASE_TIME, 134 | FATTR4_WORD0_LEASE_TIME,
132 0 135 FATTR4_WORD1_TIME_DELTA
136 | FATTR4_WORD1_FS_LAYOUT_TYPES
133}; 137};
134 138
135const u32 nfs4_fs_locations_bitmap[2] = { 139const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +259,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 nfs4_state_mark_reclaim_nograce(clp, state); 259 nfs4_state_mark_reclaim_nograce(clp, state);
256 goto do_state_recovery; 260 goto do_state_recovery;
257 case -NFS4ERR_STALE_STATEID: 261 case -NFS4ERR_STALE_STATEID:
258 if (state == NULL)
259 break;
260 nfs4_state_mark_reclaim_reboot(clp, state);
261 case -NFS4ERR_STALE_CLIENTID: 262 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 263 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 264 goto do_state_recovery;
@@ -334,10 +335,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
334 * Must be called while holding tbl->slot_tbl_lock 335 * Must be called while holding tbl->slot_tbl_lock
335 */ 336 */
336static void 337static void
337nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) 338nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
338{ 339{
340 int free_slotid = free_slot - tbl->slots;
339 int slotid = free_slotid; 341 int slotid = free_slotid;
340 342
343 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
341 /* clear used bit in bitmap */ 344 /* clear used bit in bitmap */
342 __clear_bit(slotid, tbl->used_slots); 345 __clear_bit(slotid, tbl->used_slots);
343 346
@@ -354,9 +357,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
354} 357}
355 358
356/* 359/*
357 * Signal state manager thread if session is drained 360 * Signal state manager thread if session fore channel is drained
358 */ 361 */
359static void nfs41_check_drain_session_complete(struct nfs4_session *ses) 362static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
360{ 363{
361 struct rpc_task *task; 364 struct rpc_task *task;
362 365
@@ -370,8 +373,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
370 if (ses->fc_slot_table.highest_used_slotid != -1) 373 if (ses->fc_slot_table.highest_used_slotid != -1)
371 return; 374 return;
372 375
373 dprintk("%s COMPLETE: Session Drained\n", __func__); 376 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
374 complete(&ses->complete); 377 complete(&ses->fc_slot_table.complete);
378}
379
380/*
381 * Signal state manager thread if session back channel is drained
382 */
383void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
384{
385 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
386 ses->bc_slot_table.highest_used_slotid != -1)
387 return;
388 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
389 complete(&ses->bc_slot_table.complete);
375} 390}
376 391
377static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 392static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -379,7 +394,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
379 struct nfs4_slot_table *tbl; 394 struct nfs4_slot_table *tbl;
380 395
381 tbl = &res->sr_session->fc_slot_table; 396 tbl = &res->sr_session->fc_slot_table;
382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 397 if (!res->sr_slot) {
383 /* just wake up the next guy waiting since 398 /* just wake up the next guy waiting since
384 * we may have not consumed a slot after all */ 399 * we may have not consumed a slot after all */
385 dprintk("%s: No slot\n", __func__); 400 dprintk("%s: No slot\n", __func__);
@@ -387,17 +402,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
387 } 402 }
388 403
389 spin_lock(&tbl->slot_tbl_lock); 404 spin_lock(&tbl->slot_tbl_lock);
390 nfs4_free_slot(tbl, res->sr_slotid); 405 nfs4_free_slot(tbl, res->sr_slot);
391 nfs41_check_drain_session_complete(res->sr_session); 406 nfs4_check_drain_fc_complete(res->sr_session);
392 spin_unlock(&tbl->slot_tbl_lock); 407 spin_unlock(&tbl->slot_tbl_lock);
393 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 408 res->sr_slot = NULL;
394} 409}
395 410
396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 411static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
397{ 412{
398 unsigned long timestamp; 413 unsigned long timestamp;
399 struct nfs4_slot_table *tbl;
400 struct nfs4_slot *slot;
401 struct nfs_client *clp; 414 struct nfs_client *clp;
402 415
403 /* 416 /*
@@ -410,17 +423,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
410 res->sr_status = NFS_OK; 423 res->sr_status = NFS_OK;
411 424
412 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ 425 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 426 if (!res->sr_slot)
414 goto out; 427 goto out;
415 428
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
419 /* Check the SEQUENCE operation status */ 429 /* Check the SEQUENCE operation status */
420 switch (res->sr_status) { 430 switch (res->sr_status) {
421 case 0: 431 case 0:
422 /* Update the slot's sequence and clientid lease timer */ 432 /* Update the slot's sequence and clientid lease timer */
423 ++slot->seq_nr; 433 ++res->sr_slot->seq_nr;
424 timestamp = res->sr_renewal_time; 434 timestamp = res->sr_renewal_time;
425 clp = res->sr_session->clp; 435 clp = res->sr_session->clp;
426 do_renew_lease(clp, timestamp); 436 do_renew_lease(clp, timestamp);
@@ -433,12 +443,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2 443 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661. 444 * of RFC5661.
435 */ 445 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n", 446 dprintk("%s: slot=%td seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr); 447 __func__,
448 res->sr_slot - res->sr_session->fc_slot_table.slots,
449 res->sr_slot->seq_nr);
438 goto out_retry; 450 goto out_retry;
439 default: 451 default:
440 /* Just update the slot sequence no. */ 452 /* Just update the slot sequence no. */
441 ++slot->seq_nr; 453 ++res->sr_slot->seq_nr;
442 } 454 }
443out: 455out:
444 /* The session may be reset by one of the error handlers. */ 456 /* The session may be reset by one of the error handlers. */
@@ -505,10 +517,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
505 517
506 dprintk("--> %s\n", __func__); 518 dprintk("--> %s\n", __func__);
507 /* slot already allocated? */ 519 /* slot already allocated? */
508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 520 if (res->sr_slot != NULL)
509 return 0; 521 return 0;
510 522
511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
512 tbl = &session->fc_slot_table; 523 tbl = &session->fc_slot_table;
513 524
514 spin_lock(&tbl->slot_tbl_lock); 525 spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +561,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
550 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); 561 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
551 562
552 res->sr_session = session; 563 res->sr_session = session;
553 res->sr_slotid = slotid; 564 res->sr_slot = slot;
554 res->sr_renewal_time = jiffies; 565 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0; 566 res->sr_status_flags = 0;
556 /* 567 /*
@@ -576,8 +587,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
576 goto out; 587 goto out;
577 } 588 }
578 589
579 dprintk("--> %s clp %p session %p sr_slotid %d\n", 590 dprintk("--> %s clp %p session %p sr_slot %td\n",
580 __func__, session->clp, session, res->sr_slotid); 591 __func__, session->clp, session, res->sr_slot ?
592 res->sr_slot - session->fc_slot_table.slots : -1);
581 593
582 ret = nfs41_setup_sequence(session, args, res, cache_reply, 594 ret = nfs41_setup_sequence(session, args, res, cache_reply,
583 task); 595 task);
@@ -650,7 +662,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
650 .callback_data = &data 662 .callback_data = &data
651 }; 663 };
652 664
653 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 665 res->sr_slot = NULL;
654 if (privileged) 666 if (privileged)
655 task_setup.callback_ops = &nfs41_call_priv_sync_ops; 667 task_setup.callback_ops = &nfs41_call_priv_sync_ops;
656 task = rpc_run_task(&task_setup); 668 task = rpc_run_task(&task_setup);
@@ -735,7 +747,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
735 p->o_res.server = p->o_arg.server; 747 p->o_res.server = p->o_arg.server;
736 nfs_fattr_init(&p->f_attr); 748 nfs_fattr_init(&p->f_attr);
737 nfs_fattr_init(&p->dir_attr); 749 nfs_fattr_init(&p->dir_attr);
738 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
739} 750}
740 751
741static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 752static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1120,6 +1131,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1120 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1131 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1121 smp_rmb(); 1132 smp_rmb();
1122 if (state->n_rdwr != 0) { 1133 if (state->n_rdwr != 0) {
1134 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1123 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1135 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1124 if (ret != 0) 1136 if (ret != 0)
1125 return ret; 1137 return ret;
@@ -1127,6 +1139,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1127 return -ESTALE; 1139 return -ESTALE;
1128 } 1140 }
1129 if (state->n_wronly != 0) { 1141 if (state->n_wronly != 0) {
1142 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1130 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1143 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1131 if (ret != 0) 1144 if (ret != 0)
1132 return ret; 1145 return ret;
@@ -1134,6 +1147,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1134 return -ESTALE; 1147 return -ESTALE;
1135 } 1148 }
1136 if (state->n_rdonly != 0) { 1149 if (state->n_rdonly != 0) {
1150 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1151 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1138 if (ret != 0) 1152 if (ret != 0)
1139 return ret; 1153 return ret;
@@ -1188,7 +1202,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1188 int err; 1202 int err;
1189 do { 1203 do {
1190 err = _nfs4_do_open_reclaim(ctx, state); 1204 err = _nfs4_do_open_reclaim(ctx, state);
1191 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 1205 if (err != -NFS4ERR_DELAY)
1192 break; 1206 break;
1193 nfs4_handle_exception(server, err, &exception); 1207 nfs4_handle_exception(server, err, &exception);
1194 } while (exception.retry); 1208 } while (exception.retry);
@@ -1258,6 +1272,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1258 case -NFS4ERR_ADMIN_REVOKED: 1272 case -NFS4ERR_ADMIN_REVOKED:
1259 case -NFS4ERR_BAD_STATEID: 1273 case -NFS4ERR_BAD_STATEID:
1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1274 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1275 case -EKEYEXPIRED:
1276 /*
1277 * User RPCSEC_GSS context has expired.
1278 * We cannot recover this stateid now, so
1279 * skip it and allow recovery thread to
1280 * proceed.
1281 */
1261 case -ENOMEM: 1282 case -ENOMEM:
1262 err = 0; 1283 err = 0;
1263 goto out; 1284 goto out;
@@ -1605,7 +1626,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1605 goto out; 1626 goto out;
1606 case -NFS4ERR_GRACE: 1627 case -NFS4ERR_GRACE:
1607 case -NFS4ERR_DELAY: 1628 case -NFS4ERR_DELAY:
1608 case -EKEYEXPIRED:
1609 nfs4_handle_exception(server, err, &exception); 1629 nfs4_handle_exception(server, err, &exception);
1610 err = 0; 1630 err = 0;
1611 } 1631 }
@@ -1820,6 +1840,8 @@ struct nfs4_closedata {
1820 struct nfs_closeres res; 1840 struct nfs_closeres res;
1821 struct nfs_fattr fattr; 1841 struct nfs_fattr fattr;
1822 unsigned long timestamp; 1842 unsigned long timestamp;
1843 bool roc;
1844 u32 roc_barrier;
1823}; 1845};
1824 1846
1825static void nfs4_free_closedata(void *data) 1847static void nfs4_free_closedata(void *data)
@@ -1827,6 +1849,8 @@ static void nfs4_free_closedata(void *data)
1827 struct nfs4_closedata *calldata = data; 1849 struct nfs4_closedata *calldata = data;
1828 struct nfs4_state_owner *sp = calldata->state->owner; 1850 struct nfs4_state_owner *sp = calldata->state->owner;
1829 1851
1852 if (calldata->roc)
1853 pnfs_roc_release(calldata->state->inode);
1830 nfs4_put_open_state(calldata->state); 1854 nfs4_put_open_state(calldata->state);
1831 nfs_free_seqid(calldata->arg.seqid); 1855 nfs_free_seqid(calldata->arg.seqid);
1832 nfs4_put_state_owner(sp); 1856 nfs4_put_state_owner(sp);
@@ -1859,6 +1883,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1859 */ 1883 */
1860 switch (task->tk_status) { 1884 switch (task->tk_status) {
1861 case 0: 1885 case 0:
1886 if (calldata->roc)
1887 pnfs_roc_set_barrier(state->inode,
1888 calldata->roc_barrier);
1862 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 1889 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
1863 renew_lease(server, calldata->timestamp); 1890 renew_lease(server, calldata->timestamp);
1864 nfs4_close_clear_stateid_flags(state, 1891 nfs4_close_clear_stateid_flags(state,
@@ -1911,8 +1938,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1911 return; 1938 return;
1912 } 1939 }
1913 1940
1914 if (calldata->arg.fmode == 0) 1941 if (calldata->arg.fmode == 0) {
1915 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 1942 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
1943 if (calldata->roc &&
1944 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
1945 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
1946 task, NULL);
1947 return;
1948 }
1949 }
1916 1950
1917 nfs_fattr_init(calldata->res.fattr); 1951 nfs_fattr_init(calldata->res.fattr);
1918 calldata->timestamp = jiffies; 1952 calldata->timestamp = jiffies;
@@ -1940,7 +1974,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1940 * 1974 *
1941 * NOTE: Caller must be holding the sp->so_owner semaphore! 1975 * NOTE: Caller must be holding the sp->so_owner semaphore!
1942 */ 1976 */
1943int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) 1977int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
1944{ 1978{
1945 struct nfs_server *server = NFS_SERVER(state->inode); 1979 struct nfs_server *server = NFS_SERVER(state->inode);
1946 struct nfs4_closedata *calldata; 1980 struct nfs4_closedata *calldata;
@@ -1975,12 +2009,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1975 calldata->res.fattr = &calldata->fattr; 2009 calldata->res.fattr = &calldata->fattr;
1976 calldata->res.seqid = calldata->arg.seqid; 2010 calldata->res.seqid = calldata->arg.seqid;
1977 calldata->res.server = server; 2011 calldata->res.server = server;
1978 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 2012 calldata->roc = roc;
1979 path_get(path); 2013 path_get(path);
1980 calldata->path = *path; 2014 calldata->path = *path;
1981 2015
1982 msg.rpc_argp = &calldata->arg, 2016 msg.rpc_argp = &calldata->arg;
1983 msg.rpc_resp = &calldata->res, 2017 msg.rpc_resp = &calldata->res;
1984 task_setup_data.callback_data = calldata; 2018 task_setup_data.callback_data = calldata;
1985 task = rpc_run_task(&task_setup_data); 2019 task = rpc_run_task(&task_setup_data);
1986 if (IS_ERR(task)) 2020 if (IS_ERR(task))
@@ -1993,125 +2027,24 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1993out_free_calldata: 2027out_free_calldata:
1994 kfree(calldata); 2028 kfree(calldata);
1995out: 2029out:
2030 if (roc)
2031 pnfs_roc_release(state->inode);
1996 nfs4_put_open_state(state); 2032 nfs4_put_open_state(state);
1997 nfs4_put_state_owner(sp); 2033 nfs4_put_state_owner(sp);
1998 return status; 2034 return status;
1999} 2035}
2000 2036
2001static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) 2037static struct inode *
2002{ 2038nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
2003 struct file *filp;
2004 int ret;
2005
2006 /* If the open_intent is for execute, we have an extra check to make */
2007 if (fmode & FMODE_EXEC) {
2008 ret = nfs_may_open(state->inode,
2009 state->owner->so_cred,
2010 nd->intent.open.flags);
2011 if (ret < 0)
2012 goto out_close;
2013 }
2014 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
2015 if (!IS_ERR(filp)) {
2016 struct nfs_open_context *ctx;
2017 ctx = nfs_file_open_context(filp);
2018 ctx->state = state;
2019 return 0;
2020 }
2021 ret = PTR_ERR(filp);
2022out_close:
2023 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
2024 return ret;
2025}
2026
2027struct dentry *
2028nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2029{ 2039{
2030 struct path path = {
2031 .mnt = nd->path.mnt,
2032 .dentry = dentry,
2033 };
2034 struct dentry *parent;
2035 struct iattr attr;
2036 struct rpc_cred *cred;
2037 struct nfs4_state *state; 2040 struct nfs4_state *state;
2038 struct dentry *res;
2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2041
2042 if (nd->flags & LOOKUP_CREATE) {
2043 attr.ia_mode = nd->intent.open.create_mode;
2044 attr.ia_valid = ATTR_MODE;
2045 if (!IS_POSIXACL(dir))
2046 attr.ia_mode &= ~current_umask();
2047 } else {
2048 open_flags &= ~O_EXCL;
2049 attr.ia_valid = 0;
2050 BUG_ON(open_flags & O_CREAT);
2051 }
2052 2041
2053 cred = rpc_lookup_cred();
2054 if (IS_ERR(cred))
2055 return (struct dentry *)cred;
2056 parent = dentry->d_parent;
2057 /* Protect against concurrent sillydeletes */ 2042 /* Protect against concurrent sillydeletes */
2058 nfs_block_sillyrename(parent); 2043 state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); 2044 if (IS_ERR(state))
2060 put_rpccred(cred); 2045 return ERR_CAST(state);
2061 if (IS_ERR(state)) { 2046 ctx->state = state;
2062 if (PTR_ERR(state) == -ENOENT) { 2047 return igrab(state->inode);
2063 d_add(dentry, NULL);
2064 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2065 }
2066 nfs_unblock_sillyrename(parent);
2067 return (struct dentry *)state;
2068 }
2069 res = d_add_unique(dentry, igrab(state->inode));
2070 if (res != NULL)
2071 path.dentry = res;
2072 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
2073 nfs_unblock_sillyrename(parent);
2074 nfs4_intent_set_file(nd, &path, state, fmode);
2075 return res;
2076}
2077
2078int
2079nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
2080{
2081 struct path path = {
2082 .mnt = nd->path.mnt,
2083 .dentry = dentry,
2084 };
2085 struct rpc_cred *cred;
2086 struct nfs4_state *state;
2087 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
2088
2089 cred = rpc_lookup_cred();
2090 if (IS_ERR(cred))
2091 return PTR_ERR(cred);
2092 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
2093 put_rpccred(cred);
2094 if (IS_ERR(state)) {
2095 switch (PTR_ERR(state)) {
2096 case -EPERM:
2097 case -EACCES:
2098 case -EDQUOT:
2099 case -ENOSPC:
2100 case -EROFS:
2101 return PTR_ERR(state);
2102 default:
2103 goto out_drop;
2104 }
2105 }
2106 if (state->inode == dentry->d_inode) {
2107 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2108 nfs4_intent_set_file(nd, &path, state, fmode);
2109 return 1;
2110 }
2111 nfs4_close_sync(&path, state, fmode);
2112out_drop:
2113 d_drop(dentry);
2114 return 0;
2115} 2048}
2116 2049
2117static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2050static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2568,36 +2501,35 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
2568 2501
2569static int 2502static int
2570nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 2503nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2571 int flags, struct nameidata *nd) 2504 int flags, struct nfs_open_context *ctx)
2572{ 2505{
2573 struct path path = { 2506 struct path my_path = {
2574 .mnt = nd->path.mnt,
2575 .dentry = dentry, 2507 .dentry = dentry,
2576 }; 2508 };
2509 struct path *path = &my_path;
2577 struct nfs4_state *state; 2510 struct nfs4_state *state;
2578 struct rpc_cred *cred; 2511 struct rpc_cred *cred = NULL;
2579 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); 2512 fmode_t fmode = 0;
2580 int status = 0; 2513 int status = 0;
2581 2514
2582 cred = rpc_lookup_cred(); 2515 if (ctx != NULL) {
2583 if (IS_ERR(cred)) { 2516 cred = ctx->cred;
2584 status = PTR_ERR(cred); 2517 path = &ctx->path;
2585 goto out; 2518 fmode = ctx->mode;
2586 } 2519 }
2587 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); 2520 sattr->ia_mode &= ~current_umask();
2521 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2588 d_drop(dentry); 2522 d_drop(dentry);
2589 if (IS_ERR(state)) { 2523 if (IS_ERR(state)) {
2590 status = PTR_ERR(state); 2524 status = PTR_ERR(state);
2591 goto out_putcred; 2525 goto out;
2592 } 2526 }
2593 d_add(dentry, igrab(state->inode)); 2527 d_add(dentry, igrab(state->inode));
2594 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2528 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2595 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2529 if (ctx != NULL)
2596 status = nfs4_intent_set_file(nd, &path, state, fmode); 2530 ctx->state = state;
2597 else 2531 else
2598 nfs4_close_sync(&path, state, fmode); 2532 nfs4_close_sync(path, state, fmode);
2599out_putcred:
2600 put_rpccred(cred);
2601out: 2533out:
2602 return status; 2534 return status;
2603} 2535}
@@ -2655,6 +2587,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2655 2587
2656 args->bitmask = server->cache_consistency_bitmask; 2588 args->bitmask = server->cache_consistency_bitmask;
2657 res->server = server; 2589 res->server = server;
2590 res->seq_res.sr_slot = NULL;
2658 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2591 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2659} 2592}
2660 2593
@@ -2671,18 +2604,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2671 return 1; 2604 return 1;
2672} 2605}
2673 2606
2607static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
2608{
2609 struct nfs_server *server = NFS_SERVER(dir);
2610 struct nfs_renameargs *arg = msg->rpc_argp;
2611 struct nfs_renameres *res = msg->rpc_resp;
2612
2613 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
2614 arg->bitmask = server->attr_bitmask;
2615 res->server = server;
2616}
2617
2618static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
2619 struct inode *new_dir)
2620{
2621 struct nfs_renameres *res = task->tk_msg.rpc_resp;
2622
2623 if (!nfs4_sequence_done(task, &res->seq_res))
2624 return 0;
2625 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2626 return 0;
2627
2628 update_changeattr(old_dir, &res->old_cinfo);
2629 nfs_post_op_update_inode(old_dir, res->old_fattr);
2630 update_changeattr(new_dir, &res->new_cinfo);
2631 nfs_post_op_update_inode(new_dir, res->new_fattr);
2632 return 1;
2633}
2634
2674static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, 2635static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2675 struct inode *new_dir, struct qstr *new_name) 2636 struct inode *new_dir, struct qstr *new_name)
2676{ 2637{
2677 struct nfs_server *server = NFS_SERVER(old_dir); 2638 struct nfs_server *server = NFS_SERVER(old_dir);
2678 struct nfs4_rename_arg arg = { 2639 struct nfs_renameargs arg = {
2679 .old_dir = NFS_FH(old_dir), 2640 .old_dir = NFS_FH(old_dir),
2680 .new_dir = NFS_FH(new_dir), 2641 .new_dir = NFS_FH(new_dir),
2681 .old_name = old_name, 2642 .old_name = old_name,
2682 .new_name = new_name, 2643 .new_name = new_name,
2683 .bitmask = server->attr_bitmask, 2644 .bitmask = server->attr_bitmask,
2684 }; 2645 };
2685 struct nfs4_rename_res res = { 2646 struct nfs_renameres res = {
2686 .server = server, 2647 .server = server,
2687 }; 2648 };
2688 struct rpc_message msg = { 2649 struct rpc_message msg = {
@@ -2887,6 +2848,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2887{ 2848{
2888 struct nfs4_exception exception = { }; 2849 struct nfs4_exception exception = { };
2889 int err; 2850 int err;
2851
2852 sattr->ia_mode &= ~current_umask();
2890 do { 2853 do {
2891 err = nfs4_handle_exception(NFS_SERVER(dir), 2854 err = nfs4_handle_exception(NFS_SERVER(dir),
2892 _nfs4_proc_mkdir(dir, dentry, sattr), 2855 _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2896,15 +2859,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2896} 2859}
2897 2860
2898static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2861static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2899 u64 cookie, struct page *page, unsigned int count, int plus) 2862 u64 cookie, struct page **pages, unsigned int count, int plus)
2900{ 2863{
2901 struct inode *dir = dentry->d_inode; 2864 struct inode *dir = dentry->d_inode;
2902 struct nfs4_readdir_arg args = { 2865 struct nfs4_readdir_arg args = {
2903 .fh = NFS_FH(dir), 2866 .fh = NFS_FH(dir),
2904 .pages = &page, 2867 .pages = pages,
2905 .pgbase = 0, 2868 .pgbase = 0,
2906 .count = count, 2869 .count = count,
2907 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2870 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
2871 .plus = plus,
2908 }; 2872 };
2909 struct nfs4_readdir_res res; 2873 struct nfs4_readdir_res res;
2910 struct rpc_message msg = { 2874 struct rpc_message msg = {
@@ -2922,8 +2886,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2922 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2886 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2923 res.pgbase = args.pgbase; 2887 res.pgbase = args.pgbase;
2924 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2888 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2925 if (status == 0) 2889 if (status >= 0) {
2926 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2890 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2891 status += args.pgbase;
2892 }
2927 2893
2928 nfs_invalidate_atime(dir); 2894 nfs_invalidate_atime(dir);
2929 2895
@@ -2932,14 +2898,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2932} 2898}
2933 2899
2934static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2900static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2935 u64 cookie, struct page *page, unsigned int count, int plus) 2901 u64 cookie, struct page **pages, unsigned int count, int plus)
2936{ 2902{
2937 struct nfs4_exception exception = { }; 2903 struct nfs4_exception exception = { };
2938 int err; 2904 int err;
2939 do { 2905 do {
2940 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), 2906 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
2941 _nfs4_proc_readdir(dentry, cred, cookie, 2907 _nfs4_proc_readdir(dentry, cred, cookie,
2942 page, count, plus), 2908 pages, count, plus),
2943 &exception); 2909 &exception);
2944 } while (exception.retry); 2910 } while (exception.retry);
2945 return err; 2911 return err;
@@ -2984,6 +2950,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2984{ 2950{
2985 struct nfs4_exception exception = { }; 2951 struct nfs4_exception exception = { };
2986 int err; 2952 int err;
2953
2954 sattr->ia_mode &= ~current_umask();
2987 do { 2955 do {
2988 err = nfs4_handle_exception(NFS_SERVER(dir), 2956 err = nfs4_handle_exception(NFS_SERVER(dir),
2989 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 2957 _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3429,6 +3397,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3429 ret = nfs_revalidate_inode(server, inode); 3397 ret = nfs_revalidate_inode(server, inode);
3430 if (ret < 0) 3398 if (ret < 0)
3431 return ret; 3399 return ret;
3400 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
3401 nfs_zap_acl_cache(inode);
3432 ret = nfs4_read_cached_acl(inode, buf, buflen); 3402 ret = nfs4_read_cached_acl(inode, buf, buflen);
3433 if (ret != -ENOENT) 3403 if (ret != -ENOENT)
3434 return ret; 3404 return ret;
@@ -3457,6 +3427,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3457 nfs_inode_return_delegation(inode); 3427 nfs_inode_return_delegation(inode);
3458 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3428 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3459 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3429 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3430 /*
3431 * Acl update can result in inode attribute update.
3432 * so mark the attribute cache invalid.
3433 */
3434 spin_lock(&inode->i_lock);
3435 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
3436 spin_unlock(&inode->i_lock);
3460 nfs_access_zap_cache(inode); 3437 nfs_access_zap_cache(inode);
3461 nfs_zap_acl_cache(inode); 3438 nfs_zap_acl_cache(inode);
3462 return ret; 3439 return ret;
@@ -3490,9 +3467,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3490 nfs4_state_mark_reclaim_nograce(clp, state); 3467 nfs4_state_mark_reclaim_nograce(clp, state);
3491 goto do_state_recovery; 3468 goto do_state_recovery;
3492 case -NFS4ERR_STALE_STATEID: 3469 case -NFS4ERR_STALE_STATEID:
3493 if (state == NULL)
3494 break;
3495 nfs4_state_mark_reclaim_reboot(clp, state);
3496 case -NFS4ERR_STALE_CLIENTID: 3470 case -NFS4ERR_STALE_CLIENTID:
3497 case -NFS4ERR_EXPIRED: 3471 case -NFS4ERR_EXPIRED:
3498 goto do_state_recovery; 3472 goto do_state_recovery;
@@ -3540,6 +3514,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3540 struct nfs4_setclientid setclientid = { 3514 struct nfs4_setclientid setclientid = {
3541 .sc_verifier = &sc_verifier, 3515 .sc_verifier = &sc_verifier,
3542 .sc_prog = program, 3516 .sc_prog = program,
3517 .sc_cb_ident = clp->cl_cb_ident,
3543 }; 3518 };
3544 struct rpc_message msg = { 3519 struct rpc_message msg = {
3545 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3520 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3579,7 +3554,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3579 if (signalled()) 3554 if (signalled())
3580 break; 3555 break;
3581 if (loop++ & 1) 3556 if (loop++ & 1)
3582 ssleep(clp->cl_lease_time + 1); 3557 ssleep(clp->cl_lease_time / HZ + 1);
3583 else 3558 else
3584 if (++clp->cl_id_uniquifier == 0) 3559 if (++clp->cl_id_uniquifier == 0)
3585 break; 3560 break;
@@ -3626,7 +3601,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3626 case -NFS4ERR_RESOURCE: 3601 case -NFS4ERR_RESOURCE:
3627 /* The IBM lawyers misread another document! */ 3602 /* The IBM lawyers misread another document! */
3628 case -NFS4ERR_DELAY: 3603 case -NFS4ERR_DELAY:
3629 case -EKEYEXPIRED:
3630 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3604 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3631 } 3605 }
3632 } while (err == 0); 3606 } while (err == 0);
@@ -3721,14 +3695,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3721 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3695 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3722 data->res.fattr = &data->fattr; 3696 data->res.fattr = &data->fattr;
3723 data->res.server = server; 3697 data->res.server = server;
3724 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3725 nfs_fattr_init(data->res.fattr); 3698 nfs_fattr_init(data->res.fattr);
3726 data->timestamp = jiffies; 3699 data->timestamp = jiffies;
3727 data->rpc_status = 0; 3700 data->rpc_status = 0;
3728 3701
3729 task_setup_data.callback_data = data; 3702 task_setup_data.callback_data = data;
3730 msg.rpc_argp = &data->args, 3703 msg.rpc_argp = &data->args;
3731 msg.rpc_resp = &data->res, 3704 msg.rpc_resp = &data->res;
3732 task = rpc_run_task(&task_setup_data); 3705 task = rpc_run_task(&task_setup_data);
3733 if (IS_ERR(task)) 3706 if (IS_ERR(task))
3734 return PTR_ERR(task); 3707 return PTR_ERR(task);
@@ -3807,6 +3780,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3807 goto out; 3780 goto out;
3808 lsp = request->fl_u.nfs4_fl.owner; 3781 lsp = request->fl_u.nfs4_fl.owner;
3809 arg.lock_owner.id = lsp->ls_id.id; 3782 arg.lock_owner.id = lsp->ls_id.id;
3783 arg.lock_owner.s_dev = server->s_dev;
3810 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3784 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3811 switch (status) { 3785 switch (status) {
3812 case 0: 3786 case 0:
@@ -3874,7 +3848,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3874 p->arg.fl = &p->fl; 3848 p->arg.fl = &p->fl;
3875 p->arg.seqid = seqid; 3849 p->arg.seqid = seqid;
3876 p->res.seqid = seqid; 3850 p->res.seqid = seqid;
3877 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3878 p->arg.stateid = &lsp->ls_stateid; 3851 p->arg.stateid = &lsp->ls_stateid;
3879 p->lsp = lsp; 3852 p->lsp = lsp;
3880 atomic_inc(&lsp->ls_count); 3853 atomic_inc(&lsp->ls_count);
@@ -3973,8 +3946,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3973 return ERR_PTR(-ENOMEM); 3946 return ERR_PTR(-ENOMEM);
3974 } 3947 }
3975 3948
3976 msg.rpc_argp = &data->arg, 3949 msg.rpc_argp = &data->arg;
3977 msg.rpc_resp = &data->res, 3950 msg.rpc_resp = &data->res;
3978 task_setup_data.callback_data = data; 3951 task_setup_data.callback_data = data;
3979 return rpc_run_task(&task_setup_data); 3952 return rpc_run_task(&task_setup_data);
3980} 3953}
@@ -4053,8 +4026,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
4053 p->arg.lock_stateid = &lsp->ls_stateid; 4026 p->arg.lock_stateid = &lsp->ls_stateid;
4054 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 4027 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
4055 p->arg.lock_owner.id = lsp->ls_id.id; 4028 p->arg.lock_owner.id = lsp->ls_id.id;
4029 p->arg.lock_owner.s_dev = server->s_dev;
4056 p->res.lock_seqid = p->arg.lock_seqid; 4030 p->res.lock_seqid = p->arg.lock_seqid;
4057 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4058 p->lsp = lsp; 4031 p->lsp = lsp;
4059 p->server = server; 4032 p->server = server;
4060 atomic_inc(&lsp->ls_count); 4033 atomic_inc(&lsp->ls_count);
@@ -4211,8 +4184,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4211 data->arg.reclaim = NFS_LOCK_RECLAIM; 4184 data->arg.reclaim = NFS_LOCK_RECLAIM;
4212 task_setup_data.callback_ops = &nfs4_recover_lock_ops; 4185 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4213 } 4186 }
4214 msg.rpc_argp = &data->arg, 4187 msg.rpc_argp = &data->arg;
4215 msg.rpc_resp = &data->res, 4188 msg.rpc_resp = &data->res;
4216 task_setup_data.callback_data = data; 4189 task_setup_data.callback_data = data;
4217 task = rpc_run_task(&task_setup_data); 4190 task = rpc_run_task(&task_setup_data);
4218 if (IS_ERR(task)) 4191 if (IS_ERR(task))
@@ -4241,7 +4214,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4241 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4214 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4242 return 0; 4215 return 0;
4243 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4216 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4244 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 4217 if (err != -NFS4ERR_DELAY)
4245 break; 4218 break;
4246 nfs4_handle_exception(server, err, &exception); 4219 nfs4_handle_exception(server, err, &exception);
4247 } while (exception.retry); 4220 } while (exception.retry);
@@ -4266,7 +4239,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4266 goto out; 4239 goto out;
4267 case -NFS4ERR_GRACE: 4240 case -NFS4ERR_GRACE:
4268 case -NFS4ERR_DELAY: 4241 case -NFS4ERR_DELAY:
4269 case -EKEYEXPIRED:
4270 nfs4_handle_exception(server, err, &exception); 4242 nfs4_handle_exception(server, err, &exception);
4271 err = 0; 4243 err = 0;
4272 } 4244 }
@@ -4412,13 +4384,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4412 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4384 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4413 err = 0; 4385 err = 0;
4414 goto out; 4386 goto out;
4387 case -EKEYEXPIRED:
4388 /*
4389 * User RPCSEC_GSS context has expired.
4390 * We cannot recover this stateid now, so
4391 * skip it and allow recovery thread to
4392 * proceed.
4393 */
4394 err = 0;
4395 goto out;
4415 case -ENOMEM: 4396 case -ENOMEM:
4416 case -NFS4ERR_DENIED: 4397 case -NFS4ERR_DENIED:
4417 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 4398 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4418 err = 0; 4399 err = 0;
4419 goto out; 4400 goto out;
4420 case -NFS4ERR_DELAY: 4401 case -NFS4ERR_DELAY:
4421 case -EKEYEXPIRED:
4422 break; 4402 break;
4423 } 4403 }
4424 err = nfs4_handle_exception(server, err, &exception); 4404 err = nfs4_handle_exception(server, err, &exception);
@@ -4451,48 +4431,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4451 return; 4431 return;
4452 args->lock_owner.clientid = server->nfs_client->cl_clientid; 4432 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4453 args->lock_owner.id = lsp->ls_id.id; 4433 args->lock_owner.id = lsp->ls_id.id;
4434 args->lock_owner.s_dev = server->s_dev;
4454 msg.rpc_argp = args; 4435 msg.rpc_argp = args;
4455 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); 4436 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4456} 4437}
4457 4438
4458#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4439#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4459 4440
4460int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4441static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
4461 size_t buflen, int flags) 4442 const void *buf, size_t buflen,
4443 int flags, int type)
4462{ 4444{
4463 struct inode *inode = dentry->d_inode; 4445 if (strcmp(key, "") != 0)
4464 4446 return -EINVAL;
4465 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4466 return -EOPNOTSUPP;
4467 4447
4468 return nfs4_proc_set_acl(inode, buf, buflen); 4448 return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
4469} 4449}
4470 4450
4471/* The getxattr man page suggests returning -ENODATA for unknown attributes, 4451static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
4472 * and that's what we'll do for e.g. user attributes that haven't been set. 4452 void *buf, size_t buflen, int type)
4473 * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
4474 * attributes in kernel-managed attribute namespaces. */
4475ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
4476 size_t buflen)
4477{ 4453{
4478 struct inode *inode = dentry->d_inode; 4454 if (strcmp(key, "") != 0)
4479 4455 return -EINVAL;
4480 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4481 return -EOPNOTSUPP;
4482 4456
4483 return nfs4_proc_get_acl(inode, buf, buflen); 4457 return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
4484} 4458}
4485 4459
4486ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) 4460static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
4461 size_t list_len, const char *name,
4462 size_t name_len, int type)
4487{ 4463{
4488 size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; 4464 size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
4489 4465
4490 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) 4466 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
4491 return 0; 4467 return 0;
4492 if (buf && buflen < len) 4468
4493 return -ERANGE; 4469 if (list && len <= list_len)
4494 if (buf) 4470 memcpy(list, XATTR_NAME_NFSV4_ACL, len);
4495 memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
4496 return len; 4471 return len;
4497} 4472}
4498 4473
@@ -4545,6 +4520,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4545 4520
4546#ifdef CONFIG_NFS_V4_1 4521#ifdef CONFIG_NFS_V4_1
4547/* 4522/*
4523 * Validate the exchange flags returned by the server: reject unknown
4524 * flags, reject having both the PNFS and NON_PNFS flags set, and reject
4525 * having none of the NON_PNFS, PNFS, or DS flags set.
4526 */
4527static int nfs4_check_cl_exchange_flags(u32 flags)
4528{
4529 if (flags & ~EXCHGID4_FLAG_MASK_R)
4530 goto out_inval;
4531 if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
4532 (flags & EXCHGID4_FLAG_USE_NON_PNFS))
4533 goto out_inval;
4534 if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
4535 goto out_inval;
4536 return NFS_OK;
4537out_inval:
4538 return -NFS4ERR_INVAL;
4539}
4540
4541/*
4548 * nfs4_proc_exchange_id() 4542 * nfs4_proc_exchange_id()
4549 * 4543 *
4550 * Since the clientid has expired, all compounds using sessions 4544 * Since the clientid has expired, all compounds using sessions
@@ -4557,7 +4551,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4557 nfs4_verifier verifier; 4551 nfs4_verifier verifier;
4558 struct nfs41_exchange_id_args args = { 4552 struct nfs41_exchange_id_args args = {
4559 .client = clp, 4553 .client = clp,
4560 .flags = clp->cl_exchange_flags, 4554 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
4561 }; 4555 };
4562 struct nfs41_exchange_id_res res = { 4556 struct nfs41_exchange_id_res res = {
4563 .client = clp, 4557 .client = clp,
@@ -4574,34 +4568,21 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4574 dprintk("--> %s\n", __func__); 4568 dprintk("--> %s\n", __func__);
4575 BUG_ON(clp == NULL); 4569 BUG_ON(clp == NULL);
4576 4570
4577 /* Remove server-only flags */
4578 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4579
4580 p = (u32 *)verifier.data; 4571 p = (u32 *)verifier.data;
4581 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4572 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4582 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4573 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4583 args.verifier = &verifier; 4574 args.verifier = &verifier;
4584 4575
4585 while (1) { 4576 args.id_len = scnprintf(args.id, sizeof(args.id),
4586 args.id_len = scnprintf(args.id, sizeof(args.id), 4577 "%s/%s.%s/%u",
4587 "%s/%s %u", 4578 clp->cl_ipaddr,
4588 clp->cl_ipaddr, 4579 init_utsname()->nodename,
4589 rpc_peeraddr2str(clp->cl_rpcclient, 4580 init_utsname()->domainname,
4590 RPC_DISPLAY_ADDR), 4581 clp->cl_rpcclient->cl_auth->au_flavor);
4591 clp->cl_id_uniquifier);
4592
4593 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4594
4595 if (status != -NFS4ERR_CLID_INUSE)
4596 break;
4597
4598 if (signalled())
4599 break;
4600
4601 if (++clp->cl_id_uniquifier == 0)
4602 break;
4603 }
4604 4582
4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584 if (!status)
4585 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4605 dprintk("<-- %s status= %d\n", __func__, status); 4586 dprintk("<-- %s status= %d\n", __func__, status);
4606 return status; 4587 return status;
4607} 4588}
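
The rewritten EXCHANGE_ID hunk above builds a stable client-owner string from the client's IP address, utsname, and auth flavor instead of the old retry-with-uniquifier loop. A tiny sketch of the resulting string, with invented values:

#include <stdio.h>

int main(void)
{
	char id[128];
	int len = snprintf(id, sizeof(id), "%s/%s.%s/%u",
			   "192.0.2.7",     /* cl_ipaddr (invented) */
			   "client1",       /* nodename (invented) */
			   "example.com",   /* domainname (invented) */
			   1u);             /* au_flavor, e.g. AUTH_UNIX */

	printf("%d %s\n", len, id); /* 31 192.0.2.7/client1.example.com/1 */
	return 0;
}
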
@@ -4647,7 +4628,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4647 switch (task->tk_status) { 4628 switch (task->tk_status) {
4648 case -NFS4ERR_DELAY: 4629 case -NFS4ERR_DELAY:
4649 case -NFS4ERR_GRACE: 4630 case -NFS4ERR_GRACE:
4650 case -EKEYEXPIRED:
4651 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4631 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4652 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4632 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4653 task->tk_status = 0; 4633 task->tk_status = 0;
@@ -4687,7 +4667,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4687 }; 4667 };
4688 int status; 4668 int status;
4689 4669
4690 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4691 dprintk("--> %s\n", __func__); 4670 dprintk("--> %s\n", __func__);
4692 task = rpc_run_task(&task_setup); 4671 task = rpc_run_task(&task_setup);
4693 4672
@@ -4837,17 +4816,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4837 if (!session) 4816 if (!session)
4838 return NULL; 4817 return NULL;
4839 4818
4840 init_completion(&session->complete);
4841
4842 tbl = &session->fc_slot_table; 4819 tbl = &session->fc_slot_table;
4843 tbl->highest_used_slotid = -1; 4820 tbl->highest_used_slotid = -1;
4844 spin_lock_init(&tbl->slot_tbl_lock); 4821 spin_lock_init(&tbl->slot_tbl_lock);
4845 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4822 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4823 init_completion(&tbl->complete);
4846 4824
4847 tbl = &session->bc_slot_table; 4825 tbl = &session->bc_slot_table;
4848 tbl->highest_used_slotid = -1; 4826 tbl->highest_used_slotid = -1;
4849 spin_lock_init(&tbl->slot_tbl_lock); 4827 spin_lock_init(&tbl->slot_tbl_lock);
4850 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4828 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4829 init_completion(&tbl->complete);
4851 4830
4852 session->session_state = 1<<NFS4_SESSION_INITING; 4831 session->session_state = 1<<NFS4_SESSION_INITING;
4853 4832
@@ -4914,49 +4893,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4914 args->bc_attrs.max_reqs); 4893 args->bc_attrs.max_reqs);
4915} 4894}
4916 4895
4917static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) 4896static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4918{ 4897{
4919 if (rcvd <= sent) 4898 struct nfs4_channel_attrs *sent = &args->fc_attrs;
4920 return 0; 4899 struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
4921 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " 4900
4922 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); 4901 if (rcvd->headerpadsz > sent->headerpadsz)
4923 return -EINVAL; 4902 return -EINVAL;
4903 if (rcvd->max_resp_sz > sent->max_resp_sz)
4904 return -EINVAL;
4905 /*
4906 * Our requested max_ops is the minimum we need; we're not
4907 * prepared to break up compounds into smaller pieces than that.
4908 * So, no point even trying to continue if the server won't
4909 * cooperate:
4910 */
4911 if (rcvd->max_ops < sent->max_ops)
4912 return -EINVAL;
4913 if (rcvd->max_reqs == 0)
4914 return -EINVAL;
4915 return 0;
4924} 4916}
4925 4917
4926#define _verify_fore_channel_attr(_name_) \ 4918static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4927 _verify_channel_attr("fore", #_name_, \ 4919{
4928 args->fc_attrs._name_, \ 4920 struct nfs4_channel_attrs *sent = &args->bc_attrs;
4929 session->fc_attrs._name_) 4921 struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
4930 4922
4931#define _verify_back_channel_attr(_name_) \ 4923 if (rcvd->max_rqst_sz > sent->max_rqst_sz)
4932 _verify_channel_attr("back", #_name_, \ 4924 return -EINVAL;
4933 args->bc_attrs._name_, \ 4925 if (rcvd->max_resp_sz < sent->max_resp_sz)
4934 session->bc_attrs._name_) 4926 return -EINVAL;
4927 if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
4928 return -EINVAL;
4929 /* These would render the backchannel useless: */
4930 if (rcvd->max_ops == 0)
4931 return -EINVAL;
4932 if (rcvd->max_reqs == 0)
4933 return -EINVAL;
4934 return 0;
4935}
4935 4936
4936/*
4937 * The server is not allowed to increase the fore channel header pad size,
4938 * maximum response size, or maximum number of operations.
4939 *
4940 * The back channel attributes are only negotiated down: we send what the
4941 * (back channel) server insists upon.
4942 */
4943static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, 4937static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4944 struct nfs4_session *session) 4938 struct nfs4_session *session)
4945{ 4939{
4946 int ret = 0; 4940 int ret;
4947
4948 ret |= _verify_fore_channel_attr(headerpadsz);
4949 ret |= _verify_fore_channel_attr(max_resp_sz);
4950 ret |= _verify_fore_channel_attr(max_ops);
4951
4952 ret |= _verify_back_channel_attr(headerpadsz);
4953 ret |= _verify_back_channel_attr(max_rqst_sz);
4954 ret |= _verify_back_channel_attr(max_resp_sz);
4955 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4956 ret |= _verify_back_channel_attr(max_ops);
4957 ret |= _verify_back_channel_attr(max_reqs);
4958 4941
4959 return ret; 4942 ret = nfs4_verify_fore_channel_attrs(args, session);
4943 if (ret)
4944 return ret;
4945 return nfs4_verify_back_channel_attrs(args, session);
4960} 4946}
4961 4947
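/*
 * Editor's sketch (illustration only, not part of this patch): the
 * negotiation rule the two helpers above enforce. The server may trim
 * most limits but must never raise what the client offered, and a few
 * limits have hard floors. A plain struct stands in for
 * nfs4_channel_attrs.
 */
#include <stdio.h>

struct chan_attrs {
	unsigned int headerpadsz, max_resp_sz, max_ops, max_reqs;
};

static int verify_fore(const struct chan_attrs *sent,
		       const struct chan_attrs *rcvd)
{
	if (rcvd->headerpadsz > sent->headerpadsz)
		return -1;      /* server may not increase padding */
	if (rcvd->max_resp_sz > sent->max_resp_sz)
		return -1;      /* ...nor the reply size ceiling */
	if (rcvd->max_ops < sent->max_ops)
		return -1;      /* we cannot split compounds further */
	if (rcvd->max_reqs == 0)
		return -1;      /* a session needs at least one slot */
	return 0;
}

int main(void)
{
	struct chan_attrs sent = { 0, 4096, 8, 16 };
	struct chan_attrs rcvd = { 0, 4096, 8, 1 };

	printf("%d\n", verify_fore(&sent, &rcvd)); /* 0: acceptable */
	return 0;
}
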
4962static int _nfs4_proc_create_session(struct nfs_client *clp) 4948static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5111,7 +5097,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5111{ 5097{
5112 switch(task->tk_status) { 5098 switch(task->tk_status) {
5113 case -NFS4ERR_DELAY: 5099 case -NFS4ERR_DELAY:
5114 case -EKEYEXPIRED:
5115 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5100 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5116 return -EAGAIN; 5101 return -EAGAIN;
5117 default: 5102 default:
@@ -5180,12 +5165,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
5180 5165
5181 if (!atomic_inc_not_zero(&clp->cl_count)) 5166 if (!atomic_inc_not_zero(&clp->cl_count))
5182 return ERR_PTR(-EIO); 5167 return ERR_PTR(-EIO);
5183 calldata = kmalloc(sizeof(*calldata), GFP_NOFS); 5168 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5184 if (calldata == NULL) { 5169 if (calldata == NULL) {
5185 nfs_put_client(clp); 5170 nfs_put_client(clp);
5186 return ERR_PTR(-ENOMEM); 5171 return ERR_PTR(-ENOMEM);
5187 } 5172 }
5188 calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5189 msg.rpc_argp = &calldata->args; 5173 msg.rpc_argp = &calldata->args;
5190 msg.rpc_resp = &calldata->res; 5174 msg.rpc_resp = &calldata->res;
5191 calldata->clp = clp; 5175 calldata->clp = clp;
@@ -5254,7 +5238,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5254 case -NFS4ERR_WRONG_CRED: /* What to do here? */ 5238 case -NFS4ERR_WRONG_CRED: /* What to do here? */
5255 break; 5239 break;
5256 case -NFS4ERR_DELAY: 5240 case -NFS4ERR_DELAY:
5257 case -EKEYEXPIRED:
5258 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5241 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5259 return -EAGAIN; 5242 return -EAGAIN;
5260 default: 5243 default:
@@ -5317,7 +5300,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5317 goto out; 5300 goto out;
5318 calldata->clp = clp; 5301 calldata->clp = clp;
5319 calldata->arg.one_fs = 0; 5302 calldata->arg.one_fs = 0;
5320 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5321 5303
5322 msg.rpc_argp = &calldata->arg; 5304 msg.rpc_argp = &calldata->arg;
5323 msg.rpc_resp = &calldata->res; 5305 msg.rpc_resp = &calldata->res;
@@ -5333,6 +5315,152 @@ out:
5333 dprintk("<-- %s status=%d\n", __func__, status); 5315 dprintk("<-- %s status=%d\n", __func__, status);
5334 return status; 5316 return status;
5335} 5317}
5318
5319static void
5320nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5321{
5322 struct nfs4_layoutget *lgp = calldata;
5323 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5324
5325 dprintk("--> %s\n", __func__);
5326 /* Note there is a race here, where a CB_LAYOUTRECALL can come in
5327 * right now covering the LAYOUTGET we are about to send.
5328 * However, that is not so catastrophic, and there seems
5329 * to be no way to prevent it completely.
5330 */
5331 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5332 &lgp->res.seq_res, 0, task))
5333 return;
5334 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
5335 NFS_I(lgp->args.inode)->layout,
5336 lgp->args.ctx->state)) {
5337 rpc_exit(task, NFS4_OK);
5338 return;
5339 }
5340 rpc_call_start(task);
5341}
5342
5343static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5344{
5345 struct nfs4_layoutget *lgp = calldata;
5346 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5347
5348 dprintk("--> %s\n", __func__);
5349
5350 if (!nfs4_sequence_done(task, &lgp->res.seq_res))
5351 return;
5352
5353 switch (task->tk_status) {
5354 case 0:
5355 break;
5356 case -NFS4ERR_LAYOUTTRYLATER:
5357 case -NFS4ERR_RECALLCONFLICT:
5358 task->tk_status = -NFS4ERR_DELAY;
5359 /* Fall through */
5360 default:
5361 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5362 rpc_restart_call_prepare(task);
5363 return;
5364 }
5365 }
5366 dprintk("<-- %s\n", __func__);
5367}
5368
5369static void nfs4_layoutget_release(void *calldata)
5370{
5371 struct nfs4_layoutget *lgp = calldata;
5372
5373 dprintk("--> %s\n", __func__);
5374 if (lgp->res.layout.buf != NULL)
5375 free_page((unsigned long) lgp->res.layout.buf);
5376 put_nfs_open_context(lgp->args.ctx);
5377 kfree(calldata);
5378 dprintk("<-- %s\n", __func__);
5379}
5380
5381static const struct rpc_call_ops nfs4_layoutget_call_ops = {
5382 .rpc_call_prepare = nfs4_layoutget_prepare,
5383 .rpc_call_done = nfs4_layoutget_done,
5384 .rpc_release = nfs4_layoutget_release,
5385};
5386
5387int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5388{
5389 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5390 struct rpc_task *task;
5391 struct rpc_message msg = {
5392 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
5393 .rpc_argp = &lgp->args,
5394 .rpc_resp = &lgp->res,
5395 };
5396 struct rpc_task_setup task_setup_data = {
5397 .rpc_client = server->client,
5398 .rpc_message = &msg,
5399 .callback_ops = &nfs4_layoutget_call_ops,
5400 .callback_data = lgp,
5401 .flags = RPC_TASK_ASYNC,
5402 };
5403 int status = 0;
5404
5405 dprintk("--> %s\n", __func__);
5406
5407 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
5408 if (lgp->res.layout.buf == NULL) {
5409 nfs4_layoutget_release(lgp);
5410 return -ENOMEM;
5411 }
5412
5413 lgp->res.seq_res.sr_slot = NULL;
5414 task = rpc_run_task(&task_setup_data);
5415 if (IS_ERR(task))
5416 return PTR_ERR(task);
5417 status = nfs4_wait_for_completion_rpc_task(task);
5418 if (status == 0)
5419 status = task->tk_status;
5420 if (status == 0)
5421 status = pnfs_layout_process(lgp);
5422 rpc_put_task(task);
5423 dprintk("<-- %s status=%d\n", __func__, status);
5424 return status;
5425}
5426
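/*
 * Editor's sketch (illustration only, not part of this patch): the
 * synchronous-over-async shape of nfs4_proc_layoutget() above. An
 * asynchronous task is started, the caller blocks until it completes,
 * and only then is the reply post-processed on the caller's stack.
 * pthreads stand in for the RPC task machinery. Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

static void *task_body(void *arg)
{
	*(int *)arg = 0;          /* pretend the LAYOUTGET RPC succeeded */
	return NULL;
}

int main(void)
{
	pthread_t task;
	int status = -1;

	if (pthread_create(&task, NULL, task_body, &status))
		return 1;         /* rpc_run_task() failed */
	pthread_join(task, NULL); /* nfs4_wait_for_completion_rpc_task() */
	if (status == 0)
		puts("process layout reply"); /* pnfs_layout_process() */
	return status;
}
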
5427static int
5428_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5429{
5430 struct nfs4_getdeviceinfo_args args = {
5431 .pdev = pdev,
5432 };
5433 struct nfs4_getdeviceinfo_res res = {
5434 .pdev = pdev,
5435 };
5436 struct rpc_message msg = {
5437 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
5438 .rpc_argp = &args,
5439 .rpc_resp = &res,
5440 };
5441 int status;
5442
5443 dprintk("--> %s\n", __func__);
5444 status = nfs4_call_sync(server, &msg, &args, &res, 0);
5445 dprintk("<-- %s status=%d\n", __func__, status);
5446
5447 return status;
5448}
5449
5450int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5451{
5452 struct nfs4_exception exception = { };
5453 int err;
5454
5455 do {
5456 err = nfs4_handle_exception(server,
5457 _nfs4_proc_getdeviceinfo(server, pdev),
5458 &exception);
5459 } while (exception.retry);
5460 return err;
5461}
5462EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5463
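/*
 * Editor's sketch (illustration only, not part of this patch): the
 * nfs4_handle_exception() retry idiom that nfs4_proc_getdeviceinfo()
 * wraps around the one-shot _nfs4_proc_getdeviceinfo(). The handler
 * decides whether an error is transient and the caller loops while it
 * says "retry". The back-off below is illustrative.
 */
#include <stdio.h>
#include <unistd.h>

#define ERR_DELAY (-10008)      /* NFS4ERR_DELAY, negated kernel-style */

struct exception { int retry; };

static int handle_exception(int err, struct exception *exc)
{
	exc->retry = 0;
	if (err == ERR_DELAY) {
		sleep(1);       /* back off, then try again */
		exc->retry = 1;
		return 0;
	}
	return err;
}

static int do_getdeviceinfo(void)
{
	static int calls;
	return ++calls < 3 ? ERR_DELAY : 0; /* fail twice, then succeed */
}

int main(void)
{
	struct exception exc = { 0 };
	int err;

	do {
		err = handle_exception(do_getdeviceinfo(), &exc);
	} while (exc.retry);
	printf("final status %d\n", err);   /* 0 */
	return 0;
}
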
5336#endif /* CONFIG_NFS_V4_1 */ 5464#endif /* CONFIG_NFS_V4_1 */
5337 5465
5338struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5466struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5421,9 +5549,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
5421 .permission = nfs_permission, 5549 .permission = nfs_permission,
5422 .getattr = nfs_getattr, 5550 .getattr = nfs_getattr,
5423 .setattr = nfs_setattr, 5551 .setattr = nfs_setattr,
5424 .getxattr = nfs4_getxattr, 5552 .getxattr = generic_getxattr,
5425 .setxattr = nfs4_setxattr, 5553 .setxattr = generic_setxattr,
5426 .listxattr = nfs4_listxattr, 5554 .listxattr = generic_listxattr,
5555 .removexattr = generic_removexattr,
5427}; 5556};
5428 5557
5429const struct nfs_rpc_ops nfs_v4_clientops = { 5558const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5443,6 +5572,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5443 .unlink_setup = nfs4_proc_unlink_setup, 5572 .unlink_setup = nfs4_proc_unlink_setup,
5444 .unlink_done = nfs4_proc_unlink_done, 5573 .unlink_done = nfs4_proc_unlink_done,
5445 .rename = nfs4_proc_rename, 5574 .rename = nfs4_proc_rename,
5575 .rename_setup = nfs4_proc_rename_setup,
5576 .rename_done = nfs4_proc_rename_done,
5446 .link = nfs4_proc_link, 5577 .link = nfs4_proc_link,
5447 .symlink = nfs4_proc_symlink, 5578 .symlink = nfs4_proc_symlink,
5448 .mkdir = nfs4_proc_mkdir, 5579 .mkdir = nfs4_proc_mkdir,
@@ -5463,6 +5594,19 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5463 .lock = nfs4_proc_lock, 5594 .lock = nfs4_proc_lock,
5464 .clear_acl_cache = nfs4_zap_acl_attr, 5595 .clear_acl_cache = nfs4_zap_acl_attr,
5465 .close_context = nfs4_close_context, 5596 .close_context = nfs4_close_context,
5597 .open_context = nfs4_atomic_open,
5598};
5599
5600static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
5601 .prefix = XATTR_NAME_NFSV4_ACL,
5602 .list = nfs4_xattr_list_nfs4_acl,
5603 .get = nfs4_xattr_get_nfs4_acl,
5604 .set = nfs4_xattr_set_nfs4_acl,
5605};
5606
5607const struct xattr_handler *nfs4_xattr_handlers[] = {
5608 &nfs4_xattr_nfs4_acl_handler,
5609 NULL
5466}; 5610};
5467 5611
5468/* 5612/*
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af13..402143d75fc5 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
63 63
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 /* Are there any active superblocks? */ 66
67 if (list_empty(&clp->cl_superblocks)) 67 rcu_read_lock();
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
68 goto out; 70 goto out;
71 }
72 rcu_read_unlock();
73
69 spin_lock(&clp->cl_lock); 74 spin_lock(&clp->cl_lock);
70 lease = clp->cl_lease_time; 75 lease = clp->cl_lease_time;
71 last = clp->cl_last_renewal; 76 last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
75 cred = ops->get_state_renewal_cred_locked(clp); 80 cred = ops->get_state_renewal_cred_locked(clp);
76 spin_unlock(&clp->cl_lock); 81 spin_unlock(&clp->cl_lock);
77 if (cred == NULL) { 82 if (cred == NULL) {
78 if (list_empty(&clp->cl_delegations)) { 83 if (!nfs_delegations_present(clp)) {
79 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 84 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
80 goto out; 85 goto out;
81 } 86 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..e6742b57a04c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
40 40
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/smp_lock.h> 43#include <linux/fs.h>
44#include <linux/nfs_fs.h> 44#include <linux/nfs_fs.h>
45#include <linux/nfs_idmap.h> 45#include <linux/nfs_idmap.h>
46#include <linux/kthread.h> 46#include <linux/kthread.h>
47#include <linux/module.h> 47#include <linux/module.h>
48#include <linux/random.h> 48#include <linux/random.h>
49#include <linux/ratelimit.h>
49#include <linux/workqueue.h> 50#include <linux/workqueue.h>
50#include <linux/bitops.h> 51#include <linux/bitops.h>
51 52
@@ -53,6 +54,7 @@
53#include "callback.h" 54#include "callback.h"
54#include "delegation.h" 55#include "delegation.h"
55#include "internal.h" 56#include "internal.h"
57#include "pnfs.h"
56 58
57#define OPENOWNER_POOL_SIZE 8 59#define OPENOWNER_POOL_SIZE 8
58 60
@@ -103,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
103 put_rpccred(cred); 105 put_rpccred(cred);
104} 106}
105 107
106struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) 108static struct rpc_cred *
109nfs4_get_renew_cred_server_locked(struct nfs_server *server)
107{ 110{
111 struct rpc_cred *cred = NULL;
108 struct nfs4_state_owner *sp; 112 struct nfs4_state_owner *sp;
109 struct rb_node *pos; 113 struct rb_node *pos;
110 struct rpc_cred *cred = NULL;
111 114
112 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 115 for (pos = rb_first(&server->state_owners);
113 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 116 pos != NULL;
117 pos = rb_next(pos)) {
118 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
114 if (list_empty(&sp->so_states)) 119 if (list_empty(&sp->so_states))
115 continue; 120 continue;
116 cred = get_rpccred(sp->so_cred); 121 cred = get_rpccred(sp->so_cred);
@@ -119,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
119 return cred; 124 return cred;
120} 125}
121 126
127/**
128 * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
129 * @clp: client state handle
130 *
131 * Returns an rpc_cred with reference count bumped, or NULL.
132 * Caller must hold clp->cl_lock.
133 */
134struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
135{
136 struct rpc_cred *cred = NULL;
137 struct nfs_server *server;
138
139 rcu_read_lock();
140 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
141 cred = nfs4_get_renew_cred_server_locked(server);
142 if (cred != NULL)
143 break;
144 }
145 rcu_read_unlock();
146 return cred;
147}
148
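/*
 * Editor's sketch (illustration only, not part of this patch): the new
 * per-server walk in nfs4_get_renew_cred_locked(). State owners now live
 * in one tree per nfs_server, so the client iterates its servers (under
 * RCU in the kernel) and takes the first owner that still has open
 * state. Plain arrays stand in for the RCU list and the rb-tree.
 */
#include <stddef.h>
#include <stdio.h>

struct owner { const char *cred; int has_open_state; };
struct server { struct owner owners[4]; size_t nr; };

static const char *renew_cred_for(const struct server *s)
{
	for (size_t i = 0; i < s->nr; i++)
		if (s->owners[i].has_open_state)
			return s->owners[i].cred;
	return NULL;
}

int main(void)
{
	struct server servers[2] = {
		{ { { "credA", 0 } }, 1 },
		{ { { "credB", 1 } }, 1 },
	};

	for (size_t i = 0; i < 2; i++) {
		const char *cred = renew_cred_for(&servers[i]);
		if (cred) {
			printf("renew with %s\n", cred); /* credB */
			break;
		}
	}
	return 0;
}
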
122#if defined(CONFIG_NFS_V4_1) 149#if defined(CONFIG_NFS_V4_1)
123 150
124static int nfs41_setup_state_renewal(struct nfs_client *clp) 151static int nfs41_setup_state_renewal(struct nfs_client *clp)
@@ -140,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
140 return status; 167 return status;
141} 168}
142 169
170/*
171 * The back channel returns NFS4ERR_DELAY for new requests while
172 * NFS4_SESSION_DRAINING is set, so there is no work to be done once
173 * draining ends.
174 */
143static void nfs4_end_drain_session(struct nfs_client *clp) 175static void nfs4_end_drain_session(struct nfs_client *clp)
144{ 176{
145 struct nfs4_session *ses = clp->cl_session; 177 struct nfs4_session *ses = clp->cl_session;
@@ -163,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
163 } 195 }
164} 196}
165 197
166static int nfs4_begin_drain_session(struct nfs_client *clp) 198static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
167{ 199{
168 struct nfs4_session *ses = clp->cl_session;
169 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
170
171 spin_lock(&tbl->slot_tbl_lock); 200 spin_lock(&tbl->slot_tbl_lock);
172 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
173 if (tbl->highest_used_slotid != -1) { 201 if (tbl->highest_used_slotid != -1) {
174 INIT_COMPLETION(ses->complete); 202 INIT_COMPLETION(tbl->complete);
175 spin_unlock(&tbl->slot_tbl_lock); 203 spin_unlock(&tbl->slot_tbl_lock);
176 return wait_for_completion_interruptible(&ses->complete); 204 return wait_for_completion_interruptible(&tbl->complete);
177 } 205 }
178 spin_unlock(&tbl->slot_tbl_lock); 206 spin_unlock(&tbl->slot_tbl_lock);
179 return 0; 207 return 0;
180} 208}
181 209
210static int nfs4_begin_drain_session(struct nfs_client *clp)
211{
212 struct nfs4_session *ses = clp->cl_session;
213 int ret = 0;
214
215 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
216 /* back channel */
217 ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
218 if (ret)
219 return ret;
220 /* fore channel */
221 return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
222}
223
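/*
 * Editor's sketch (illustration only, not part of this patch): the
 * per-slot-table draining introduced above. The drainer waits on a
 * completion that the last slot user signals once highest_used_slotid
 * falls back to -1, draining the back channel before the fore channel.
 * A condition variable plays the completion's role. Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

struct slot_table {
	pthread_mutex_t lock;
	pthread_cond_t complete;
	int highest_used_slotid;
};

static void drain(struct slot_table *tbl)
{
	pthread_mutex_lock(&tbl->lock);
	while (tbl->highest_used_slotid != -1)
		pthread_cond_wait(&tbl->complete, &tbl->lock);
	pthread_mutex_unlock(&tbl->lock);
}

static void release_last_slot(struct slot_table *tbl)
{
	pthread_mutex_lock(&tbl->lock);
	tbl->highest_used_slotid = -1;
	pthread_cond_signal(&tbl->complete); /* complete(&tbl->complete) */
	pthread_mutex_unlock(&tbl->lock);
}

int main(void)
{
	struct slot_table bc = { PTHREAD_MUTEX_INITIALIZER,
				 PTHREAD_COND_INITIALIZER, -1 };
	struct slot_table fc = { PTHREAD_MUTEX_INITIALIZER,
				 PTHREAD_COND_INITIALIZER, -1 };

	release_last_slot(&bc);
	release_last_slot(&fc);
	drain(&bc);                /* back channel first, as above */
	drain(&fc);
	puts("session drained");
	return 0;
}
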
182int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 224int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
183{ 225{
184 int status; 226 int status;
@@ -208,28 +250,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
208 250
209#endif /* CONFIG_NFS_V4_1 */ 251#endif /* CONFIG_NFS_V4_1 */
210 252
211struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 253static struct rpc_cred *
254nfs4_get_setclientid_cred_server(struct nfs_server *server)
212{ 255{
256 struct nfs_client *clp = server->nfs_client;
257 struct rpc_cred *cred = NULL;
213 struct nfs4_state_owner *sp; 258 struct nfs4_state_owner *sp;
214 struct rb_node *pos; 259 struct rb_node *pos;
260
261 spin_lock(&clp->cl_lock);
262 pos = rb_first(&server->state_owners);
263 if (pos != NULL) {
264 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
265 cred = get_rpccred(sp->so_cred);
266 }
267 spin_unlock(&clp->cl_lock);
268 return cred;
269}
270
271/**
272 * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
273 * @clp: client state handle
274 *
275 * Returns an rpc_cred with reference count bumped, or NULL.
276 */
277struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
278{
279 struct nfs_server *server;
215 struct rpc_cred *cred; 280 struct rpc_cred *cred;
216 281
217 spin_lock(&clp->cl_lock); 282 spin_lock(&clp->cl_lock);
218 cred = nfs4_get_machine_cred_locked(clp); 283 cred = nfs4_get_machine_cred_locked(clp);
284 spin_unlock(&clp->cl_lock);
219 if (cred != NULL) 285 if (cred != NULL)
220 goto out; 286 goto out;
221 pos = rb_first(&clp->cl_state_owners); 287
222 if (pos != NULL) { 288 rcu_read_lock();
223 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 289 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
224 cred = get_rpccred(sp->so_cred); 290 cred = nfs4_get_setclientid_cred_server(server);
291 if (cred != NULL)
292 break;
225 } 293 }
294 rcu_read_unlock();
295
226out: 296out:
227 spin_unlock(&clp->cl_lock);
228 return cred; 297 return cred;
229} 298}
230 299
231static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, 300static void nfs_alloc_unique_id_locked(struct rb_root *root,
232 __u64 minval, int maxbits) 301 struct nfs_unique_id *new,
302 __u64 minval, int maxbits)
233{ 303{
234 struct rb_node **p, *parent; 304 struct rb_node **p, *parent;
235 struct nfs_unique_id *pos; 305 struct nfs_unique_id *pos;
@@ -284,16 +354,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
284} 354}
285 355
286static struct nfs4_state_owner * 356static struct nfs4_state_owner *
287nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) 357nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
288{ 358{
289 struct nfs_client *clp = server->nfs_client; 359 struct rb_node **p = &server->state_owners.rb_node,
290 struct rb_node **p = &clp->cl_state_owners.rb_node,
291 *parent = NULL; 360 *parent = NULL;
292 struct nfs4_state_owner *sp, *res = NULL; 361 struct nfs4_state_owner *sp, *res = NULL;
293 362
294 while (*p != NULL) { 363 while (*p != NULL) {
295 parent = *p; 364 parent = *p;
296 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 365 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
297 366
298 if (server < sp->so_server) { 367 if (server < sp->so_server) {
299 p = &parent->rb_left; 368 p = &parent->rb_left;
@@ -317,24 +386,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
317} 386}
318 387
319static struct nfs4_state_owner * 388static struct nfs4_state_owner *
320nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) 389nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
321{ 390{
322 struct rb_node **p = &clp->cl_state_owners.rb_node, 391 struct nfs_server *server = new->so_server;
392 struct rb_node **p = &server->state_owners.rb_node,
323 *parent = NULL; 393 *parent = NULL;
324 struct nfs4_state_owner *sp; 394 struct nfs4_state_owner *sp;
325 395
326 while (*p != NULL) { 396 while (*p != NULL) {
327 parent = *p; 397 parent = *p;
328 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 398 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
329 399
330 if (new->so_server < sp->so_server) {
331 p = &parent->rb_left;
332 continue;
333 }
334 if (new->so_server > sp->so_server) {
335 p = &parent->rb_right;
336 continue;
337 }
338 if (new->so_cred < sp->so_cred) 400 if (new->so_cred < sp->so_cred)
339 p = &parent->rb_left; 401 p = &parent->rb_left;
340 else if (new->so_cred > sp->so_cred) 402 else if (new->so_cred > sp->so_cred)
@@ -344,18 +406,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
344 return sp; 406 return sp;
345 } 407 }
346 } 408 }
347 nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); 409 nfs_alloc_unique_id_locked(&server->openowner_id,
348 rb_link_node(&new->so_client_node, parent, p); 410 &new->so_owner_id, 1, 64);
349 rb_insert_color(&new->so_client_node, &clp->cl_state_owners); 411 rb_link_node(&new->so_server_node, parent, p);
412 rb_insert_color(&new->so_server_node, &server->state_owners);
350 return new; 413 return new;
351} 414}
352 415
353static void 416static void
354nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) 417nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
355{ 418{
356 if (!RB_EMPTY_NODE(&sp->so_client_node)) 419 struct nfs_server *server = sp->so_server;
357 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 420
358 nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); 421 if (!RB_EMPTY_NODE(&sp->so_server_node))
422 rb_erase(&sp->so_server_node, &server->state_owners);
423 nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
359} 424}
360 425
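/*
 * Editor's sketch (illustration only, not part of this patch): the
 * insert-or-return-existing walk of nfs4_insert_state_owner_locked().
 * Descend the tree comparing the key (here the credential's identity),
 * return the existing node on a match, otherwise link the new node where
 * the walk fell off the tree. A plain unbalanced BST stands in for the
 * kernel rb-tree (rebalancing omitted).
 */
#include <stdint.h>
#include <stdio.h>

struct node {
	uintptr_t cred;            /* key: credential identity */
	struct node *left, *right;
};

static struct node *insert(struct node **root, struct node *new)
{
	struct node **p = root;

	while (*p) {
		if (new->cred < (*p)->cred)
			p = &(*p)->left;
		else if (new->cred > (*p)->cred)
			p = &(*p)->right;
		else
			return *p; /* already present: reuse it */
	}
	*p = new;                  /* rb_link_node() analogue */
	return new;
}

int main(void)
{
	struct node *root = NULL;
	struct node a = { 1, NULL, NULL }, b = { 1, NULL, NULL };

	printf("%s\n", insert(&root, &a) == &a ? "inserted" : "existing");
	printf("%s\n", insert(&root, &b) == &b ? "inserted" : "existing");
	return 0;
}
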
361/* 426/*
@@ -384,23 +449,32 @@ nfs4_alloc_state_owner(void)
384static void 449static void
385nfs4_drop_state_owner(struct nfs4_state_owner *sp) 450nfs4_drop_state_owner(struct nfs4_state_owner *sp)
386{ 451{
387 if (!RB_EMPTY_NODE(&sp->so_client_node)) { 452 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
388 struct nfs_client *clp = sp->so_server->nfs_client; 453 struct nfs_server *server = sp->so_server;
454 struct nfs_client *clp = server->nfs_client;
389 455
390 spin_lock(&clp->cl_lock); 456 spin_lock(&clp->cl_lock);
391 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 457 rb_erase(&sp->so_server_node, &server->state_owners);
392 RB_CLEAR_NODE(&sp->so_client_node); 458 RB_CLEAR_NODE(&sp->so_server_node);
393 spin_unlock(&clp->cl_lock); 459 spin_unlock(&clp->cl_lock);
394 } 460 }
395} 461}
396 462
397struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 463/**
464 * nfs4_get_state_owner - Look up a state owner given a credential
465 * @server: nfs_server to search
466 * @cred: RPC credential to match
467 *
468 * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
469 */
470struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
471 struct rpc_cred *cred)
398{ 472{
399 struct nfs_client *clp = server->nfs_client; 473 struct nfs_client *clp = server->nfs_client;
400 struct nfs4_state_owner *sp, *new; 474 struct nfs4_state_owner *sp, *new;
401 475
402 spin_lock(&clp->cl_lock); 476 spin_lock(&clp->cl_lock);
403 sp = nfs4_find_state_owner(server, cred); 477 sp = nfs4_find_state_owner_locked(server, cred);
404 spin_unlock(&clp->cl_lock); 478 spin_unlock(&clp->cl_lock);
405 if (sp != NULL) 479 if (sp != NULL)
406 return sp; 480 return sp;
@@ -410,7 +484,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
410 new->so_server = server; 484 new->so_server = server;
411 new->so_cred = cred; 485 new->so_cred = cred;
412 spin_lock(&clp->cl_lock); 486 spin_lock(&clp->cl_lock);
413 sp = nfs4_insert_state_owner(clp, new); 487 sp = nfs4_insert_state_owner_locked(new);
414 spin_unlock(&clp->cl_lock); 488 spin_unlock(&clp->cl_lock);
415 if (sp == new) 489 if (sp == new)
416 get_rpccred(cred); 490 get_rpccred(cred);
@@ -421,6 +495,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
421 return sp; 495 return sp;
422} 496}
423 497
498/**
499 * nfs4_put_state_owner - Release a nfs4_state_owner
500 * @sp: state owner data to release
501 *
502 */
424void nfs4_put_state_owner(struct nfs4_state_owner *sp) 503void nfs4_put_state_owner(struct nfs4_state_owner *sp)
425{ 504{
426 struct nfs_client *clp = sp->so_server->nfs_client; 505 struct nfs_client *clp = sp->so_server->nfs_client;
@@ -428,7 +507,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
428 507
429 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 508 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
430 return; 509 return;
431 nfs4_remove_state_owner(clp, sp); 510 nfs4_remove_state_owner_locked(sp);
432 spin_unlock(&clp->cl_lock); 511 spin_unlock(&clp->cl_lock);
433 rpc_destroy_wait_queue(&sp->so_sequence.wait); 512 rpc_destroy_wait_queue(&sp->so_sequence.wait);
434 put_rpccred(cred); 513 put_rpccred(cred);
@@ -583,8 +662,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
583 if (!call_close) { 662 if (!call_close) {
584 nfs4_put_open_state(state); 663 nfs4_put_open_state(state);
585 nfs4_put_state_owner(owner); 664 nfs4_put_state_owner(owner);
586 } else 665 } else {
587 nfs4_do_close(path, state, gfp_mask, wait); 666 bool roc = pnfs_roc(state->inode);
667
668 nfs4_do_close(path, state, gfp_mask, wait, roc);
669 }
588} 670}
589 671
590void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 672void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -631,7 +713,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
631static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 713static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
632{ 714{
633 struct nfs4_lock_state *lsp; 715 struct nfs4_lock_state *lsp;
634 struct nfs_client *clp = state->owner->so_server->nfs_client; 716 struct nfs_server *server = state->owner->so_server;
717 struct nfs_client *clp = server->nfs_client;
635 718
636 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 719 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
637 if (lsp == NULL) 720 if (lsp == NULL)
@@ -655,7 +738,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
655 return NULL; 738 return NULL;
656 } 739 }
657 spin_lock(&clp->cl_lock); 740 spin_lock(&clp->cl_lock);
658 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 741 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
659 spin_unlock(&clp->cl_lock); 742 spin_unlock(&clp->cl_lock);
660 INIT_LIST_HEAD(&lsp->ls_locks); 743 INIT_LIST_HEAD(&lsp->ls_locks);
661 return lsp; 744 return lsp;
@@ -663,10 +746,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
663 746
664static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 747static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
665{ 748{
666 struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; 749 struct nfs_server *server = lsp->ls_state->owner->so_server;
750 struct nfs_client *clp = server->nfs_client;
667 751
668 spin_lock(&clp->cl_lock); 752 spin_lock(&clp->cl_lock);
669 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); 753 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
670 spin_unlock(&clp->cl_lock); 754 spin_unlock(&clp->cl_lock);
671 rpc_destroy_wait_queue(&lsp->ls_sequence.wait); 755 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
672 kfree(lsp); 756 kfree(lsp);
@@ -970,13 +1054,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
970 /* Guard against delegation returns and new lock/unlock calls */ 1054 /* Guard against delegation returns and new lock/unlock calls */
971 down_write(&nfsi->rwsem); 1055 down_write(&nfsi->rwsem);
972 /* Protect inode->i_flock using the BKL */ 1056 /* Protect inode->i_flock using the BKL */
973 lock_kernel(); 1057 lock_flocks();
974 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1058 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
975 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 1059 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
976 continue; 1060 continue;
977 if (nfs_file_open_context(fl->fl_file)->state != state) 1061 if (nfs_file_open_context(fl->fl_file)->state != state)
978 continue; 1062 continue;
979 unlock_kernel(); 1063 unlock_flocks();
980 status = ops->recover_lock(state, fl); 1064 status = ops->recover_lock(state, fl);
981 switch (status) { 1065 switch (status) {
982 case 0: 1066 case 0:
@@ -1003,9 +1087,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1003 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1087 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1004 status = 0; 1088 status = 0;
1005 } 1089 }
1006 lock_kernel(); 1090 lock_flocks();
1007 } 1091 }
1008 unlock_kernel(); 1092 unlock_flocks();
1009out: 1093out:
1010 up_write(&nfsi->rwsem); 1094 up_write(&nfsi->rwsem);
1011 return status; 1095 return status;
@@ -1063,6 +1147,14 @@ restart:
1063 /* Mark the file as being 'closed' */ 1147 /* Mark the file as being 'closed' */
1064 state->state = 0; 1148 state->state = 0;
1065 break; 1149 break;
1150 case -EKEYEXPIRED:
1151 /*
1152 * User RPCSEC_GSS context has expired.
1153 * We cannot recover this stateid now, so
1154 * skip it and allow the recovery thread to
1155 * proceed.
1156 */
1157 break;
1066 case -NFS4ERR_ADMIN_REVOKED: 1158 case -NFS4ERR_ADMIN_REVOKED:
1067 case -NFS4ERR_STALE_STATEID: 1159 case -NFS4ERR_STALE_STATEID:
1068 case -NFS4ERR_BAD_STATEID: 1160 case -NFS4ERR_BAD_STATEID:
@@ -1104,15 +1196,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
1104 } 1196 }
1105} 1197}
1106 1198
1107static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) 1199static void nfs4_reset_seqids(struct nfs_server *server,
1200 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1108{ 1201{
1202 struct nfs_client *clp = server->nfs_client;
1109 struct nfs4_state_owner *sp; 1203 struct nfs4_state_owner *sp;
1110 struct rb_node *pos; 1204 struct rb_node *pos;
1111 struct nfs4_state *state; 1205 struct nfs4_state *state;
1112 1206
1113 /* Reset all sequence ids to zero */ 1207 spin_lock(&clp->cl_lock);
1114 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1208 for (pos = rb_first(&server->state_owners);
1115 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1209 pos != NULL;
1210 pos = rb_next(pos)) {
1211 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1116 sp->so_seqid.flags = 0; 1212 sp->so_seqid.flags = 0;
1117 spin_lock(&sp->so_lock); 1213 spin_lock(&sp->so_lock);
1118 list_for_each_entry(state, &sp->so_states, open_states) { 1214 list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1121,6 +1217,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
1121 } 1217 }
1122 spin_unlock(&sp->so_lock); 1218 spin_unlock(&sp->so_lock);
1123 } 1219 }
1220 spin_unlock(&clp->cl_lock);
1221}
1222
1223static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
1224 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1225{
1226 struct nfs_server *server;
1227
1228 rcu_read_lock();
1229 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1230 nfs4_reset_seqids(server, mark_reclaim);
1231 rcu_read_unlock();
1124} 1232}
1125 1233
1126static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) 1234static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1138,29 +1246,51 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
1138 (void)ops->reclaim_complete(clp); 1246 (void)ops->reclaim_complete(clp);
1139} 1247}
1140 1248
1141static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1249static void nfs4_clear_reclaim_server(struct nfs_server *server)
1142{ 1250{
1251 struct nfs_client *clp = server->nfs_client;
1143 struct nfs4_state_owner *sp; 1252 struct nfs4_state_owner *sp;
1144 struct rb_node *pos; 1253 struct rb_node *pos;
1145 struct nfs4_state *state; 1254 struct nfs4_state *state;
1146 1255
1147 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1256 spin_lock(&clp->cl_lock);
1148 return; 1257 for (pos = rb_first(&server->state_owners);
1149 1258 pos != NULL;
1150 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); 1259 pos = rb_next(pos)) {
1151 1260 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1152 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1153 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
1154 spin_lock(&sp->so_lock); 1261 spin_lock(&sp->so_lock);
1155 list_for_each_entry(state, &sp->so_states, open_states) { 1262 list_for_each_entry(state, &sp->so_states, open_states) {
1156 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) 1263 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
1264 &state->flags))
1157 continue; 1265 continue;
1158 nfs4_state_mark_reclaim_nograce(clp, state); 1266 nfs4_state_mark_reclaim_nograce(clp, state);
1159 } 1267 }
1160 spin_unlock(&sp->so_lock); 1268 spin_unlock(&sp->so_lock);
1161 } 1269 }
1270 spin_unlock(&clp->cl_lock);
1271}
1272
1273static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1274{
1275 struct nfs_server *server;
1276
1277 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1278 return 0;
1279
1280 rcu_read_lock();
1281 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1282 nfs4_clear_reclaim_server(server);
1283 rcu_read_unlock();
1162 1284
1163 nfs_delegation_reap_unclaimed(clp); 1285 nfs_delegation_reap_unclaimed(clp);
1286 return 1;
1287}
1288
1289static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1290{
1291 if (!nfs4_state_clear_reclaim_reboot(clp))
1292 return;
1293 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
1164} 1294}
1165 1295
1166static void nfs_delegation_clear_all(struct nfs_client *clp) 1296static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1175,6 +1305,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1175 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1305 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1176} 1306}
1177 1307
1308static void nfs4_warn_keyexpired(const char *s)
1309{
1310 printk_ratelimited(KERN_WARNING "Error: state manager"
1311 " encountered RPCSEC_GSS session"
1312 " expired against NFSv4 server %s.\n",
1313 s);
1314}
1315
1178static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) 1316static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1179{ 1317{
1180 switch (error) { 1318 switch (error) {
@@ -1187,7 +1325,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1187 case -NFS4ERR_STALE_CLIENTID: 1325 case -NFS4ERR_STALE_CLIENTID:
1188 case -NFS4ERR_LEASE_MOVED: 1326 case -NFS4ERR_LEASE_MOVED:
1189 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1327 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1190 nfs4_state_end_reclaim_reboot(clp); 1328 nfs4_state_clear_reclaim_reboot(clp);
1191 nfs4_state_start_reclaim_reboot(clp); 1329 nfs4_state_start_reclaim_reboot(clp);
1192 break; 1330 break;
1193 case -NFS4ERR_EXPIRED: 1331 case -NFS4ERR_EXPIRED:
@@ -1204,33 +1342,50 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1204 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); 1342 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1205 /* Zero session reset errors */ 1343 /* Zero session reset errors */
1206 return 0; 1344 return 0;
1345 case -EKEYEXPIRED:
1346 /* Nothing we can do */
1347 nfs4_warn_keyexpired(clp->cl_hostname);
1348 return 0;
1207 } 1349 }
1208 return error; 1350 return error;
1209} 1351}
1210 1352
1211static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1353static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1212{ 1354{
1355 struct nfs4_state_owner *sp;
1356 struct nfs_server *server;
1213 struct rb_node *pos; 1357 struct rb_node *pos;
1214 int status = 0; 1358 int status = 0;
1215 1359
1216restart: 1360restart:
1217 spin_lock(&clp->cl_lock); 1361 rcu_read_lock();
1218 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1362 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1219 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1363 spin_lock(&clp->cl_lock);
1220 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) 1364 for (pos = rb_first(&server->state_owners);
1221 continue; 1365 pos != NULL;
1222 atomic_inc(&sp->so_count); 1366 pos = rb_next(pos)) {
1223 spin_unlock(&clp->cl_lock); 1367 sp = rb_entry(pos,
1224 status = nfs4_reclaim_open_state(sp, ops); 1368 struct nfs4_state_owner, so_server_node);
1225 if (status < 0) { 1369 if (!test_and_clear_bit(ops->owner_flag_bit,
1226 set_bit(ops->owner_flag_bit, &sp->so_flags); 1370 &sp->so_flags))
1371 continue;
1372 atomic_inc(&sp->so_count);
1373 spin_unlock(&clp->cl_lock);
1374 rcu_read_unlock();
1375
1376 status = nfs4_reclaim_open_state(sp, ops);
1377 if (status < 0) {
1378 set_bit(ops->owner_flag_bit, &sp->so_flags);
1379 nfs4_put_state_owner(sp);
1380 return nfs4_recovery_handle_error(clp, status);
1381 }
1382
1227 nfs4_put_state_owner(sp); 1383 nfs4_put_state_owner(sp);
1228 return nfs4_recovery_handle_error(clp, status); 1384 goto restart;
1229 } 1385 }
1230 nfs4_put_state_owner(sp); 1386 spin_unlock(&clp->cl_lock);
1231 goto restart;
1232 } 1387 }
1233 spin_unlock(&clp->cl_lock); 1388 rcu_read_unlock();
1234 return status; 1389 return status;
1235} 1390}
1236 1391
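/*
 * Editor's sketch (illustration only, not part of this patch): the
 * lock-drop-and-restart shape of nfs4_do_reclaim(). Each owner carries a
 * "needs reclaim" flag that is tested and cleared while the lock is
 * held; the lock is then dropped for the slow reclaim work and the scan
 * restarts from the top, so already-serviced owners are skipped.
 * Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

#define NR_OWNERS 3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int needs_reclaim[NR_OWNERS] = { 1, 1, 1 };

static void reclaim(int i)
{
	printf("reclaiming owner %d\n", i); /* nfs4_reclaim_open_state() */
}

int main(void)
{
restart:
	pthread_mutex_lock(&lock);
	for (int i = 0; i < NR_OWNERS; i++) {
		if (!needs_reclaim[i])
			continue;
		needs_reclaim[i] = 0;      /* test_and_clear under lock */
		pthread_mutex_unlock(&lock);
		reclaim(i);                /* slow work, lock dropped */
		goto restart;              /* rescan: tree may have changed */
	}
	pthread_mutex_unlock(&lock);
	return 0;
}
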
@@ -1414,9 +1569,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1414 case -NFS4ERR_DELAY: 1569 case -NFS4ERR_DELAY:
1415 case -NFS4ERR_CLID_INUSE: 1570 case -NFS4ERR_CLID_INUSE:
1416 case -EAGAIN: 1571 case -EAGAIN:
1417 case -EKEYEXPIRED:
1418 break; 1572 break;
1419 1573
1574 case -EKEYEXPIRED:
1575 nfs4_warn_keyexpired(clp->cl_hostname);
1420 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1576 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1421 * in nfs4_exchange_id */ 1577 * in nfs4_exchange_id */
1422 default: 1578 default:
@@ -1447,6 +1603,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1447 } 1603 }
1448 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1604 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1449 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); 1605 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
1606 pnfs_destroy_all_layouts(clp);
1450 } 1607 }
1451 1608
1452 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { 1609 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..4e2c168b6ee9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
52#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
53#include "nfs4_fs.h" 53#include "nfs4_fs.h"
54#include "internal.h" 54#include "internal.h"
55#include "pnfs.h"
55 56
56#define NFSDBG_FACILITY NFSDBG_XDR 57#define NFSDBG_FACILITY NFSDBG_XDR
57 58
@@ -70,8 +71,8 @@ static int nfs4_stat_to_errno(int);
70/* lock,open owner id: 71/* lock,open owner id:
71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
72 */ 73 */
73#define open_owner_id_maxsz (1 + 4) 74#define open_owner_id_maxsz (1 + 1 + 4)
74#define lock_owner_id_maxsz (1 + 4) 75#define lock_owner_id_maxsz (1 + 1 + 4)
75#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
76#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
77#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
310 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 311 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
311#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 312#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
312#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 313#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
314#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
315 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
316#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
317 1 /* layout type */ + \
318 1 /* opaque devaddr4 length */ + \
319 /* devaddr4 payload is read into page */ \
320 1 /* notification bitmap length */ + \
321 1 /* notification bitmap */)
322#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
323 encode_stateid_maxsz)
324#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
325 decode_stateid_maxsz + \
326 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
313#else /* CONFIG_NFS_V4_1 */ 327#else /* CONFIG_NFS_V4_1 */
314#define encode_sequence_maxsz 0 328#define encode_sequence_maxsz 0
315#define decode_sequence_maxsz 0 329#define decode_sequence_maxsz 0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
699#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 713#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
700 decode_sequence_maxsz + \ 714 decode_sequence_maxsz + \
701 decode_reclaim_complete_maxsz) 715 decode_reclaim_complete_maxsz)
716#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
717 encode_sequence_maxsz +\
718 encode_getdeviceinfo_maxsz)
719#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
720 decode_sequence_maxsz + \
721 decode_getdeviceinfo_maxsz)
722#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
723 encode_sequence_maxsz + \
724 encode_putfh_maxsz + \
725 encode_layoutget_maxsz)
726#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
727 decode_sequence_maxsz + \
728 decode_putfh_maxsz + \
729 decode_layoutget_maxsz)
702 730
703const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 731const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
704 compound_encode_hdr_maxsz + 732 compound_encode_hdr_maxsz +
@@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
816 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
817 len += 4; 845 len += 4;
818 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
819 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); 847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
820 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
821 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
822 iap->ia_uid); 850 iap->ia_uid);
@@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
828 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
829 } 857 }
830 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
831 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); 859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
832 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
833 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
834 iap->ia_gid); 862 iap->ia_gid);
@@ -1060,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
1060{ 1088{
1061 __be32 *p; 1089 __be32 *p;
1062 1090
1063 p = reserve_space(xdr, 28); 1091 p = reserve_space(xdr, 32);
1064 p = xdr_encode_hyper(p, lowner->clientid); 1092 p = xdr_encode_hyper(p, lowner->clientid);
1065 *p++ = cpu_to_be32(16); 1093 *p++ = cpu_to_be32(20);
1066 p = xdr_encode_opaque_fixed(p, "lock id:", 8); 1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1095 *p++ = cpu_to_be32(lowner->s_dev);
1067 xdr_encode_hyper(p, lowner->id); 1096 xdr_encode_hyper(p, lowner->id);
1068} 1097}
1069 1098
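/*
 * Editor's sketch (illustration only, not part of this patch): why the
 * lock-owner encoding above grows from 28 to 32 reserved bytes. The
 * opaque owner expands from 16 to 20 bytes because a 32-bit device
 * number now precedes the 64-bit id. htonl()/htobe64() stand in for
 * cpu_to_be32() and xdr_encode_hyper(); all values are placeholders.
 */
#include <arpa/inet.h>
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char buf[32], *p = buf;
	uint64_t clientid = htobe64(0x1122334455667788ULL);
	uint32_t len = htonl(20), s_dev = htonl(42);
	uint64_t id = htobe64(7);

	memcpy(p, &clientid, 8);  p += 8;  /* 8-byte clientid */
	memcpy(p, &len, 4);       p += 4;  /* opaque length: now 20 */
	memcpy(p, "lock id:", 8); p += 8;  /* fixed tag */
	memcpy(p, &s_dev, 4);     p += 4;  /* new: device number */
	memcpy(p, &id, 8);        p += 8;  /* 64-bit owner id */
	printf("encoded %td bytes\n", p - buf); /* 32 */
	return 0;
}
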
@@ -1182,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1182 *p++ = cpu_to_be32(OP_OPEN); 1211 *p++ = cpu_to_be32(OP_OPEN);
1183 *p = cpu_to_be32(arg->seqid->sequence->counter); 1212 *p = cpu_to_be32(arg->seqid->sequence->counter);
1184 encode_share_access(xdr, arg->fmode); 1213 encode_share_access(xdr, arg->fmode);
1185 p = reserve_space(xdr, 28); 1214 p = reserve_space(xdr, 32);
1186 p = xdr_encode_hyper(p, arg->clientid); 1215 p = xdr_encode_hyper(p, arg->clientid);
1187 *p++ = cpu_to_be32(16); 1216 *p++ = cpu_to_be32(20);
1188 p = xdr_encode_opaque_fixed(p, "open id:", 8); 1217 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1218 *p++ = cpu_to_be32(arg->server->s_dev);
1189 xdr_encode_hyper(p, arg->id); 1219 xdr_encode_hyper(p, arg->id);
1190} 1220}
1191 1221
@@ -1385,24 +1415,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1385 1415
1386static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1416static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1387{ 1417{
1388 uint32_t attrs[2] = { 1418 uint32_t attrs[2] = {0, 0};
1389 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1419 uint32_t dircount = readdir->count >> 1;
1390 FATTR4_WORD1_MOUNTED_ON_FILEID,
1391 };
1392 __be32 *p; 1420 __be32 *p;
1393 1421
1422 if (readdir->plus) {
1423 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
1424 FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
1425 attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
1426 FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
1427 FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
1428 FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1429 dircount >>= 1;
1430 }
1431 attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
1432 attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
1433 /* Switch to mounted_on_fileid if the server supports it */
1434 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1435 attrs[0] &= ~FATTR4_WORD0_FILEID;
1436 else
1437 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1438
1394 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); 1439 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
1395 *p++ = cpu_to_be32(OP_READDIR); 1440 *p++ = cpu_to_be32(OP_READDIR);
1396 p = xdr_encode_hyper(p, readdir->cookie); 1441 p = xdr_encode_hyper(p, readdir->cookie);
1397 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); 1442 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
1398 *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ 1443 *p++ = cpu_to_be32(dircount);
1399 *p++ = cpu_to_be32(readdir->count); 1444 *p++ = cpu_to_be32(readdir->count);
1400 *p++ = cpu_to_be32(2); 1445 *p++ = cpu_to_be32(2);
1401 /* Switch to mounted_on_fileid if the server supports it */ 1446
1402 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1403 attrs[0] &= ~FATTR4_WORD0_FILEID;
1404 else
1405 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1406 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1447 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1407 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1448 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1408 hdr->nops++; 1449 hdr->nops++;
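
/*
 * Editor's sketch (illustration only, not part of this patch): the
 * readdirplus attribute selection above. Start from the base bitmap, OR
 * in the extra attributes only for a "plus" readdir, then AND against
 * what the server advertised so nothing unsupported is requested. Bit
 * values here are illustrative, not the FATTR4 constants.
 */
#include <stdint.h>
#include <stdio.h>

#define A_FILEID 0x01u  /* assumed bit */
#define A_TYPE   0x02u  /* assumed bit */
#define A_SIZE   0x04u  /* assumed bit */

int main(void)
{
	uint32_t attrs = A_FILEID;        /* always request the fileid */
	uint32_t server_mask = A_FILEID | A_TYPE;
	int plus = 1;

	if (plus)
		attrs |= A_TYPE | A_SIZE; /* extra readdirplus attrs */
	attrs &= server_mask;             /* drop unsupported bits */
	printf("request bitmap: 0x%x\n", attrs); /* 0x3 */
	return 0;
}
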
@@ -1471,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1471 hdr->replen += decode_restorefh_maxsz; 1512 hdr->replen += decode_restorefh_maxsz;
1472} 1513}
1473 1514
1474static int 1515static void
1475encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) 1516encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1476{ 1517{
1477 __be32 *p; 1518 __be32 *p;
@@ -1482,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1482 p = reserve_space(xdr, 2*4); 1523 p = reserve_space(xdr, 2*4);
1483 *p++ = cpu_to_be32(1); 1524 *p++ = cpu_to_be32(1);
1484 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1525 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1485 if (arg->acl_len % 4) 1526 BUG_ON(arg->acl_len % 4);
1486 return -EINVAL;
1487 p = reserve_space(xdr, 4); 1527 p = reserve_space(xdr, 4);
1488 *p = cpu_to_be32(arg->acl_len); 1528 *p = cpu_to_be32(arg->acl_len);
1489 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1529 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1490 hdr->nops++; 1530 hdr->nops++;
1491 hdr->replen += decode_setacl_maxsz; 1531 hdr->replen += decode_setacl_maxsz;
1492 return 0;
1493} 1532}
1494 1533
1495static void 1534static void
@@ -1726,6 +1765,55 @@ static void encode_sequence(struct xdr_stream *xdr,
1726#endif /* CONFIG_NFS_V4_1 */ 1765#endif /* CONFIG_NFS_V4_1 */
1727} 1766}
1728 1767
1768#ifdef CONFIG_NFS_V4_1
1769static void
1770encode_getdeviceinfo(struct xdr_stream *xdr,
1771 const struct nfs4_getdeviceinfo_args *args,
1772 struct compound_hdr *hdr)
1773{
1774 __be32 *p;
1775
1776 p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
1777 *p++ = cpu_to_be32(OP_GETDEVICEINFO);
1778 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1779 NFS4_DEVICEID4_SIZE);
1780 *p++ = cpu_to_be32(args->pdev->layout_type);
1781 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
1782 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1783 hdr->nops++;
1784 hdr->replen += decode_getdeviceinfo_maxsz;
1785}
1786
1787static void
1788encode_layoutget(struct xdr_stream *xdr,
1789 const struct nfs4_layoutget_args *args,
1790 struct compound_hdr *hdr)
1791{
1792 __be32 *p;
1793
1794 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
1795 *p++ = cpu_to_be32(OP_LAYOUTGET);
1796 *p++ = cpu_to_be32(0); /* Signal layout available */
1797 *p++ = cpu_to_be32(args->type);
1798 *p++ = cpu_to_be32(args->range.iomode);
1799 p = xdr_encode_hyper(p, args->range.offset);
1800 p = xdr_encode_hyper(p, args->range.length);
1801 p = xdr_encode_hyper(p, args->minlength);
1802 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1803 *p = cpu_to_be32(args->maxcount);
1804
1805 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
1806 __func__,
1807 args->type,
1808 args->range.iomode,
1809 (unsigned long)args->range.offset,
1810 (unsigned long)args->range.length,
1811 args->maxcount);
1812 hdr->nops++;
1813 hdr->replen += decode_layoutget_maxsz;
1814}
1815#endif /* CONFIG_NFS_V4_1 */
1816
1729/* 1817/*
1730 * END OF "GENERIC" ENCODE ROUTINES. 1818 * END OF "GENERIC" ENCODE ROUTINES.
1731 */ 1819 */
@@ -1742,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1742/* 1830/*
1743 * Encode an ACCESS request 1831 * Encode an ACCESS request
1744 */ 1832 */
1745static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) 1833static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
1834 const struct nfs4_accessargs *args)
1746{ 1835{
1747 struct xdr_stream xdr;
1748 struct compound_hdr hdr = { 1836 struct compound_hdr hdr = {
1749 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1837 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1750 }; 1838 };
1751 1839
1752 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1840 encode_compound_hdr(xdr, req, &hdr);
1753 encode_compound_hdr(&xdr, req, &hdr); 1841 encode_sequence(xdr, &args->seq_args, &hdr);
1754 encode_sequence(&xdr, &args->seq_args, &hdr); 1842 encode_putfh(xdr, args->fh, &hdr);
1755 encode_putfh(&xdr, args->fh, &hdr); 1843 encode_access(xdr, args->access, &hdr);
1756 encode_access(&xdr, args->access, &hdr); 1844 encode_getfattr(xdr, args->bitmask, &hdr);
1757 encode_getfattr(&xdr, args->bitmask, &hdr);
1758 encode_nops(&hdr); 1845 encode_nops(&hdr);
1759 return 0;
1760} 1846}
1761 1847
1762/* 1848/*
1763 * Encode LOOKUP request 1849 * Encode LOOKUP request
1764 */ 1850 */
1765static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) 1851static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
1852 const struct nfs4_lookup_arg *args)
1766{ 1853{
1767 struct xdr_stream xdr;
1768 struct compound_hdr hdr = { 1854 struct compound_hdr hdr = {
1769 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1855 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1770 }; 1856 };
1771 1857
1772 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1858 encode_compound_hdr(xdr, req, &hdr);
1773 encode_compound_hdr(&xdr, req, &hdr); 1859 encode_sequence(xdr, &args->seq_args, &hdr);
1774 encode_sequence(&xdr, &args->seq_args, &hdr); 1860 encode_putfh(xdr, args->dir_fh, &hdr);
1775 encode_putfh(&xdr, args->dir_fh, &hdr); 1861 encode_lookup(xdr, args->name, &hdr);
1776 encode_lookup(&xdr, args->name, &hdr); 1862 encode_getfh(xdr, &hdr);
1777 encode_getfh(&xdr, &hdr); 1863 encode_getfattr(xdr, args->bitmask, &hdr);
1778 encode_getfattr(&xdr, args->bitmask, &hdr);
1779 encode_nops(&hdr); 1864 encode_nops(&hdr);
1780 return 0;
1781} 1865}
1782 1866
1783/* 1867/*
1784 * Encode LOOKUP_ROOT request 1868 * Encode LOOKUP_ROOT request
1785 */ 1869 */
1786static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) 1870static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
1871 struct xdr_stream *xdr,
1872 const struct nfs4_lookup_root_arg *args)
1787{ 1873{
1788 struct xdr_stream xdr;
1789 struct compound_hdr hdr = { 1874 struct compound_hdr hdr = {
1790 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1875 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1791 }; 1876 };
1792 1877
1793 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1878 encode_compound_hdr(xdr, req, &hdr);
1794 encode_compound_hdr(&xdr, req, &hdr); 1879 encode_sequence(xdr, &args->seq_args, &hdr);
1795 encode_sequence(&xdr, &args->seq_args, &hdr); 1880 encode_putrootfh(xdr, &hdr);
1796 encode_putrootfh(&xdr, &hdr); 1881 encode_getfh(xdr, &hdr);
1797 encode_getfh(&xdr, &hdr); 1882 encode_getfattr(xdr, args->bitmask, &hdr);
1798 encode_getfattr(&xdr, args->bitmask, &hdr);
1799 encode_nops(&hdr); 1883 encode_nops(&hdr);
1800 return 0;
1801} 1884}
1802 1885
1803/* 1886/*
1804 * Encode REMOVE request 1887 * Encode REMOVE request
1805 */ 1888 */
1806static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) 1889static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
1890 const struct nfs_removeargs *args)
1807{ 1891{
1808 struct xdr_stream xdr;
1809 struct compound_hdr hdr = { 1892 struct compound_hdr hdr = {
1810 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1893 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1811 }; 1894 };
1812 1895
1813 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1896 encode_compound_hdr(xdr, req, &hdr);
1814 encode_compound_hdr(&xdr, req, &hdr); 1897 encode_sequence(xdr, &args->seq_args, &hdr);
1815 encode_sequence(&xdr, &args->seq_args, &hdr); 1898 encode_putfh(xdr, args->fh, &hdr);
1816 encode_putfh(&xdr, args->fh, &hdr); 1899 encode_remove(xdr, &args->name, &hdr);
1817 encode_remove(&xdr, &args->name, &hdr); 1900 encode_getfattr(xdr, args->bitmask, &hdr);
1818 encode_getfattr(&xdr, args->bitmask, &hdr);
1819 encode_nops(&hdr); 1901 encode_nops(&hdr);
1820 return 0;
1821} 1902}
1822 1903
1823/* 1904/*
1824 * Encode RENAME request 1905 * Encode RENAME request
1825 */ 1906 */
1826static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args) 1907static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
1908 const struct nfs_renameargs *args)
1827{ 1909{
1828 struct xdr_stream xdr;
1829 struct compound_hdr hdr = { 1910 struct compound_hdr hdr = {
1830 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1911 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1831 }; 1912 };
1832 1913
1833 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1914 encode_compound_hdr(xdr, req, &hdr);
1834 encode_compound_hdr(&xdr, req, &hdr); 1915 encode_sequence(xdr, &args->seq_args, &hdr);
1835 encode_sequence(&xdr, &args->seq_args, &hdr); 1916 encode_putfh(xdr, args->old_dir, &hdr);
1836 encode_putfh(&xdr, args->old_dir, &hdr); 1917 encode_savefh(xdr, &hdr);
1837 encode_savefh(&xdr, &hdr); 1918 encode_putfh(xdr, args->new_dir, &hdr);
1838 encode_putfh(&xdr, args->new_dir, &hdr); 1919 encode_rename(xdr, args->old_name, args->new_name, &hdr);
1839 encode_rename(&xdr, args->old_name, args->new_name, &hdr); 1920 encode_getfattr(xdr, args->bitmask, &hdr);
1840 encode_getfattr(&xdr, args->bitmask, &hdr); 1921 encode_restorefh(xdr, &hdr);
1841 encode_restorefh(&xdr, &hdr); 1922 encode_getfattr(xdr, args->bitmask, &hdr);
1842 encode_getfattr(&xdr, args->bitmask, &hdr);
1843 encode_nops(&hdr); 1923 encode_nops(&hdr);
1844 return 0;
1845} 1924}
1846 1925
1847/* 1926/*
1848 * Encode LINK request 1927 * Encode LINK request
1849 */ 1928 */
1850static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) 1929static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
1930 const struct nfs4_link_arg *args)
1851{ 1931{
1852 struct xdr_stream xdr;
1853 struct compound_hdr hdr = { 1932 struct compound_hdr hdr = {
1854 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1933 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1855 }; 1934 };
1856 1935
1857 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1936 encode_compound_hdr(xdr, req, &hdr);
1858 encode_compound_hdr(&xdr, req, &hdr); 1937 encode_sequence(xdr, &args->seq_args, &hdr);
1859 encode_sequence(&xdr, &args->seq_args, &hdr); 1938 encode_putfh(xdr, args->fh, &hdr);
1860 encode_putfh(&xdr, args->fh, &hdr); 1939 encode_savefh(xdr, &hdr);
1861 encode_savefh(&xdr, &hdr); 1940 encode_putfh(xdr, args->dir_fh, &hdr);
1862 encode_putfh(&xdr, args->dir_fh, &hdr); 1941 encode_link(xdr, args->name, &hdr);
1863 encode_link(&xdr, args->name, &hdr); 1942 encode_getfattr(xdr, args->bitmask, &hdr);
1864 encode_getfattr(&xdr, args->bitmask, &hdr); 1943 encode_restorefh(xdr, &hdr);
1865 encode_restorefh(&xdr, &hdr); 1944 encode_getfattr(xdr, args->bitmask, &hdr);
1866 encode_getfattr(&xdr, args->bitmask, &hdr);
1867 encode_nops(&hdr); 1945 encode_nops(&hdr);
1868 return 0;
1869} 1946}
1870 1947
1871/* 1948/*
1872 * Encode CREATE request 1949 * Encode CREATE request
1873 */ 1950 */
1874static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) 1951static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
1952 const struct nfs4_create_arg *args)
1875{ 1953{
1876 struct xdr_stream xdr;
1877 struct compound_hdr hdr = { 1954 struct compound_hdr hdr = {
1878 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1955 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1879 }; 1956 };
1880 1957
1881 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1958 encode_compound_hdr(xdr, req, &hdr);
1882 encode_compound_hdr(&xdr, req, &hdr); 1959 encode_sequence(xdr, &args->seq_args, &hdr);
1883 encode_sequence(&xdr, &args->seq_args, &hdr); 1960 encode_putfh(xdr, args->dir_fh, &hdr);
1884 encode_putfh(&xdr, args->dir_fh, &hdr); 1961 encode_savefh(xdr, &hdr);
1885 encode_savefh(&xdr, &hdr); 1962 encode_create(xdr, args, &hdr);
1886 encode_create(&xdr, args, &hdr); 1963 encode_getfh(xdr, &hdr);
1887 encode_getfh(&xdr, &hdr); 1964 encode_getfattr(xdr, args->bitmask, &hdr);
1888 encode_getfattr(&xdr, args->bitmask, &hdr); 1965 encode_restorefh(xdr, &hdr);
1889 encode_restorefh(&xdr, &hdr); 1966 encode_getfattr(xdr, args->bitmask, &hdr);
1890 encode_getfattr(&xdr, args->bitmask, &hdr);
1891 encode_nops(&hdr); 1967 encode_nops(&hdr);
1892 return 0;
1893} 1968}
1894 1969
1895/* 1970/*
1896 * Encode SYMLINK request 1971 * Encode SYMLINK request
1897 */ 1972 */
1898static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) 1973static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
1974 const struct nfs4_create_arg *args)
1899{ 1975{
1900 return nfs4_xdr_enc_create(req, p, args); 1976 nfs4_xdr_enc_create(req, xdr, args);
1901} 1977}
1902 1978
1903/* 1979/*
1904 * Encode GETATTR request 1980 * Encode GETATTR request
1905 */ 1981 */
1906static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) 1982static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
1983 const struct nfs4_getattr_arg *args)
1907{ 1984{
1908 struct xdr_stream xdr;
1909 struct compound_hdr hdr = { 1985 struct compound_hdr hdr = {
1910 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1986 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1911 }; 1987 };
1912 1988
1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1989 encode_compound_hdr(xdr, req, &hdr);
1914 encode_compound_hdr(&xdr, req, &hdr); 1990 encode_sequence(xdr, &args->seq_args, &hdr);
1915 encode_sequence(&xdr, &args->seq_args, &hdr); 1991 encode_putfh(xdr, args->fh, &hdr);
1916 encode_putfh(&xdr, args->fh, &hdr); 1992 encode_getfattr(xdr, args->bitmask, &hdr);
1917 encode_getfattr(&xdr, args->bitmask, &hdr);
1918 encode_nops(&hdr); 1993 encode_nops(&hdr);
1919 return 0;
1920} 1994}
1921 1995
1922/* 1996/*
1923 * Encode a CLOSE request 1997 * Encode a CLOSE request
1924 */ 1998 */
1925static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1999static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
2000 struct nfs_closeargs *args)
1926{ 2001{
1927 struct xdr_stream xdr;
1928 struct compound_hdr hdr = { 2002 struct compound_hdr hdr = {
1929 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2003 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1930 }; 2004 };
1931 2005
1932 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2006 encode_compound_hdr(xdr, req, &hdr);
1933 encode_compound_hdr(&xdr, req, &hdr); 2007 encode_sequence(xdr, &args->seq_args, &hdr);
1934 encode_sequence(&xdr, &args->seq_args, &hdr); 2008 encode_putfh(xdr, args->fh, &hdr);
1935 encode_putfh(&xdr, args->fh, &hdr); 2009 encode_close(xdr, args, &hdr);
1936 encode_close(&xdr, args, &hdr); 2010 encode_getfattr(xdr, args->bitmask, &hdr);
1937 encode_getfattr(&xdr, args->bitmask, &hdr);
1938 encode_nops(&hdr); 2011 encode_nops(&hdr);
1939 return 0;
1940} 2012}
1941 2013
1942/* 2014/*
1943 * Encode an OPEN request 2015 * Encode an OPEN request
1944 */ 2016 */
1945static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) 2017static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
2018 struct nfs_openargs *args)
1946{ 2019{
1947 struct xdr_stream xdr;
1948 struct compound_hdr hdr = { 2020 struct compound_hdr hdr = {
1949 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2021 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1950 }; 2022 };
1951 2023
1952 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2024 encode_compound_hdr(xdr, req, &hdr);
1953 encode_compound_hdr(&xdr, req, &hdr); 2025 encode_sequence(xdr, &args->seq_args, &hdr);
1954 encode_sequence(&xdr, &args->seq_args, &hdr); 2026 encode_putfh(xdr, args->fh, &hdr);
1955 encode_putfh(&xdr, args->fh, &hdr); 2027 encode_savefh(xdr, &hdr);
1956 encode_savefh(&xdr, &hdr); 2028 encode_open(xdr, args, &hdr);
1957 encode_open(&xdr, args, &hdr); 2029 encode_getfh(xdr, &hdr);
1958 encode_getfh(&xdr, &hdr); 2030 encode_getfattr(xdr, args->bitmask, &hdr);
1959 encode_getfattr(&xdr, args->bitmask, &hdr); 2031 encode_restorefh(xdr, &hdr);
1960 encode_restorefh(&xdr, &hdr); 2032 encode_getfattr(xdr, args->bitmask, &hdr);
1961 encode_getfattr(&xdr, args->bitmask, &hdr);
1962 encode_nops(&hdr); 2033 encode_nops(&hdr);
1963 return 0;
1964} 2034}
1965 2035
1966/* 2036/*
1967 * Encode an OPEN_CONFIRM request 2037 * Encode an OPEN_CONFIRM request
1968 */ 2038 */
1969static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) 2039static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
2040 struct xdr_stream *xdr,
2041 struct nfs_open_confirmargs *args)
1970{ 2042{
1971 struct xdr_stream xdr;
1972 struct compound_hdr hdr = { 2043 struct compound_hdr hdr = {
1973 .nops = 0, 2044 .nops = 0,
1974 }; 2045 };
1975 2046
1976 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2047 encode_compound_hdr(xdr, req, &hdr);
1977 encode_compound_hdr(&xdr, req, &hdr); 2048 encode_putfh(xdr, args->fh, &hdr);
1978 encode_putfh(&xdr, args->fh, &hdr); 2049 encode_open_confirm(xdr, args, &hdr);
1979 encode_open_confirm(&xdr, args, &hdr);
1980 encode_nops(&hdr); 2050 encode_nops(&hdr);
1981 return 0;
1982} 2051}
1983 2052
1984/* 2053/*
1985 * Encode an OPEN request with no attributes. 2054 * Encode an OPEN request with no attributes.
1986 */ 2055 */
1987static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) 2056static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
2057 struct xdr_stream *xdr,
2058 struct nfs_openargs *args)
1988{ 2059{
1989 struct xdr_stream xdr;
1990 struct compound_hdr hdr = { 2060 struct compound_hdr hdr = {
1991 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2061 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1992 }; 2062 };
1993 2063
1994 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2064 encode_compound_hdr(xdr, req, &hdr);
1995 encode_compound_hdr(&xdr, req, &hdr); 2065 encode_sequence(xdr, &args->seq_args, &hdr);
1996 encode_sequence(&xdr, &args->seq_args, &hdr); 2066 encode_putfh(xdr, args->fh, &hdr);
1997 encode_putfh(&xdr, args->fh, &hdr); 2067 encode_open(xdr, args, &hdr);
1998 encode_open(&xdr, args, &hdr); 2068 encode_getfattr(xdr, args->bitmask, &hdr);
1999 encode_getfattr(&xdr, args->bitmask, &hdr);
2000 encode_nops(&hdr); 2069 encode_nops(&hdr);
2001 return 0;
2002} 2070}
2003 2071
2004/* 2072/*
2005 * Encode an OPEN_DOWNGRADE request 2073 * Encode an OPEN_DOWNGRADE request
2006 */ 2074 */
2007static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 2075static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
2076 struct xdr_stream *xdr,
2077 struct nfs_closeargs *args)
2008{ 2078{
2009 struct xdr_stream xdr;
2010 struct compound_hdr hdr = { 2079 struct compound_hdr hdr = {
2011 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2080 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2012 }; 2081 };
2013 2082
2014 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2083 encode_compound_hdr(xdr, req, &hdr);
2015 encode_compound_hdr(&xdr, req, &hdr); 2084 encode_sequence(xdr, &args->seq_args, &hdr);
2016 encode_sequence(&xdr, &args->seq_args, &hdr); 2085 encode_putfh(xdr, args->fh, &hdr);
2017 encode_putfh(&xdr, args->fh, &hdr); 2086 encode_open_downgrade(xdr, args, &hdr);
2018 encode_open_downgrade(&xdr, args, &hdr); 2087 encode_getfattr(xdr, args->bitmask, &hdr);
2019 encode_getfattr(&xdr, args->bitmask, &hdr);
2020 encode_nops(&hdr); 2088 encode_nops(&hdr);
2021 return 0;
2022} 2089}
2023 2090
2024/* 2091/*
2025 * Encode a LOCK request 2092 * Encode a LOCK request
2026 */ 2093 */
2027static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args) 2094static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
2095 struct nfs_lock_args *args)
2028{ 2096{
2029 struct xdr_stream xdr;
2030 struct compound_hdr hdr = { 2097 struct compound_hdr hdr = {
2031 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2098 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2032 }; 2099 };
2033 2100
2034 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2101 encode_compound_hdr(xdr, req, &hdr);
2035 encode_compound_hdr(&xdr, req, &hdr); 2102 encode_sequence(xdr, &args->seq_args, &hdr);
2036 encode_sequence(&xdr, &args->seq_args, &hdr); 2103 encode_putfh(xdr, args->fh, &hdr);
2037 encode_putfh(&xdr, args->fh, &hdr); 2104 encode_lock(xdr, args, &hdr);
2038 encode_lock(&xdr, args, &hdr);
2039 encode_nops(&hdr); 2105 encode_nops(&hdr);
2040 return 0;
2041} 2106}
2042 2107
2043/* 2108/*
2044 * Encode a LOCKT request 2109 * Encode a LOCKT request
2045 */ 2110 */
2046static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) 2111static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
2112 struct nfs_lockt_args *args)
2047{ 2113{
2048 struct xdr_stream xdr;
2049 struct compound_hdr hdr = { 2114 struct compound_hdr hdr = {
2050 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2115 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2051 }; 2116 };
2052 2117
2053 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2118 encode_compound_hdr(xdr, req, &hdr);
2054 encode_compound_hdr(&xdr, req, &hdr); 2119 encode_sequence(xdr, &args->seq_args, &hdr);
2055 encode_sequence(&xdr, &args->seq_args, &hdr); 2120 encode_putfh(xdr, args->fh, &hdr);
2056 encode_putfh(&xdr, args->fh, &hdr); 2121 encode_lockt(xdr, args, &hdr);
2057 encode_lockt(&xdr, args, &hdr);
2058 encode_nops(&hdr); 2122 encode_nops(&hdr);
2059 return 0;
2060} 2123}
2061 2124
2062/* 2125/*
2063 * Encode a LOCKU request 2126 * Encode a LOCKU request
2064 */ 2127 */
2065static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) 2128static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
2129 struct nfs_locku_args *args)
2066{ 2130{
2067 struct xdr_stream xdr;
2068 struct compound_hdr hdr = { 2131 struct compound_hdr hdr = {
2069 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2132 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2070 }; 2133 };
2071 2134
2072 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2135 encode_compound_hdr(xdr, req, &hdr);
2073 encode_compound_hdr(&xdr, req, &hdr); 2136 encode_sequence(xdr, &args->seq_args, &hdr);
2074 encode_sequence(&xdr, &args->seq_args, &hdr); 2137 encode_putfh(xdr, args->fh, &hdr);
2075 encode_putfh(&xdr, args->fh, &hdr); 2138 encode_locku(xdr, args, &hdr);
2076 encode_locku(&xdr, args, &hdr);
2077 encode_nops(&hdr); 2139 encode_nops(&hdr);
2078 return 0;
2079} 2140}
2080 2141
2081static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) 2142static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
2143 struct xdr_stream *xdr,
2144 struct nfs_release_lockowner_args *args)
2082{ 2145{
2083 struct xdr_stream xdr;
2084 struct compound_hdr hdr = { 2146 struct compound_hdr hdr = {
2085 .minorversion = 0, 2147 .minorversion = 0,
2086 }; 2148 };
2087 2149
2088 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2150 encode_compound_hdr(xdr, req, &hdr);
2089 encode_compound_hdr(&xdr, req, &hdr); 2151 encode_release_lockowner(xdr, &args->lock_owner, &hdr);
2090 encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
2091 encode_nops(&hdr); 2152 encode_nops(&hdr);
2092 return 0;
2093} 2153}
2094 2154
2095/* 2155/*
2096 * Encode a READLINK request 2156 * Encode a READLINK request
2097 */ 2157 */
2098static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) 2158static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
2159 const struct nfs4_readlink *args)
2099{ 2160{
2100 struct xdr_stream xdr;
2101 struct compound_hdr hdr = { 2161 struct compound_hdr hdr = {
2102 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2162 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2103 }; 2163 };
2104 2164
2105 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2165 encode_compound_hdr(xdr, req, &hdr);
2106 encode_compound_hdr(&xdr, req, &hdr); 2166 encode_sequence(xdr, &args->seq_args, &hdr);
2107 encode_sequence(&xdr, &args->seq_args, &hdr); 2167 encode_putfh(xdr, args->fh, &hdr);
2108 encode_putfh(&xdr, args->fh, &hdr); 2168 encode_readlink(xdr, args, req, &hdr);
2109 encode_readlink(&xdr, args, req, &hdr);
2110 2169
2111 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, 2170 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
2112 args->pgbase, args->pglen); 2171 args->pgbase, args->pglen);
2113 encode_nops(&hdr); 2172 encode_nops(&hdr);
2114 return 0;
2115} 2173}
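One detail worth spelling out for the READLINK (and the following READDIR and READ) encoders: hdr.replen is a running total in 32-bit XDR words, each encode_*() helper adding its decode_*_maxsz, so shifting left by 2 converts it to the byte offset of the opaque data in the reply:

/* Sketch of the reply layout arranged by xdr_inline_pages():
 *
 *   rq_rcv_buf.head : first (hdr.replen << 2) bytes -- compound
 *                     status plus SEQUENCE, PUTFH and READLINK headers
 *   rq_rcv_buf.pages: the symlink text itself (args->pages)
 *   rq_rcv_buf.tail : whatever trails the opaque data
 */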
2116 2174
2117/* 2175/*
2118 * Encode a READDIR request 2176 * Encode a READDIR request
2119 */ 2177 */
2120static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) 2178static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
2179 const struct nfs4_readdir_arg *args)
2121{ 2180{
2122 struct xdr_stream xdr;
2123 struct compound_hdr hdr = { 2181 struct compound_hdr hdr = {
2124 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2182 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2125 }; 2183 };
2126 2184
2127 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2185 encode_compound_hdr(xdr, req, &hdr);
2128 encode_compound_hdr(&xdr, req, &hdr); 2186 encode_sequence(xdr, &args->seq_args, &hdr);
2129 encode_sequence(&xdr, &args->seq_args, &hdr); 2187 encode_putfh(xdr, args->fh, &hdr);
2130 encode_putfh(&xdr, args->fh, &hdr); 2188 encode_readdir(xdr, args, req, &hdr);
2131 encode_readdir(&xdr, args, req, &hdr);
2132 2189
2133 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, 2190 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
2134 args->pgbase, args->count); 2191 args->pgbase, args->count);
@@ -2136,413 +2193,414 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
2136 __func__, hdr.replen << 2, args->pages, 2193 __func__, hdr.replen << 2, args->pages,
2137 args->pgbase, args->count); 2194 args->pgbase, args->count);
2138 encode_nops(&hdr); 2195 encode_nops(&hdr);
2139 return 0;
2140} 2196}
2141 2197
2142/* 2198/*
2143 * Encode a READ request 2199 * Encode a READ request
2144 */ 2200 */
2145static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 2201static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
2202 struct nfs_readargs *args)
2146{ 2203{
2147 struct xdr_stream xdr;
2148 struct compound_hdr hdr = { 2204 struct compound_hdr hdr = {
2149 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2205 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2150 }; 2206 };
2151 2207
2152 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2208 encode_compound_hdr(xdr, req, &hdr);
2153 encode_compound_hdr(&xdr, req, &hdr); 2209 encode_sequence(xdr, &args->seq_args, &hdr);
2154 encode_sequence(&xdr, &args->seq_args, &hdr); 2210 encode_putfh(xdr, args->fh, &hdr);
2155 encode_putfh(&xdr, args->fh, &hdr); 2211 encode_read(xdr, args, &hdr);
2156 encode_read(&xdr, args, &hdr);
2157 2212
2158 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, 2213 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
2159 args->pages, args->pgbase, args->count); 2214 args->pages, args->pgbase, args->count);
2160 req->rq_rcv_buf.flags |= XDRBUF_READ; 2215 req->rq_rcv_buf.flags |= XDRBUF_READ;
2161 encode_nops(&hdr); 2216 encode_nops(&hdr);
2162 return 0;
2163} 2217}
2164 2218
2165/* 2219/*
2166 * Encode a SETATTR request 2220 * Encode a SETATTR request
2167 */ 2221 */
2168static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 2222static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
2223 struct nfs_setattrargs *args)
2169{ 2224{
2170 struct xdr_stream xdr;
2171 struct compound_hdr hdr = { 2225 struct compound_hdr hdr = {
2172 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2226 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2173 }; 2227 };
2174 2228
2175 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2229 encode_compound_hdr(xdr, req, &hdr);
2176 encode_compound_hdr(&xdr, req, &hdr); 2230 encode_sequence(xdr, &args->seq_args, &hdr);
2177 encode_sequence(&xdr, &args->seq_args, &hdr); 2231 encode_putfh(xdr, args->fh, &hdr);
2178 encode_putfh(&xdr, args->fh, &hdr); 2232 encode_setattr(xdr, args, args->server, &hdr);
2179 encode_setattr(&xdr, args, args->server, &hdr); 2233 encode_getfattr(xdr, args->bitmask, &hdr);
2180 encode_getfattr(&xdr, args->bitmask, &hdr);
2181 encode_nops(&hdr); 2234 encode_nops(&hdr);
2182 return 0;
2183} 2235}
2184 2236
2185/* 2237/*
2186 * Encode a GETACL request 2238 * Encode a GETACL request
2187 */ 2239 */
2188static int 2240static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
2189nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, 2241 struct nfs_getaclargs *args)
2190 struct nfs_getaclargs *args)
2191{ 2242{
2192 struct xdr_stream xdr;
2193 struct compound_hdr hdr = { 2243 struct compound_hdr hdr = {
2194 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2244 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2195 }; 2245 };
2196 uint32_t replen; 2246 uint32_t replen;
2197 2247
2198 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2248 encode_compound_hdr(xdr, req, &hdr);
2199 encode_compound_hdr(&xdr, req, &hdr); 2249 encode_sequence(xdr, &args->seq_args, &hdr);
2200 encode_sequence(&xdr, &args->seq_args, &hdr); 2250 encode_putfh(xdr, args->fh, &hdr);
2201 encode_putfh(&xdr, args->fh, &hdr);
2202 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; 2251 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
2203 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); 2252 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
2204 2253
2205 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2254 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
2206 args->acl_pages, args->acl_pgbase, args->acl_len); 2255 args->acl_pages, args->acl_pgbase, args->acl_len);
2207 encode_nops(&hdr); 2256 encode_nops(&hdr);
2208 return 0;
2209} 2257}
2210 2258
2211/* 2259/*
2212 * Encode a WRITE request 2260 * Encode a WRITE request
2213 */ 2261 */
2214static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 2262static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2263 struct nfs_writeargs *args)
2215{ 2264{
2216 struct xdr_stream xdr;
2217 struct compound_hdr hdr = { 2265 struct compound_hdr hdr = {
2218 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2266 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2219 }; 2267 };
2220 2268
2221 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2269 encode_compound_hdr(xdr, req, &hdr);
2222 encode_compound_hdr(&xdr, req, &hdr); 2270 encode_sequence(xdr, &args->seq_args, &hdr);
2223 encode_sequence(&xdr, &args->seq_args, &hdr); 2271 encode_putfh(xdr, args->fh, &hdr);
2224 encode_putfh(&xdr, args->fh, &hdr); 2272 encode_write(xdr, args, &hdr);
2225 encode_write(&xdr, args, &hdr);
2226 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2273 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2227 encode_getfattr(&xdr, args->bitmask, &hdr); 2274 encode_getfattr(xdr, args->bitmask, &hdr);
2228 encode_nops(&hdr); 2275 encode_nops(&hdr);
2229 return 0;
2230} 2276}
2231 2277
2232/* 2278/*
2233 * a COMMIT request 2279 * a COMMIT request
2234 */ 2280 */
2235static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 2281static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
2282 struct nfs_writeargs *args)
2236{ 2283{
2237 struct xdr_stream xdr;
2238 struct compound_hdr hdr = { 2284 struct compound_hdr hdr = {
2239 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2285 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2240 }; 2286 };
2241 2287
2242 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2288 encode_compound_hdr(xdr, req, &hdr);
2243 encode_compound_hdr(&xdr, req, &hdr); 2289 encode_sequence(xdr, &args->seq_args, &hdr);
2244 encode_sequence(&xdr, &args->seq_args, &hdr); 2290 encode_putfh(xdr, args->fh, &hdr);
2245 encode_putfh(&xdr, args->fh, &hdr); 2291 encode_commit(xdr, args, &hdr);
2246 encode_commit(&xdr, args, &hdr); 2292 encode_getfattr(xdr, args->bitmask, &hdr);
2247 encode_getfattr(&xdr, args->bitmask, &hdr);
2248 encode_nops(&hdr); 2293 encode_nops(&hdr);
2249 return 0;
2250} 2294}
2251 2295
2252/* 2296/*
2253 * FSINFO request 2297 * FSINFO request
2254 */ 2298 */
2255static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) 2299static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
2300 struct nfs4_fsinfo_arg *args)
2256{ 2301{
2257 struct xdr_stream xdr;
2258 struct compound_hdr hdr = { 2302 struct compound_hdr hdr = {
2259 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2303 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2260 }; 2304 };
2261 2305
2262 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2306 encode_compound_hdr(xdr, req, &hdr);
2263 encode_compound_hdr(&xdr, req, &hdr); 2307 encode_sequence(xdr, &args->seq_args, &hdr);
2264 encode_sequence(&xdr, &args->seq_args, &hdr); 2308 encode_putfh(xdr, args->fh, &hdr);
2265 encode_putfh(&xdr, args->fh, &hdr); 2309 encode_fsinfo(xdr, args->bitmask, &hdr);
2266 encode_fsinfo(&xdr, args->bitmask, &hdr);
2267 encode_nops(&hdr); 2310 encode_nops(&hdr);
2268 return 0;
2269} 2311}
2270 2312
2271/* 2313/*
2272 * a PATHCONF request 2314 * a PATHCONF request
2273 */ 2315 */
2274static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) 2316static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
2317 const struct nfs4_pathconf_arg *args)
2275{ 2318{
2276 struct xdr_stream xdr;
2277 struct compound_hdr hdr = { 2319 struct compound_hdr hdr = {
2278 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2320 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2279 }; 2321 };
2280 2322
2281 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2323 encode_compound_hdr(xdr, req, &hdr);
2282 encode_compound_hdr(&xdr, req, &hdr); 2324 encode_sequence(xdr, &args->seq_args, &hdr);
2283 encode_sequence(&xdr, &args->seq_args, &hdr); 2325 encode_putfh(xdr, args->fh, &hdr);
2284 encode_putfh(&xdr, args->fh, &hdr); 2326 encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2285 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2286 &hdr); 2327 &hdr);
2287 encode_nops(&hdr); 2328 encode_nops(&hdr);
2288 return 0;
2289} 2329}
2290 2330
2291/* 2331/*
2292 * a STATFS request 2332 * a STATFS request
2293 */ 2333 */
2294static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) 2334static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
2335 const struct nfs4_statfs_arg *args)
2295{ 2336{
2296 struct xdr_stream xdr;
2297 struct compound_hdr hdr = { 2337 struct compound_hdr hdr = {
2298 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2338 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2299 }; 2339 };
2300 2340
2301 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2341 encode_compound_hdr(xdr, req, &hdr);
2302 encode_compound_hdr(&xdr, req, &hdr); 2342 encode_sequence(xdr, &args->seq_args, &hdr);
2303 encode_sequence(&xdr, &args->seq_args, &hdr); 2343 encode_putfh(xdr, args->fh, &hdr);
2304 encode_putfh(&xdr, args->fh, &hdr); 2344 encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2305 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2306 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); 2345 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2307 encode_nops(&hdr); 2346 encode_nops(&hdr);
2308 return 0;
2309} 2347}
2310 2348
2311/* 2349/*
2312 * GETATTR_BITMAP request 2350 * GETATTR_BITMAP request
2313 */ 2351 */
2314static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, 2352static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2315 struct nfs4_server_caps_arg *args) 2353 struct xdr_stream *xdr,
2354 struct nfs4_server_caps_arg *args)
2316{ 2355{
2317 struct xdr_stream xdr;
2318 struct compound_hdr hdr = { 2356 struct compound_hdr hdr = {
2319 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2357 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2320 }; 2358 };
2321 2359
2322 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2360 encode_compound_hdr(xdr, req, &hdr);
2323 encode_compound_hdr(&xdr, req, &hdr); 2361 encode_sequence(xdr, &args->seq_args, &hdr);
2324 encode_sequence(&xdr, &args->seq_args, &hdr); 2362 encode_putfh(xdr, args->fhandle, &hdr);
2325 encode_putfh(&xdr, args->fhandle, &hdr); 2363 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2326 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2327 FATTR4_WORD0_LINK_SUPPORT| 2364 FATTR4_WORD0_LINK_SUPPORT|
2328 FATTR4_WORD0_SYMLINK_SUPPORT| 2365 FATTR4_WORD0_SYMLINK_SUPPORT|
2329 FATTR4_WORD0_ACLSUPPORT, &hdr); 2366 FATTR4_WORD0_ACLSUPPORT, &hdr);
2330 encode_nops(&hdr); 2367 encode_nops(&hdr);
2331 return 0;
2332} 2368}
2333 2369
2334/* 2370/*
2335 * a RENEW request 2371 * a RENEW request
2336 */ 2372 */
2337static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2373static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
2374 struct nfs_client *clp)
2338{ 2375{
2339 struct xdr_stream xdr;
2340 struct compound_hdr hdr = { 2376 struct compound_hdr hdr = {
2341 .nops = 0, 2377 .nops = 0,
2342 }; 2378 };
2343 2379
2344 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2380 encode_compound_hdr(xdr, req, &hdr);
2345 encode_compound_hdr(&xdr, req, &hdr); 2381 encode_renew(xdr, clp, &hdr);
2346 encode_renew(&xdr, clp, &hdr);
2347 encode_nops(&hdr); 2382 encode_nops(&hdr);
2348 return 0;
2349} 2383}
2350 2384
2351/* 2385/*
2352 * a SETCLIENTID request 2386 * a SETCLIENTID request
2353 */ 2387 */
2354static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) 2388static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
2389 struct xdr_stream *xdr,
2390 struct nfs4_setclientid *sc)
2355{ 2391{
2356 struct xdr_stream xdr;
2357 struct compound_hdr hdr = { 2392 struct compound_hdr hdr = {
2358 .nops = 0, 2393 .nops = 0,
2359 }; 2394 };
2360 2395
2361 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2396 encode_compound_hdr(xdr, req, &hdr);
2362 encode_compound_hdr(&xdr, req, &hdr); 2397 encode_setclientid(xdr, sc, &hdr);
2363 encode_setclientid(&xdr, sc, &hdr);
2364 encode_nops(&hdr); 2398 encode_nops(&hdr);
2365 return 0;
2366} 2399}
2367 2400
2368/* 2401/*
2369 * a SETCLIENTID_CONFIRM request 2402 * a SETCLIENTID_CONFIRM request
2370 */ 2403 */
2371static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) 2404static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2405 struct xdr_stream *xdr,
2406 struct nfs4_setclientid_res *arg)
2372{ 2407{
2373 struct xdr_stream xdr;
2374 struct compound_hdr hdr = { 2408 struct compound_hdr hdr = {
2375 .nops = 0, 2409 .nops = 0,
2376 }; 2410 };
2377 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2411 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2378 2412
2379 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2413 encode_compound_hdr(xdr, req, &hdr);
2380 encode_compound_hdr(&xdr, req, &hdr); 2414 encode_setclientid_confirm(xdr, arg, &hdr);
2381 encode_setclientid_confirm(&xdr, arg, &hdr); 2415 encode_putrootfh(xdr, &hdr);
2382 encode_putrootfh(&xdr, &hdr); 2416 encode_fsinfo(xdr, lease_bitmap, &hdr);
2383 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2384 encode_nops(&hdr); 2417 encode_nops(&hdr);
2385 return 0;
2386} 2418}
2387 2419
2388/* 2420/*
2389 * DELEGRETURN request 2421 * DELEGRETURN request
2390 */ 2422 */
2391static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) 2423static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
2424 struct xdr_stream *xdr,
2425 const struct nfs4_delegreturnargs *args)
2392{ 2426{
2393 struct xdr_stream xdr;
2394 struct compound_hdr hdr = { 2427 struct compound_hdr hdr = {
2395 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2428 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2396 }; 2429 };
2397 2430
2398 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2431 encode_compound_hdr(xdr, req, &hdr);
2399 encode_compound_hdr(&xdr, req, &hdr); 2432 encode_sequence(xdr, &args->seq_args, &hdr);
2400 encode_sequence(&xdr, &args->seq_args, &hdr); 2433 encode_putfh(xdr, args->fhandle, &hdr);
2401 encode_putfh(&xdr, args->fhandle, &hdr); 2434 encode_delegreturn(xdr, args->stateid, &hdr);
2402 encode_delegreturn(&xdr, args->stateid, &hdr); 2435 encode_getfattr(xdr, args->bitmask, &hdr);
2403 encode_getfattr(&xdr, args->bitmask, &hdr);
2404 encode_nops(&hdr); 2436 encode_nops(&hdr);
2405 return 0;
2406} 2437}
2407 2438
2408/* 2439/*
2409 * Encode FS_LOCATIONS request 2440 * Encode FS_LOCATIONS request
2410 */ 2441 */
2411static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) 2442static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
2443 struct xdr_stream *xdr,
2444 struct nfs4_fs_locations_arg *args)
2412{ 2445{
2413 struct xdr_stream xdr;
2414 struct compound_hdr hdr = { 2446 struct compound_hdr hdr = {
2415 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2447 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2416 }; 2448 };
2417 uint32_t replen; 2449 uint32_t replen;
2418 2450
2419 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2451 encode_compound_hdr(xdr, req, &hdr);
2420 encode_compound_hdr(&xdr, req, &hdr); 2452 encode_sequence(xdr, &args->seq_args, &hdr);
2421 encode_sequence(&xdr, &args->seq_args, &hdr); 2453 encode_putfh(xdr, args->dir_fh, &hdr);
2422 encode_putfh(&xdr, args->dir_fh, &hdr); 2454 encode_lookup(xdr, args->name, &hdr);
2423 encode_lookup(&xdr, args->name, &hdr);
2424 replen = hdr.replen; /* get the attribute into args->page */ 2455 replen = hdr.replen; /* get the attribute into args->page */
2425 encode_fs_locations(&xdr, args->bitmask, &hdr); 2456 encode_fs_locations(xdr, args->bitmask, &hdr);
2426 2457
2427 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 2458 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
2428 0, PAGE_SIZE); 2459 0, PAGE_SIZE);
2429 encode_nops(&hdr); 2460 encode_nops(&hdr);
2430 return 0;
2431} 2461}
2432 2462
2433#if defined(CONFIG_NFS_V4_1) 2463#if defined(CONFIG_NFS_V4_1)
2434/* 2464/*
2435 * EXCHANGE_ID request 2465 * EXCHANGE_ID request
2436 */ 2466 */
2437static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, 2467static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
2438 struct nfs41_exchange_id_args *args) 2468 struct xdr_stream *xdr,
2469 struct nfs41_exchange_id_args *args)
2439{ 2470{
2440 struct xdr_stream xdr;
2441 struct compound_hdr hdr = { 2471 struct compound_hdr hdr = {
2442 .minorversion = args->client->cl_mvops->minor_version, 2472 .minorversion = args->client->cl_mvops->minor_version,
2443 }; 2473 };
2444 2474
2445 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2475 encode_compound_hdr(xdr, req, &hdr);
2446 encode_compound_hdr(&xdr, req, &hdr); 2476 encode_exchange_id(xdr, args, &hdr);
2447 encode_exchange_id(&xdr, args, &hdr);
2448 encode_nops(&hdr); 2477 encode_nops(&hdr);
2449 return 0;
2450} 2478}
2451 2479
2452/* 2480/*
2453 * a CREATE_SESSION request 2481 * a CREATE_SESSION request
2454 */ 2482 */
2455static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, 2483static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
2456 struct nfs41_create_session_args *args) 2484 struct xdr_stream *xdr,
2485 struct nfs41_create_session_args *args)
2457{ 2486{
2458 struct xdr_stream xdr;
2459 struct compound_hdr hdr = { 2487 struct compound_hdr hdr = {
2460 .minorversion = args->client->cl_mvops->minor_version, 2488 .minorversion = args->client->cl_mvops->minor_version,
2461 }; 2489 };
2462 2490
2463 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2491 encode_compound_hdr(xdr, req, &hdr);
2464 encode_compound_hdr(&xdr, req, &hdr); 2492 encode_create_session(xdr, args, &hdr);
2465 encode_create_session(&xdr, args, &hdr);
2466 encode_nops(&hdr); 2493 encode_nops(&hdr);
2467 return 0;
2468} 2494}
2469 2495
2470/* 2496/*
2471 * a DESTROY_SESSION request 2497 * a DESTROY_SESSION request
2472 */ 2498 */
2473static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, 2499static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
2474 struct nfs4_session *session) 2500 struct xdr_stream *xdr,
2501 struct nfs4_session *session)
2475{ 2502{
2476 struct xdr_stream xdr;
2477 struct compound_hdr hdr = { 2503 struct compound_hdr hdr = {
2478 .minorversion = session->clp->cl_mvops->minor_version, 2504 .minorversion = session->clp->cl_mvops->minor_version,
2479 }; 2505 };
2480 2506
2481 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2507 encode_compound_hdr(xdr, req, &hdr);
2482 encode_compound_hdr(&xdr, req, &hdr); 2508 encode_destroy_session(xdr, session, &hdr);
2483 encode_destroy_session(&xdr, session, &hdr);
2484 encode_nops(&hdr); 2509 encode_nops(&hdr);
2485 return 0;
2486} 2510}
2487 2511
2488/* 2512/*
2489 * a SEQUENCE request 2513 * a SEQUENCE request
2490 */ 2514 */
2491static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p, 2515static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
2492 struct nfs4_sequence_args *args) 2516 struct nfs4_sequence_args *args)
2493{ 2517{
2494 struct xdr_stream xdr;
2495 struct compound_hdr hdr = { 2518 struct compound_hdr hdr = {
2496 .minorversion = nfs4_xdr_minorversion(args), 2519 .minorversion = nfs4_xdr_minorversion(args),
2497 }; 2520 };
2498 2521
2499 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2522 encode_compound_hdr(xdr, req, &hdr);
2500 encode_compound_hdr(&xdr, req, &hdr); 2523 encode_sequence(xdr, args, &hdr);
2501 encode_sequence(&xdr, args, &hdr);
2502 encode_nops(&hdr); 2524 encode_nops(&hdr);
2503 return 0;
2504} 2525}
2505 2526
2506/* 2527/*
2507 * a GET_LEASE_TIME request 2528 * a GET_LEASE_TIME request
2508 */ 2529 */
2509static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, 2530static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2510 struct nfs4_get_lease_time_args *args) 2531 struct xdr_stream *xdr,
2532 struct nfs4_get_lease_time_args *args)
2511{ 2533{
2512 struct xdr_stream xdr;
2513 struct compound_hdr hdr = { 2534 struct compound_hdr hdr = {
2514 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2535 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2515 }; 2536 };
2516 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2537 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2517 2538
2518 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2539 encode_compound_hdr(xdr, req, &hdr);
2519 encode_compound_hdr(&xdr, req, &hdr); 2540 encode_sequence(xdr, &args->la_seq_args, &hdr);
2520 encode_sequence(&xdr, &args->la_seq_args, &hdr); 2541 encode_putrootfh(xdr, &hdr);
2521 encode_putrootfh(&xdr, &hdr); 2542 encode_fsinfo(xdr, lease_bitmap, &hdr);
2522 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2523 encode_nops(&hdr); 2543 encode_nops(&hdr);
2524 return 0;
2525} 2544}
2526 2545
2527/* 2546/*
2528 * a RECLAIM_COMPLETE request 2547 * a RECLAIM_COMPLETE request
2529 */ 2548 */
2530static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, 2549static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2531 struct nfs41_reclaim_complete_args *args) 2550 struct xdr_stream *xdr,
2551 struct nfs41_reclaim_complete_args *args)
2532{ 2552{
2533 struct xdr_stream xdr;
2534 struct compound_hdr hdr = { 2553 struct compound_hdr hdr = {
2535 .minorversion = nfs4_xdr_minorversion(&args->seq_args) 2554 .minorversion = nfs4_xdr_minorversion(&args->seq_args)
2536 }; 2555 };
2537 2556
2538 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2557 encode_compound_hdr(xdr, req, &hdr);
2539 encode_compound_hdr(&xdr, req, &hdr); 2558 encode_sequence(xdr, &args->seq_args, &hdr);
2540 encode_sequence(&xdr, &args->seq_args, &hdr); 2559 encode_reclaim_complete(xdr, args, &hdr);
2541 encode_reclaim_complete(&xdr, args, &hdr);
2542 encode_nops(&hdr); 2560 encode_nops(&hdr);
2543 return 0;
2544} 2561}
2545 2562
2563/*
2564 * Encode GETDEVICEINFO request
2565 */
2566static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
2567 struct xdr_stream *xdr,
2568 struct nfs4_getdeviceinfo_args *args)
2569{
2570 struct compound_hdr hdr = {
2571 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2572 };
2573
2574 encode_compound_hdr(xdr, req, &hdr);
2575 encode_sequence(xdr, &args->seq_args, &hdr);
2576 encode_getdeviceinfo(xdr, args, &hdr);
2577
2578 /* set up reply kvec. Subtract notification bitmap max size (2)
2579 * so that notification bitmap is put in xdr_buf tail */
2580 xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
2581 args->pdev->pages, args->pdev->pgbase,
2582 args->pdev->pglen);
2583
2584 encode_nops(&hdr);
2585}
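The (hdr.replen - 2) adjustment above deserves a worked example. Assuming decode_getdeviceinfo_maxsz budgets its final two 32-bit words for the notification bitmap, then with hdr.replen == R after encode_getdeviceinfo():

/*
 * byte offset of the spliced pages = (R - 2) << 2
 *
 * head : the reply headers up to the device-address body, minus the
 *        last two budgeted words
 * pages: the opaque device address body (args->pdev->pages)
 * tail : the trailing notification bitmap words, which
 *        decode_getdeviceinfo() can then read inline instead of
 *        having them land mid-page
 */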
2586
2587/*
2588 * Encode LAYOUTGET request
2589 */
2590static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2591 struct xdr_stream *xdr,
2592 struct nfs4_layoutget_args *args)
2593{
2594 struct compound_hdr hdr = {
2595 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2596 };
2597
2598 encode_compound_hdr(xdr, req, &hdr);
2599 encode_sequence(xdr, &args->seq_args, &hdr);
2600 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2601 encode_layoutget(xdr, args, &hdr);
2602 encode_nops(&hdr);
2603}
2546#endif /* CONFIG_NFS_V4_1 */ 2604#endif /* CONFIG_NFS_V4_1 */
2547 2605
2548static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2606static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2676,7 +2734,10 @@ out_overflow:
2676static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) 2734static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
2677{ 2735{
2678 if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { 2736 if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
2679 decode_attr_bitmap(xdr, bitmask); 2737 int ret;
2738 ret = decode_attr_bitmap(xdr, bitmask);
2739 if (unlikely(ret < 0))
2740 return ret;
2680 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 2741 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
2681 } else 2742 } else
2682 bitmask[0] = bitmask[1] = 0; 2743 bitmask[0] = bitmask[1] = 0;
@@ -2848,6 +2909,56 @@ out_overflow:
2848 return -EIO; 2909 return -EIO;
2849} 2910}
2850 2911
2912static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
2913{
2914 __be32 *p;
2915
2916 if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
2917 return -EIO;
2918 if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
2919 p = xdr_inline_decode(xdr, 4);
2920 if (unlikely(!p))
2921 goto out_overflow;
2922 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
2923 }
2924 return 0;
2925out_overflow:
2926 print_overflow_msg(__func__, xdr);
2927 return -EIO;
2928}
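decode_attr_error() introduces the guard idiom reused by decode_attr_filehandle(), decode_attr_time_delta() and decode_attr_pnfstype() below: GETATTR attribute values appear in ascending bit order, so subtracting 1 from an attribute's bit yields a mask of every lower-numbered attribute. Taking FATTR4_WORD0_RDATTR_ERROR (bit 11 of word 0 in the NFSv4 attribute numbering) as the example:

/*
 * FATTR4_WORD0_RDATTR_ERROR      = 0x00000800   (1 << 11)
 * FATTR4_WORD0_RDATTR_ERROR - 1U = 0x000007ff   (bits 0..10)
 *
 * Each decoder clears its bit from bitmap[0] once the value is
 * consumed, so any surviving lower bit means an earlier attribute was
 * never decoded and the stream position is unknown -- hence the
 * unconditional -EIO.
 */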
2929
2930static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
2931{
2932 __be32 *p;
2933 int len;
2934
2935 if (fh != NULL)
2936 memset(fh, 0, sizeof(*fh));
2937
2938 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
2939 return -EIO;
2940 if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
2941 p = xdr_inline_decode(xdr, 4);
2942 if (unlikely(!p))
2943 goto out_overflow;
2944 len = be32_to_cpup(p);
2945 if (len > NFS4_FHSIZE)
2946 return -EIO;
2947 p = xdr_inline_decode(xdr, len);
2948 if (unlikely(!p))
2949 goto out_overflow;
2950 if (fh != NULL) {
2951 memcpy(fh->data, p, len);
2952 fh->size = len;
2953 }
2954 bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
2955 }
2956 return 0;
2957out_overflow:
2958 print_overflow_msg(__func__, xdr);
2959 return -EIO;
2960}
2961
2851static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2962static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2852{ 2963{
2853 __be32 *p; 2964 __be32 *p;
@@ -3521,6 +3632,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
3521 return status; 3632 return status;
3522} 3633}
3523 3634
3635static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
3636 struct timespec *time)
3637{
3638 int status = 0;
3639
3640 time->tv_sec = 0;
3641 time->tv_nsec = 0;
3642 if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
3643 return -EIO;
3644 if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
3645 status = decode_attr_time(xdr, time);
3646 bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
3647 }
3648 dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
3649 (long)time->tv_nsec);
3650 return status;
3651}
3652
3524static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 3653static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
3525{ 3654{
3526 int status = 0; 3655 int status = 0;
@@ -3744,29 +3873,14 @@ xdr_error:
3744 return status; 3873 return status;
3745} 3874}
3746 3875
3747static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 3876static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3877 struct nfs_fattr *fattr, struct nfs_fh *fh,
3748 const struct nfs_server *server, int may_sleep) 3878 const struct nfs_server *server, int may_sleep)
3749{ 3879{
3750 __be32 *savep;
3751 uint32_t attrlen,
3752 bitmap[2] = {0},
3753 type;
3754 int status; 3880 int status;
3755 umode_t fmode = 0; 3881 umode_t fmode = 0;
3756 uint64_t fileid; 3882 uint64_t fileid;
3757 3883 uint32_t type;
3758 status = decode_op_hdr(xdr, OP_GETATTR);
3759 if (status < 0)
3760 goto xdr_error;
3761
3762 status = decode_attr_bitmap(xdr, bitmap);
3763 if (status < 0)
3764 goto xdr_error;
3765
3766 status = decode_attr_length(xdr, &attrlen, &savep);
3767 if (status < 0)
3768 goto xdr_error;
3769
3770 3884
3771 status = decode_attr_type(xdr, bitmap, &type); 3885 status = decode_attr_type(xdr, bitmap, &type);
3772 if (status < 0) 3886 if (status < 0)
@@ -3792,6 +3906,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3792 goto xdr_error; 3906 goto xdr_error;
3793 fattr->valid |= status; 3907 fattr->valid |= status;
3794 3908
3909 status = decode_attr_error(xdr, bitmap);
3910 if (status < 0)
3911 goto xdr_error;
3912
3913 status = decode_attr_filehandle(xdr, bitmap, fh);
3914 if (status < 0)
3915 goto xdr_error;
3916
3795 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); 3917 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3796 if (status < 0) 3918 if (status < 0)
3797 goto xdr_error; 3919 goto xdr_error;
@@ -3862,12 +3984,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3862 fattr->valid |= status; 3984 fattr->valid |= status;
3863 } 3985 }
3864 3986
3987xdr_error:
3988 dprintk("%s: xdr returned %d\n", __func__, -status);
3989 return status;
3990}
3991
3992static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3993 struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
3994{
3995 __be32 *savep;
3996 uint32_t attrlen,
3997 bitmap[2] = {0};
3998 int status;
3999
4000 status = decode_op_hdr(xdr, OP_GETATTR);
4001 if (status < 0)
4002 goto xdr_error;
4003
4004 status = decode_attr_bitmap(xdr, bitmap);
4005 if (status < 0)
4006 goto xdr_error;
4007
4008 status = decode_attr_length(xdr, &attrlen, &savep);
4009 if (status < 0)
4010 goto xdr_error;
4011
4012 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
4013 if (status < 0)
4014 goto xdr_error;
4015
3865 status = verify_attr_len(xdr, savep, attrlen); 4016 status = verify_attr_len(xdr, savep, attrlen);
3866xdr_error: 4017xdr_error:
3867 dprintk("%s: xdr returned %d\n", __func__, -status); 4018 dprintk("%s: xdr returned %d\n", __func__, -status);
3868 return status; 4019 return status;
3869} 4020}
3870 4021
4022static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4023 const struct nfs_server *server, int may_sleep)
4024{
4025 return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
4026}
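The net effect of this refactoring is a three-layer decode path: the wrapper keeps the old decode_getfattr() signature for existing callers, while callers that also want the filehandle attribute can use decode_getfattr_generic() with a non-NULL fh. Roughly:

/*
 * decode_getfattr(xdr, fattr, server, may_sleep)
 *   -> decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep)
 *        decode_op_hdr(OP_GETATTR)
 *        decode_attr_bitmap()      -- which attributes are present
 *        decode_attr_length()      -- total attribute value bytes
 *        decode_getfattr_attrs()   -- per-attribute decoders, now
 *                                     including rdattr_error and
 *                                     the filehandle
 *        verify_attr_len()         -- consumed == advertised
 */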
4027
4028/*
4029 * Decode potentially multiple layout types. Currently we only support
4030 * one layout driver per file system.
4031 */
4032static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
4033 uint32_t *layouttype)
4034{
4035 uint32_t *p;
4036 int num;
4037
4038 p = xdr_inline_decode(xdr, 4);
4039 if (unlikely(!p))
4040 goto out_overflow;
4041 num = be32_to_cpup(p);
4042
4043 /* pNFS is not supported by the underlying file system */
4044 if (num == 0) {
4045 *layouttype = 0;
4046 return 0;
4047 }
4048 if (num > 1)
4049 printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
4050 "per filesystem not supported\n", __func__);
4051
4052 /* Decode and set first layout type, move xdr->p past unused types */
4053 p = xdr_inline_decode(xdr, num * 4);
4054 if (unlikely(!p))
4055 goto out_overflow;
4056 *layouttype = be32_to_cpup(p);
4057 return 0;
4058out_overflow:
4059 print_overflow_msg(__func__, xdr);
4060 return -EIO;
4061}
4062
4063/*
4064 * The type of file system exported.
4065 * Note we must ensure that layouttype is set in any non-error case.
4066 */
4067static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4068 uint32_t *layouttype)
4069{
4070 int status = 0;
4071
4072 dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
4073 if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
4074 return -EIO;
4075 if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
4076 status = decode_first_pnfs_layout_type(xdr, layouttype);
4077 bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
4078 } else
4079 *layouttype = 0;
4080 return status;
4081}
3871 4082
3872static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4083static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3873{ 4084{
@@ -3894,6 +4105,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3894 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) 4105 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
3895 goto xdr_error; 4106 goto xdr_error;
3896 fsinfo->wtpref = fsinfo->wtmax; 4107 fsinfo->wtpref = fsinfo->wtmax;
4108 status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
4109 if (status != 0)
4110 goto xdr_error;
4111 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4112 if (status != 0)
4113 goto xdr_error;
3897 4114
3898 status = verify_attr_len(xdr, savep, attrlen); 4115 status = verify_attr_len(xdr, savep, attrlen);
3899xdr_error: 4116xdr_error:
@@ -3950,13 +4167,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3950 __be32 *p; 4167 __be32 *p;
3951 uint32_t namelen, type; 4168 uint32_t namelen, type;
3952 4169
3953 p = xdr_inline_decode(xdr, 32); 4170 p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
3954 if (unlikely(!p)) 4171 if (unlikely(!p))
3955 goto out_overflow; 4172 goto out_overflow;
3956 p = xdr_decode_hyper(p, &offset); 4173 p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
3957 p = xdr_decode_hyper(p, &length); 4174 p = xdr_decode_hyper(p, &length);
3958 type = be32_to_cpup(p++); 4175 type = be32_to_cpup(p++); /* 4 byte read */
3959 if (fl != NULL) { 4176 if (fl != NULL) { /* manipulate file lock */
3960 fl->fl_start = (loff_t)offset; 4177 fl->fl_start = (loff_t)offset;
3961 fl->fl_end = fl->fl_start + (loff_t)length - 1; 4178 fl->fl_end = fl->fl_start + (loff_t)length - 1;
3962 if (length == ~(uint64_t)0) 4179 if (length == ~(uint64_t)0)
@@ -3966,9 +4183,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3966 fl->fl_type = F_RDLCK; 4183 fl->fl_type = F_RDLCK;
3967 fl->fl_pid = 0; 4184 fl->fl_pid = 0;
3968 } 4185 }
3969 p = xdr_decode_hyper(p, &clientid); 4186 p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
3970 namelen = be32_to_cpup(p); 4187 namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
3971 p = xdr_inline_decode(xdr, namelen); 4188 p = xdr_inline_decode(xdr, namelen); /* variable size field */
3972 if (likely(p)) 4189 if (likely(p))
3973 return -NFS4ERR_DENIED; 4190 return -NFS4ERR_DENIED;
3974out_overflow: 4191out_overflow:
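The byte-accounting comments added in this hunk check out against the fixed part of the LOCK4denied result:

/*
 *    8  offset          (xdr_decode_hyper)
 *  + 8  length          (xdr_decode_hyper)
 *  + 4  lock type       (be32_to_cpup)
 *  + 8  owner clientid  (xdr_decode_hyper)
 *  + 4  owner namelen   (be32_to_cpup)
 *  ----
 *   32  bytes, matching the xdr_inline_decode(xdr, 32) above; only
 *       the variable-length owner string remains after that.
 */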
@@ -4180,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
4180 goto out_overflow; 4397 goto out_overflow;
4181 eof = be32_to_cpup(p++); 4398 eof = be32_to_cpup(p++);
4182 count = be32_to_cpup(p); 4399 count = be32_to_cpup(p);
4183 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 4400 hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
4184 recvd = req->rq_rcv_buf.len - hdrlen; 4401 recvd = req->rq_rcv_buf.len - hdrlen;
4185 if (count > recvd) { 4402 if (count > recvd) {
4186 dprintk("NFS: server cheating in read reply: " 4403 dprintk("NFS: server cheating in read reply: "
@@ -4200,12 +4417,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct page *page = *rcvbuf->pages;
 	struct kvec *iov = rcvbuf->head;
 	size_t hdrlen;
 	u32 recvd, pglen = rcvbuf->page_len;
-	__be32 *end, *entry, *p, *kaddr;
-	unsigned int nr = 0;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4439,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	pglen = recvd;
 	xdr_read_pages(xdr, pglen);
 
-	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = kmap_atomic(page, KM_USER0);
-	end = p + ((pglen + readdir->pgbase) >> 2);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		u32 len, attrlen, xlen;
-		if (end - p < 3)
-			goto short_pkt;
-		dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-		p += 2;			/* cookie */
-		len = ntohl(*p++);	/* filename length */
-		if (len > NFS4_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-					len);
-			goto err_unmap;
-		}
-		xlen = XDR_QUADLEN(len);
-		if (end - p < xlen + 1)
-			goto short_pkt;
-		dprintk("filename = %*s\n", len, (char *)p);
-		p += xlen;
-		len = ntohl(*p++);	/* bitmap length */
-		if (end - p < len + 1)
-			goto short_pkt;
-		p += len;
-		attrlen = XDR_QUADLEN(ntohl(*p++));
-		if (end - p < attrlen + 2)
-			goto short_pkt;
-		p += attrlen;		/* attributes */
-		entry = p;
-	}
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
-out:
-	kunmap_atomic(kaddr, KM_USER0);
-	return 0;
-short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	dprintk("%s: short packet at entry %d\n", __func__, nr);
-	entry[0] = entry[1] = 0;
-	if (nr)
-		goto out;
-err_unmap:
-	kunmap_atomic(kaddr, KM_USER0);
-	return -errno_NFSERR_IO;
+
+	return pglen;
 }
 
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
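With the entry walk deleted, decode_readdir() no longer kmaps the reply pages to pre-validate each entry in the XDR layer; it positions the page data with xdr_read_pages() and reports how many bytes of entry data arrived, leaving entry parsing and the short-reply heuristics to the readdir caller. The new contract, roughly (a sketch of a hypothetical caller, not code from this patch):

	int pglen = decode_readdir(xdr, rqstp, res);

	if (pglen < 0)
		return pglen;	/* XDR-level decode error */
	/* on success the caller parses up to pglen bytes of entry4
	 * records straight out of rq_rcv_buf.pages */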
@@ -4299,7 +4450,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	size_t hdrlen;
 	u32 len, recvd;
 	__be32 *p;
-	char *kaddr;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4480,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	 * and and null-terminate the text (the VFS expects
 	 * null-termination).
 	 */
-	kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
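The three open-coded lines collapse into the xdr_terminate_string() helper, which writes the terminating NUL into the page data on the caller's behalf. Its body is approximately the code it replaces (a sketch; see net/sunrpc/xdr.c for the real definition):

void xdr_terminate_string(struct xdr_buf *buf, const u32 len)
{
	char *kaddr;

	kaddr = kmap_atomic(buf->pages[0], KM_USER0);
	kaddr[buf->page_base + len] = '\0';	/* VFS wants NUL-termination */
	kunmap_atomic(kaddr, KM_USER0);
}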
@@ -4668,7 +4816,6 @@ static int decode_sequence(struct xdr_stream *xdr,
 			   struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-	struct nfs4_slot *slot;
 	struct nfs4_sessionid id;
 	u32 dummy;
 	int status;
@@ -4700,15 +4847,14 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_overflow;
 
 	/* seqid */
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
-	if (dummy != slot->seq_nr) {
+	if (dummy != res->sr_slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
 	dummy = be32_to_cpup(p++);
-	if (dummy != res->sr_slotid) {
+	if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
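Instead of carrying a separate sr_slotid, the sequence result now holds a pointer to its slot, and the slot id is recovered by pointer subtraction against the slot-table base. In outline (a sketch using the fields referenced above):

	struct nfs4_slot *slots = res->sr_session->fc_slot_table.slots;

	/* element-wise pointer difference yields the slot index */
	u32 slotid = res->sr_slot - slots;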
@@ -4731,6 +4877,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+				struct pnfs_device *pdev)
+{
+	__be32 *p;
+	uint32_t len, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+	if (status) {
+		if (status == -ETOOSMALL) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
+			pdev->mincount = be32_to_cpup(p);
+			dprintk("%s: Min count too small. mincnt = %u\n",
+				__func__, pdev->mincount);
+		}
+		return status;
+	}
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	type = be32_to_cpup(p++);
+	if (type != pdev->layout_type) {
+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
+			__func__, pdev->layout_type, type);
+		return -EINVAL;
+	}
+	/*
+	 * Get the length of the opaque device_addr4. xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+	/* Parse notification bitmap, verifying that it is zero. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len) {
+		uint32_t i;
+
+		p = xdr_inline_decode(xdr, 4 * len);
+		if (unlikely(!p))
+			goto out_overflow;
+		for (i = 0; i < len; i++, p++) {
+			if (be32_to_cpup(p)) {
+				dprintk("%s: notifications not supported\n",
+					__func__);
+				return -EIO;
+			}
+		}
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+			    struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p++);
+	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	p = xdr_inline_decode(xdr, 24);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+
+	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+	if (unlikely(status))
+		return status;
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layout.len);
+
+	/* nfs4_proc_layoutget allocated a single page */
+	if (res->layout.len > PAGE_SIZE)
+		return -ENOMEM;
+	memcpy(res->layout.buf, p, res->layout.len);
+
+	if (layout_count > 1) {
+		/* We only handle a length one array at the moment.  Any
+		 * further entries are just ignored.  Note that this means
+		 * the client may see a response that is less than the
+		 * minimum it requested.
+		 */
+		dprintk("%s: server responded with %d layouts, dropping tail\n",
+			__func__, layout_count);
+	}
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
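Two points worth noting about the new pNFS decoders. decode_getdeviceinfo() turns the server's "too small" error into a retry hint: it stashes the required size in pdev->mincount so the caller can grow its buffer and reissue the call. And decode_layoutget() copies only the first element of the layout array, dropping any tail. A retry loop built on the first behaviour might look like this (illustrative; both helper names are assumptions, not code from this patch):

	for (;;) {
		status = nfs4_proc_getdeviceinfo(server, pdev);	/* hypothetical */
		if (status != -ETOOSMALL)
			break;
		/* pdev->mincount now holds the size the server demands */
		if (pnfs_grow_device_buffer(pdev) != 0)		/* hypothetical */
			break;
	}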
@@ -4738,26 +5012,26 @@ out_overflow:
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_downgrade(&xdr, res);
+	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
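From here down the patch is one mechanical conversion, applied to every nfs4_xdr_dec_*() routine (and the odd encoder): the transport now hands each routine a ready-made xdr_stream, so the per-procedure local and its setup call disappear. Schematically (a sketch of the before/after shape; "foo" is a placeholder, not literal kernel code):

/* before: each decoder built its own stream */
static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, __be32 *p,
			    struct foo_res *res)
{
	struct xdr_stream xdr;

	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
	return decode_foo(&xdr, res);
}

/* after: the caller owns the stream, decoders just consume it */
static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
			    struct foo_res *res)
{
	return decode_foo(xdr, res);
}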
@@ -4766,26 +5040,25 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(&xdr, res);
+	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4794,26 +5067,28 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server
+	status = decode_getfattr(xdr, res->fattr, res->server
 			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4822,23 +5097,25 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putrootfh(&xdr)) != 0)
+	status = decode_putrootfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4847,24 +5124,25 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+	status = decode_remove(xdr, &res->cinfo);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4873,34 +5151,38 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+	if (decode_getfattr(xdr, res->new_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server,
+	decode_getfattr(xdr, res->old_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4909,37 +5191,41 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs4_link_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+	status = decode_link(xdr, &res->cinfo);
+	if (status)
 		goto out;
 	/*
 	 * Note order: OP_LINK leaves the directory as the current
 	 * filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+	if (decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4948,33 +5234,37 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_create_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
+	status = decode_create(xdr, &res->dir_cinfo);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server,
+	if (decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server,
+	decode_getfattr(xdr, res->dir_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4983,31 +5273,31 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_create_res *res)
 {
-	return nfs4_xdr_dec_create(rqstp, p, res);
+	return nfs4_xdr_dec_create(rqstp, xdr, res);
 }
 
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_getattr_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5016,46 +5306,40 @@ out:
 /*
  * Encode an SETACL request
  */
-static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_setaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	int status;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	status = encode_setacl(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setacl(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return status;
 }
 
 /*
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_setaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 out:
 	return status;
 }
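Note the asymmetry introduced above: the encode path loses its return value entirely. nfs4_xdr_enc_setacl() becomes void and drops its status local, reflecting the convention suggested by this change that encoders cannot fail once transmit-buffer space has been reserved, while decoders keep returning errors for malformed replies.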
@@ -5064,24 +5348,22 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_getaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, &res->acl_len);
+	status = decode_getacl(xdr, rqstp, &res->acl_len);
 
 out:
 	return status;
@@ -5090,23 +5372,22 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_close(&xdr, res);
+	status = decode_close(xdr, res);
 	if (status != 0)
 		goto out;
 	/*
@@ -5115,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 * an ESTALE error. Shouldn't be a problem,
 	 * though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5124,36 +5405,35 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(&xdr);
+	status = decode_savefh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	if (decode_getfh(&xdr, &res->fh) != 0)
+	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server,
+	if (decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if (decode_restorefh(&xdr) != 0)
+	if (decode_restorefh(xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5162,20 +5442,20 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs_open_confirmres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_confirm(&xdr, res);
+	status = decode_open_confirm(xdr, res);
 out:
 	return status;
 }
@@ -5183,26 +5463,26 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server,
+	decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5211,26 +5491,26 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nfs_setattrres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5239,23 +5519,22 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_lock_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lock(&xdr, res);
+	status = decode_lock(xdr, res);
 out:
 	return status;
 }
@@ -5263,23 +5542,22 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_lockt_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lockt(&xdr, res);
+	status = decode_lockt(xdr, res);
 out:
 	return status;
 }
@@ -5287,61 +5565,58 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_locku_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_locku(&xdr, res);
+	status = decode_locku(xdr, res);
 out:
 	return status;
 }
 
-static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+					  struct xdr_stream *xdr, void *dummy)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_release_lockowner(&xdr);
+		status = decode_release_lockowner(xdr);
 	return status;
 }
 
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
 				 struct nfs4_readlink_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readlink(&xdr, rqstp);
+	status = decode_readlink(xdr, rqstp);
 out:
 	return status;
 }
@@ -5349,23 +5624,22 @@ out:
 /*
  * Decode READDIR response
  */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_readdir_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readdir(&xdr, rqstp, res);
+	status = decode_readdir(xdr, rqstp, res);
 out:
 	return status;
 }
@@ -5373,23 +5647,22 @@ out:
 /*
  * Decode Read response
  */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_readres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_read(&xdr, rqstp, res);
+	status = decode_read(xdr, rqstp, res);
 	if (!status)
 		status = res->count;
 out:
@@ -5399,26 +5672,25 @@ out:
 /*
  * Decode WRITE response
  */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_writeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_write(&xdr, res);
+	status = decode_write(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 		status = res->count;
@@ -5429,26 +5701,25 @@ out:
 /*
  * Decode COMMIT response
  */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_writeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_commit(&xdr, res);
+	status = decode_commit(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5457,85 +5728,80 @@ out:
 /*
  * Decode FSINFO response
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
 			       struct nfs4_fsinfo_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, res->fsinfo);
+		status = decode_fsinfo(xdr, res->fsinfo);
 	return status;
 }
 
 /*
  * Decode PATHCONF response
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 struct nfs4_pathconf_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_pathconf(&xdr, res->pathconf);
+		status = decode_pathconf(xdr, res->pathconf);
 	return status;
 }
 
 /*
  * Decode STATFS response
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
 			       struct nfs4_statfs_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_statfs(&xdr, res->fsstat);
+		status = decode_statfs(xdr, res->fsstat);
 	return status;
 }
 
 /*
  * Decode GETATTR_BITMAP response
  */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_server_caps_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, req);
+	status = decode_sequence(xdr, &res->seq_res, req);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	status = decode_server_caps(&xdr, res);
+	status = decode_server_caps(xdr, res);
 out:
 	return status;
 }
@@ -5543,79 +5809,77 @@ out:
 /*
  * Decode RENEW response
  */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      void *__unused)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_renew(&xdr);
+		status = decode_renew(xdr);
 	return status;
 }
 
 /*
  * Decode SETCLIENTID response
  */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_setclientid_res *res)
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_setclientid_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_setclientid(&xdr, res);
+		status = decode_setclientid(xdr, res);
 	return status;
 }
 
 /*
  * Decode SETCLIENTID_CONFIRM response
  */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+					    struct xdr_stream *xdr,
+					    struct nfs_fsinfo *fsinfo)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_setclientid_confirm(&xdr);
+		status = decode_setclientid_confirm(xdr);
 	if (!status)
-		status = decode_putrootfh(&xdr);
+		status = decode_putrootfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, fsinfo);
+		status = decode_fsinfo(xdr, fsinfo);
 	return status;
 }
 
 /*
  * Decode DELEGRETURN response
  */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_delegreturnres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_delegreturn(&xdr);
+	status = decode_delegreturn(xdr);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
@@ -5624,26 +5888,27 @@ out:
 /*
  * Decode FS_LOCATIONS response
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
 				     struct nfs4_fs_locations_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, req);
+	status = decode_sequence(xdr, &res->seq_res, req);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	xdr_enter_page(&xdr, PAGE_SIZE);
-	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+	xdr_enter_page(xdr, PAGE_SIZE);
+	status = decode_getfattr(xdr, &res->fs_locations->fattr,
 				 res->fs_locations->server,
 				 !RPC_IS_ASYNC(req->rq_task));
 out:
@@ -5654,129 +5919,194 @@ out:
 /*
  * Decode EXCHANGE_ID response
  */
-static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
 				    void *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_exchange_id(&xdr, res);
+		status = decode_exchange_id(xdr, res);
 	return status;
 }
 
 /*
  * Decode CREATE_SESSION response
  */
-static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
 				       struct nfs41_create_session_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_create_session(&xdr, res);
+		status = decode_create_session(xdr, res);
 	return status;
 }
 
 /*
  * Decode DESTROY_SESSION response
  */
-static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
-					void *dummy)
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_destroy_session(&xdr, dummy);
+		status = decode_destroy_session(xdr, res);
 	return status;
 }
 
 /*
  * Decode SEQUENCE response
  */
-static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
 				 struct nfs4_sequence_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, res, rqstp);
+		status = decode_sequence(xdr, res, rqstp);
 	return status;
 }
 
 /*
  * Decode GET_LEASE_TIME response
  */
-static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
 				       struct nfs4_get_lease_time_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+		status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
 	if (!status)
-		status = decode_putrootfh(&xdr);
+		status = decode_putrootfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, res->lr_fsinfo);
+		status = decode_fsinfo(xdr, res->lr_fsinfo);
 	return status;
 }
 
 /*
  * Decode RECLAIM_COMPLETE response
  */
-static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+					 struct xdr_stream *xdr,
 					 struct nfs41_reclaim_complete_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, rqstp);
5757 if (!status) 6016 if (!status)
5758 status = decode_reclaim_complete(&xdr, (void *)NULL); 6017 status = decode_reclaim_complete(xdr, (void *)NULL);
6018 return status;
6019}
6020
6021/*
6022 * Decode GETDEVICEINFO response
6023 */
6024static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
6025 struct xdr_stream *xdr,
6026 struct nfs4_getdeviceinfo_res *res)
6027{
6028 struct compound_hdr hdr;
6029 int status;
6030
6031 status = decode_compound_hdr(xdr, &hdr);
6032 if (status != 0)
6033 goto out;
6034 status = decode_sequence(xdr, &res->seq_res, rqstp);
6035 if (status != 0)
6036 goto out;
6037 status = decode_getdeviceinfo(xdr, res->pdev);
6038out:
6039 return status;
6040}
6041
6042/*
6043 * Decode LAYOUTGET response
6044 */
6045static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6046 struct xdr_stream *xdr,
6047 struct nfs4_layoutget_res *res)
6048{
6049 struct compound_hdr hdr;
6050 int status;
6051
6052 status = decode_compound_hdr(xdr, &hdr);
6053 if (status)
6054 goto out;
6055 status = decode_sequence(xdr, &res->seq_res, rqstp);
6056 if (status)
6057 goto out;
6058 status = decode_putfh(xdr);
6059 if (status)
6060 goto out;
6061 status = decode_layoutget(xdr, rqstp, res);
6062out:
5759 return status; 6063 return status;
5760} 6064}
5761#endif /* CONFIG_NFS_V4_1 */ 6065#endif /* CONFIG_NFS_V4_1 */
5762 6066
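Every COMPOUND reply decoder in this hunk follows the same conversion: the raw __be32 *p argument and the per-function xdr_init_decode() call are gone, and each decoder now receives a struct xdr_stream that the RPC layer has already set up. A minimal sketch of the converted shape, using an invented FOO operation (nfs4_foo_res and decode_foo() are placeholders, not part of this patch):

	/* Sketch only: a COMPOUND reply decoder in the new calling
	 * convention. The caller owns xdr stream setup; the decoder
	 * just consumes operations in order. */
	static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp,
				    struct xdr_stream *xdr,
				    struct nfs4_foo_res *res)
	{
		struct compound_hdr hdr;
		int status;

		status = decode_compound_hdr(xdr, &hdr);
		if (status)
			goto out;
		status = decode_sequence(xdr, &res->seq_res, rqstp);
		if (status)
			goto out;
		status = decode_putfh(xdr);
		if (status)
			goto out;
		status = decode_foo(xdr, res);
	out:
		return status;
	}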
5763__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 6067/**
6068 * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
6069 * the local page cache.
6070 * @xdr: XDR stream where entry resides
6071 * @entry: buffer to fill in with entry data
6072 * @plus: boolean indicating whether this should be a readdirplus entry
6073 *
6074 * Returns zero if successful, otherwise a negative errno value is
6075 * returned.
6076 *
6077 * This function is not invoked during READDIR reply decoding, but
6078 * rather whenever an application invokes the getdents(2) system call
6079 * on a directory already in our cache.
6080 */
6081int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6082 int plus)
5764{ 6083{
5765 uint32_t bitmap[2] = {0}; 6084 uint32_t bitmap[2] = {0};
5766 uint32_t len; 6085 uint32_t len;
5767 6086 __be32 *p = xdr_inline_decode(xdr, 4);
5768 if (!*p++) { 6087 if (unlikely(!p))
5769 if (!*p) 6088 goto out_overflow;
5770 return ERR_PTR(-EAGAIN); 6089 if (*p == xdr_zero) {
6090 p = xdr_inline_decode(xdr, 4);
6091 if (unlikely(!p))
6092 goto out_overflow;
6093 if (*p == xdr_zero)
6094 return -EAGAIN;
5771 entry->eof = 1; 6095 entry->eof = 1;
5772 return ERR_PTR(-EBADCOOKIE); 6096 return -EBADCOOKIE;
5773 } 6097 }
5774 6098
6099 p = xdr_inline_decode(xdr, 12);
6100 if (unlikely(!p))
6101 goto out_overflow;
5775 entry->prev_cookie = entry->cookie; 6102 entry->prev_cookie = entry->cookie;
5776 p = xdr_decode_hyper(p, &entry->cookie); 6103 p = xdr_decode_hyper(p, &entry->cookie);
5777 entry->len = ntohl(*p++); 6104 entry->len = be32_to_cpup(p);
6105
6106 p = xdr_inline_decode(xdr, entry->len);
6107 if (unlikely(!p))
6108 goto out_overflow;
5778 entry->name = (const char *) p; 6109 entry->name = (const char *) p;
5779 p += XDR_QUADLEN(entry->len);
5780 6110
5781 /* 6111 /*
5782 * In case the server doesn't return an inode number, 6112 * In case the server doesn't return an inode number,
@@ -5784,32 +6114,29 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
5784 * since glibc seems to choke on it...) 6114 * since glibc seems to choke on it...)
5785 */ 6115 */
5786 entry->ino = 1; 6116 entry->ino = 1;
6117 entry->fattr->valid = 0;
5787 6118
5788 len = ntohl(*p++); /* bitmap length */ 6119 if (decode_attr_bitmap(xdr, bitmap) < 0)
5789 if (len-- > 0) { 6120 goto out_overflow;
5790 bitmap[0] = ntohl(*p++);
5791 if (len-- > 0) {
5792 bitmap[1] = ntohl(*p++);
5793 p += len;
5794 }
5795 }
5796 len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */
5797 if (len > 0) {
5798 if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
5799 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
5800 /* Ignore the return value of rdattr_error for now */
5801 p++;
5802 len--;
5803 }
5804 if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
5805 xdr_decode_hyper(p, &entry->ino);
5806 else if (bitmap[0] == FATTR4_WORD0_FILEID)
5807 xdr_decode_hyper(p, &entry->ino);
5808 p += len;
5809 }
5810 6121
5811 entry->eof = !p[0] && p[1]; 6122 if (decode_attr_length(xdr, &len, &p) < 0)
5812 return p; 6123 goto out_overflow;
6124
6125 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6126 entry->server, 1) < 0)
6127 goto out_overflow;
6128 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6129 entry->ino = entry->fattr->fileid;
6130
6131 entry->d_type = DT_UNKNOWN;
6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6134
6135 return 0;
6136
6137out_overflow:
6138 print_overflow_msg(__func__, xdr);
6139 return -EAGAIN;
5813} 6140}
5814 6141
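The rewritten nfs4_decode_dirent() above replaces blind pointer arithmetic with xdr_inline_decode(), which returns NULL instead of reading past the end of the page data, so every fixed-size read is bounds-checked before it is dereferenced. The idiom in isolation, as a hedged sketch (the helper name is invented):

	/* Sketch: bounds-checked XDR reads. xdr_inline_decode() reserves
	 * nbytes from the stream and returns NULL on overflow, replacing
	 * the old unchecked "*p++" style. */
	static int read_cookie_and_namelen(struct xdr_stream *xdr,
					   u64 *cookie, u32 *namelen)
	{
		__be32 *p = xdr_inline_decode(xdr, 12); /* 8-byte cookie + length */

		if (unlikely(!p))
			return -EAGAIN;		/* ran off the end of the buffer */
		p = xdr_decode_hyper(p, cookie);
		*namelen = be32_to_cpup(p);
		return 0;
	}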
5815/* 6142/*
@@ -5885,8 +6212,8 @@ nfs4_stat_to_errno(int stat)
5885#define PROC(proc, argtype, restype) \ 6212#define PROC(proc, argtype, restype) \
5886[NFSPROC4_CLNT_##proc] = { \ 6213[NFSPROC4_CLNT_##proc] = { \
5887 .p_proc = NFSPROC4_COMPOUND, \ 6214 .p_proc = NFSPROC4_COMPOUND, \
5888 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 6215 .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \
5889 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 6216 .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \
5890 .p_arglen = NFS4_##argtype##_sz, \ 6217 .p_arglen = NFS4_##argtype##_sz, \
5891 .p_replen = NFS4_##restype##_sz, \ 6218 .p_replen = NFS4_##restype##_sz, \
5892 .p_statidx = NFSPROC4_CLNT_##proc, \ 6219 .p_statidx = NFSPROC4_CLNT_##proc, \
@@ -5894,48 +6221,50 @@ nfs4_stat_to_errno(int stat)
5894} 6221}
5895 6222
5896struct rpc_procinfo nfs4_procedures[] = { 6223struct rpc_procinfo nfs4_procedures[] = {
5897 PROC(READ, enc_read, dec_read), 6224 PROC(READ, enc_read, dec_read),
5898 PROC(WRITE, enc_write, dec_write), 6225 PROC(WRITE, enc_write, dec_write),
5899 PROC(COMMIT, enc_commit, dec_commit), 6226 PROC(COMMIT, enc_commit, dec_commit),
5900 PROC(OPEN, enc_open, dec_open), 6227 PROC(OPEN, enc_open, dec_open),
5901 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), 6228 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
5902 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), 6229 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
5903 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), 6230 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
5904 PROC(CLOSE, enc_close, dec_close), 6231 PROC(CLOSE, enc_close, dec_close),
5905 PROC(SETATTR, enc_setattr, dec_setattr), 6232 PROC(SETATTR, enc_setattr, dec_setattr),
5906 PROC(FSINFO, enc_fsinfo, dec_fsinfo), 6233 PROC(FSINFO, enc_fsinfo, dec_fsinfo),
5907 PROC(RENEW, enc_renew, dec_renew), 6234 PROC(RENEW, enc_renew, dec_renew),
5908 PROC(SETCLIENTID, enc_setclientid, dec_setclientid), 6235 PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
5909 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), 6236 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
5910 PROC(LOCK, enc_lock, dec_lock), 6237 PROC(LOCK, enc_lock, dec_lock),
5911 PROC(LOCKT, enc_lockt, dec_lockt), 6238 PROC(LOCKT, enc_lockt, dec_lockt),
5912 PROC(LOCKU, enc_locku, dec_locku), 6239 PROC(LOCKU, enc_locku, dec_locku),
5913 PROC(ACCESS, enc_access, dec_access), 6240 PROC(ACCESS, enc_access, dec_access),
5914 PROC(GETATTR, enc_getattr, dec_getattr), 6241 PROC(GETATTR, enc_getattr, dec_getattr),
5915 PROC(LOOKUP, enc_lookup, dec_lookup), 6242 PROC(LOOKUP, enc_lookup, dec_lookup),
5916 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), 6243 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
5917 PROC(REMOVE, enc_remove, dec_remove), 6244 PROC(REMOVE, enc_remove, dec_remove),
5918 PROC(RENAME, enc_rename, dec_rename), 6245 PROC(RENAME, enc_rename, dec_rename),
5919 PROC(LINK, enc_link, dec_link), 6246 PROC(LINK, enc_link, dec_link),
5920 PROC(SYMLINK, enc_symlink, dec_symlink), 6247 PROC(SYMLINK, enc_symlink, dec_symlink),
5921 PROC(CREATE, enc_create, dec_create), 6248 PROC(CREATE, enc_create, dec_create),
5922 PROC(PATHCONF, enc_pathconf, dec_pathconf), 6249 PROC(PATHCONF, enc_pathconf, dec_pathconf),
5923 PROC(STATFS, enc_statfs, dec_statfs), 6250 PROC(STATFS, enc_statfs, dec_statfs),
5924 PROC(READLINK, enc_readlink, dec_readlink), 6251 PROC(READLINK, enc_readlink, dec_readlink),
5925 PROC(READDIR, enc_readdir, dec_readdir), 6252 PROC(READDIR, enc_readdir, dec_readdir),
5926 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 6253 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
5927 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 6254 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
5928 PROC(GETACL, enc_getacl, dec_getacl), 6255 PROC(GETACL, enc_getacl, dec_getacl),
5929 PROC(SETACL, enc_setacl, dec_setacl), 6256 PROC(SETACL, enc_setacl, dec_setacl),
5930 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6257 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
5931 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6258 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
5932#if defined(CONFIG_NFS_V4_1) 6259#if defined(CONFIG_NFS_V4_1)
5933 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6260 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
5934 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6261 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
5935 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 6262 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
5936 PROC(SEQUENCE, enc_sequence, dec_sequence), 6263 PROC(SEQUENCE, enc_sequence, dec_sequence),
5937 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6264 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
5938 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6265 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6266 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6267 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
5939#endif /* CONFIG_NFS_V4_1 */ 6268#endif /* CONFIG_NFS_V4_1 */
5940}; 6269};
5941 6270
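For reference, expanding PROC(SEQUENCE, enc_sequence, dec_sequence) by hand gives the entry below. The point of the kxdreproc_t/kxdrdproc_t casts is that encode and decode functions now have distinct prototypes, taking a struct xdr_stream, rather than sharing the old kxdrproc_t type:

	/* Sketch: one PROC() entry after macro expansion. */
	[NFSPROC4_CLNT_SEQUENCE] = {
		.p_proc    = NFSPROC4_COMPOUND,
		.p_encode  = (kxdreproc_t)nfs4_xdr_enc_sequence,
		.p_decode  = (kxdrdproc_t)nfs4_xdr_dec_sequence,
		.p_arglen  = NFS4_enc_sequence_sz,
		.p_replen  = NFS4_dec_sequence_sz,
		.p_statidx = NFSPROC4_CLNT_SEQUENCE,
	},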
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
3 * 3 *
4 * Allow an NFS filesystem to be mounted as root. The way this works is: 4 * Allow an NFS filesystem to be mounted as root. The way this works is:
5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. 5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
6 * (2) Handle RPC negotiation with the system which replied to RARP or 6 * (2) Construct the device string and the options string using DHCP
7 * was reported as a boot server by BOOTP or manually. 7 * option 17 and/or kernel command line options.
8 * (3) The actual mounting is done later, when init() is running. 8 * (3) When mount_root() sets up the root file system, pass these strings
9 * to the NFS client's regular mount interface via sys_mount().
9 * 10 *
10 * 11 *
11 * Changes: 12 * Changes:
@@ -65,470 +66,245 @@
65 * Hua Qin : Support for mounting root file system via 66 * Hua Qin : Support for mounting root file system via
66 * NFS over TCP. 67 * NFS over TCP.
67 * Fabian Frederick: Option parser rebuilt (using parser lib) 68 * Fabian Frederick: Option parser rebuilt (using parser lib)
68*/ 69 * Chuck Lever : Use super.c's text-based mount option parsing
70 * Chuck Lever : Add "nfsrootdebug".
71 */
69 72
70#include <linux/types.h> 73#include <linux/types.h>
71#include <linux/string.h> 74#include <linux/string.h>
72#include <linux/kernel.h>
73#include <linux/time.h>
74#include <linux/fs.h>
75#include <linux/init.h> 75#include <linux/init.h>
76#include <linux/sunrpc/clnt.h>
77#include <linux/sunrpc/xprtsock.h>
78#include <linux/nfs.h> 76#include <linux/nfs.h>
79#include <linux/nfs_fs.h> 77#include <linux/nfs_fs.h>
80#include <linux/nfs_mount.h>
81#include <linux/in.h>
82#include <linux/major.h>
83#include <linux/utsname.h> 78#include <linux/utsname.h>
84#include <linux/inet.h>
85#include <linux/root_dev.h> 79#include <linux/root_dev.h>
86#include <net/ipconfig.h> 80#include <net/ipconfig.h>
87#include <linux/parser.h>
88 81
89#include "internal.h" 82#include "internal.h"
90 83
91/* Define this to allow debugging output */
92#undef NFSROOT_DEBUG
93#define NFSDBG_FACILITY NFSDBG_ROOT 84#define NFSDBG_FACILITY NFSDBG_ROOT
94 85
95/* Default port to use if server is not running a portmapper */
96#define NFS_MNT_PORT 627
97
98/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
99#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
100 88
101/* Parameters passed from the kernel command line */ 89/* Parameters passed from the kernel command line */
102static char nfs_root_name[256] __initdata = ""; 90static char nfs_root_parms[256] __initdata = "";
91
92/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = "";
103 94
104/* Address of NFS server */ 95/* Address of NFS server */
105static __be32 servaddr __initdata = 0; 96static __be32 servaddr __initdata = htonl(INADDR_NONE);
106 97
107/* Name of directory to mount */ 98/* Name of directory to mount */
108static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; 99static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
109
110/* NFS-related data */
111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
112static int nfs_port __initdata = 0; /* Port to connect to for NFS */
113static int mount_port __initdata = 0; /* Mount daemon port number */
114
115
116/***************************************************************************
117
118 Parsing of options
119
120 ***************************************************************************/
121
122enum {
123 /* Options that take integer arguments */
124 Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
125 Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
126 /* Options that take no arguments */
127 Opt_soft, Opt_hard, Opt_intr,
128 Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac,
129 Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
130 Opt_acl, Opt_noacl,
131 /* Error token */
132 Opt_err
133};
134
135static const match_table_t tokens __initconst = {
136 {Opt_port, "port=%u"},
137 {Opt_rsize, "rsize=%u"},
138 {Opt_wsize, "wsize=%u"},
139 {Opt_timeo, "timeo=%u"},
140 {Opt_retrans, "retrans=%u"},
141 {Opt_acregmin, "acregmin=%u"},
142 {Opt_acregmax, "acregmax=%u"},
143 {Opt_acdirmin, "acdirmin=%u"},
144 {Opt_acdirmax, "acdirmax=%u"},
145 {Opt_soft, "soft"},
146 {Opt_hard, "hard"},
147 {Opt_intr, "intr"},
148 {Opt_nointr, "nointr"},
149 {Opt_posix, "posix"},
150 {Opt_noposix, "noposix"},
151 {Opt_cto, "cto"},
152 {Opt_nocto, "nocto"},
153 {Opt_ac, "ac"},
154 {Opt_noac, "noac"},
155 {Opt_lock, "lock"},
156 {Opt_nolock, "nolock"},
157 {Opt_v2, "nfsvers=2"},
158 {Opt_v2, "v2"},
159 {Opt_v3, "nfsvers=3"},
160 {Opt_v3, "v3"},
161 {Opt_udp, "proto=udp"},
162 {Opt_udp, "udp"},
163 {Opt_tcp, "proto=tcp"},
164 {Opt_tcp, "tcp"},
165 {Opt_acl, "acl"},
166 {Opt_noacl, "noacl"},
167 {Opt_err, NULL}
168
169};
170 100
101/* server:export path string passed to super.c */
102static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
103
104#ifdef RPC_DEBUG
171/* 105/*
172 * Parse option string. 106 * When the "nfsrootdebug" kernel command line option is specified,
107 * enable debugging messages for NFSROOT.
173 */ 108 */
174 109static int __init nfs_root_debug(char *__unused)
175static int __init root_nfs_parse(char *name, char *buf)
176{ 110{
177 111 nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
178 char *p;
179 substring_t args[MAX_OPT_ARGS];
180 int option;
181
182 if (!name)
183 return 1;
184
185 /* Set the NFS remote path */
186 p = strsep(&name, ",");
187 if (p[0] != '\0' && strcmp(p, "default") != 0)
188 strlcpy(buf, p, NFS_MAXPATHLEN);
189
190 while ((p = strsep (&name, ",")) != NULL) {
191 int token;
192 if (!*p)
193 continue;
194 token = match_token(p, tokens, args);
195
196 /* %u tokens only. Beware if you add new tokens! */
197 if (token < Opt_soft && match_int(&args[0], &option))
198 return 0;
199 switch (token) {
200 case Opt_port:
201 nfs_port = option;
202 break;
203 case Opt_rsize:
204 nfs_data.rsize = option;
205 break;
206 case Opt_wsize:
207 nfs_data.wsize = option;
208 break;
209 case Opt_timeo:
210 nfs_data.timeo = option;
211 break;
212 case Opt_retrans:
213 nfs_data.retrans = option;
214 break;
215 case Opt_acregmin:
216 nfs_data.acregmin = option;
217 break;
218 case Opt_acregmax:
219 nfs_data.acregmax = option;
220 break;
221 case Opt_acdirmin:
222 nfs_data.acdirmin = option;
223 break;
224 case Opt_acdirmax:
225 nfs_data.acdirmax = option;
226 break;
227 case Opt_soft:
228 nfs_data.flags |= NFS_MOUNT_SOFT;
229 break;
230 case Opt_hard:
231 nfs_data.flags &= ~NFS_MOUNT_SOFT;
232 break;
233 case Opt_intr:
234 case Opt_nointr:
235 break;
236 case Opt_posix:
237 nfs_data.flags |= NFS_MOUNT_POSIX;
238 break;
239 case Opt_noposix:
240 nfs_data.flags &= ~NFS_MOUNT_POSIX;
241 break;
242 case Opt_cto:
243 nfs_data.flags &= ~NFS_MOUNT_NOCTO;
244 break;
245 case Opt_nocto:
246 nfs_data.flags |= NFS_MOUNT_NOCTO;
247 break;
248 case Opt_ac:
249 nfs_data.flags &= ~NFS_MOUNT_NOAC;
250 break;
251 case Opt_noac:
252 nfs_data.flags |= NFS_MOUNT_NOAC;
253 break;
254 case Opt_lock:
255 nfs_data.flags &= ~NFS_MOUNT_NONLM;
256 break;
257 case Opt_nolock:
258 nfs_data.flags |= NFS_MOUNT_NONLM;
259 break;
260 case Opt_v2:
261 nfs_data.flags &= ~NFS_MOUNT_VER3;
262 break;
263 case Opt_v3:
264 nfs_data.flags |= NFS_MOUNT_VER3;
265 break;
266 case Opt_udp:
267 nfs_data.flags &= ~NFS_MOUNT_TCP;
268 break;
269 case Opt_tcp:
270 nfs_data.flags |= NFS_MOUNT_TCP;
271 break;
272 case Opt_acl:
273 nfs_data.flags &= ~NFS_MOUNT_NOACL;
274 break;
275 case Opt_noacl:
276 nfs_data.flags |= NFS_MOUNT_NOACL;
277 break;
278 default:
279 printk(KERN_WARNING "Root-NFS: unknown "
280 "option: %s\n", p);
281 return 0;
282 }
283 }
284
285 return 1; 112 return 1;
286} 113}
287 114
115__setup("nfsrootdebug", nfs_root_debug);
116#endif
117
288/* 118/*
289 * Prepare the NFS data structure and parse all options. 119 * Parse NFS server and directory information passed on the kernel
120 * command line.
121 *
122 * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
123 *
124 * If there is a "%s" token in the <root-dir> string, it is replaced
125 * by the ASCII-representation of the client's IP address.
290 */ 126 */
291static int __init root_nfs_name(char *name) 127static int __init nfs_root_setup(char *line)
292{ 128{
293 static char buf[NFS_MAXPATHLEN] __initdata; 129 ROOT_DEV = Root_NFS;
294 char *cp; 130
295 131 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
296 /* Set some default values */ 132 strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
297 memset(&nfs_data, 0, sizeof(nfs_data)); 133 } else {
298 nfs_port = -1; 134 size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
299 nfs_data.version = NFS_MOUNT_VERSION; 135 if (n >= sizeof(nfs_root_parms))
300 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 136 line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
301 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 137 sprintf(nfs_root_parms, NFS_ROOT, line);
302 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
303 nfs_data.acregmin = NFS_DEF_ACREGMIN;
304 nfs_data.acregmax = NFS_DEF_ACREGMAX;
305 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
306 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
307 strcpy(buf, NFS_ROOT);
308
309 /* Process options received from the remote server */
310 root_nfs_parse(root_server_path, buf);
311
312 /* Override them by options set on kernel command-line */
313 root_nfs_parse(name, buf);
314
315 cp = utsname()->nodename;
316 if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
317 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
318 return -1;
319 } 138 }
320 sprintf(nfs_export_path, buf, cp); 139
140 /*
141 * Extract the IP address of the NFS server containing our
142 * root file system, if one was specified.
143 *
144 * Note: root_nfs_parse_addr() removes the server-ip from
145 * nfs_root_parms, if it exists.
146 */
147 root_server_addr = root_nfs_parse_addr(nfs_root_parms);
321 148
322 return 1; 149 return 1;
323} 150}
324 151
152__setup("nfsroot=", nfs_root_setup);
325 153
326/* 154static int __init root_nfs_copy(char *dest, const char *src,
327 * Get NFS server address. 155 const size_t destlen)
328 */
329static int __init root_nfs_addr(void)
330{ 156{
331 if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { 157 if (strlcpy(dest, src, destlen) > destlen)
332 printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
333 return -1; 158 return -1;
334 } 159 return 0;
160}
335 161
336 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), 162static int __init root_nfs_cat(char *dest, const char *src,
337 "%pI4", &servaddr); 163 const size_t destlen)
164{
165 if (strlcat(dest, src, destlen) > destlen)
166 return -1;
338 return 0; 167 return 0;
339} 168}
340 169
341/* 170/*
342 * Tell the user what's going on. 171 * Parse out root export path and mount options from
172 * passed-in string @incoming.
173 *
174 * Copy the export path into @exppath.
343 */ 175 */
344#ifdef NFSROOT_DEBUG 176static int __init root_nfs_parse_options(char *incoming, char *exppath,
345static void __init root_nfs_print(void) 177 const size_t exppathlen)
346{ 178{
347 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 179 char *p;
348 nfs_export_path, nfs_data.hostname);
349 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
350 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
351 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
352 nfs_data.acregmin, nfs_data.acregmax,
353 nfs_data.acdirmin, nfs_data.acdirmax);
354 printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n",
355 nfs_port, mount_port, nfs_data.flags);
356}
357#endif
358
359 180
360static int __init root_nfs_init(void) 181 /*
361{ 182 * Set the NFS remote path
362#ifdef NFSROOT_DEBUG 183 */
363 nfs_debug |= NFSDBG_ROOT; 184 p = strsep(&incoming, ",");
364#endif 185 if (*p != '\0' && strcmp(p, "default") != 0)
186 if (root_nfs_copy(exppath, p, exppathlen))
187 return -1;
365 188
366 /* 189 /*
367 * Decode the root directory path name and NFS options from 190 * @incoming now points to the rest of the string; if it
368 * the kernel command line. This has to go here in order to 191 * contains something, append it to our root options buffer
369 * be able to use the client IP address for the remote root
370 * directory (necessary for pure RARP booting).
371 */ 192 */
372 if (root_nfs_name(nfs_root_name) < 0 || 193 if (incoming != NULL && *incoming != '\0')
373 root_nfs_addr() < 0) 194 if (root_nfs_cat(nfs_root_options, incoming,
374 return -1; 195 sizeof(nfs_root_options)))
196 return -1;
375 197
376#ifdef NFSROOT_DEBUG 198 /*
377 root_nfs_print(); 199 * Possibly prepare for more options to be appended
378#endif 200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
379 206
380 return 0; 207 return 0;
381} 208}
382 209
383
384/* 210/*
385 * Parse NFS server and directory information passed on the kernel 211 * Decode the export directory path name and NFS options from
386 * command line. 212 * the kernel command line. This has to be done late in order to
213 * use a dynamically acquired client IP address for the remote
214 * root directory path.
215 *
216 * Returns zero if successful; otherwise -1 is returned.
387 */ 217 */
388static int __init nfs_root_setup(char *line) 218static int __init root_nfs_data(char *cmdline)
389{ 219{
390 ROOT_DEV = Root_NFS; 220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
391 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { 221 int len, retval = -1;
392 strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); 222 char *tmp = NULL;
393 } else { 223 const size_t tmplen = sizeof(nfs_export_path);
394 int n = strlen(line) + sizeof(NFS_ROOT) - 1; 224
395 if (n >= sizeof(nfs_root_name)) 225 tmp = kzalloc(tmplen, GFP_KERNEL);
396 line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; 226 if (tmp == NULL)
397 sprintf(nfs_root_name, NFS_ROOT, line); 227 goto out_nomem;
228 strcpy(tmp, NFS_ROOT);
229
230 if (root_server_path[0] != '\0') {
231 dprintk("Root-NFS: DHCPv4 option 17: %s\n",
232 root_server_path);
233 if (root_nfs_parse_options(root_server_path, tmp, tmplen))
234 goto out_optionstoolong;
398 } 235 }
399 root_server_addr = root_nfs_parse_addr(nfs_root_name);
400 return 1;
401}
402
403__setup("nfsroot=", nfs_root_setup);
404
405/***************************************************************************
406 236
407 Routines to actually mount the root directory 237 if (cmdline[0] != '\0') {
238 dprintk("Root-NFS: nfsroot=%s\n", cmdline);
239 if (root_nfs_parse_options(cmdline, tmp, tmplen))
240 goto out_optionstoolong;
241 }
408 242
409 ***************************************************************************/ 243 /*
244 * Append mandatory options for nfsroot so they override
245 * what has come before
246 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
248 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option,
250 sizeof(nfs_root_options)))
251 goto out_optionstoolong;
410 252
411/* 253 /*
412 * Construct sockaddr_in from address and port number. 254 * Set up nfs_root_device. For NFS mounts, this looks like
413 */ 255 *
414static inline void 256 * server:/path
415set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) 257 *
416{ 258 * At this point, utsname()->nodename contains our local
417 sin->sin_family = AF_INET; 259 * IP address or hostname, set by ipconfig. If "%s" exists
418 sin->sin_addr.s_addr = addr; 260 * in tmp, substitute the nodename, then shovel the whole
419 sin->sin_port = port; 261 * mess into nfs_root_device.
420} 262 */
263 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
264 tmp, utsname()->nodename);
265 if (len > (int)sizeof(nfs_export_path))
266 goto out_devnametoolong;
267 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
268 "%pI4:%s", &servaddr, nfs_export_path);
269 if (len > (int)sizeof(nfs_root_device))
270 goto out_devnametoolong;
421 271
422/* 272 retval = 0;
423 * Query server portmapper for the port of a daemon program.
424 */
425static int __init root_nfs_getport(int program, int version, int proto)
426{
427 struct sockaddr_in sin;
428 273
429 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", 274out:
430 program, version, &servaddr); 275 kfree(tmp);
431 set_sockaddr(&sin, servaddr, 0); 276 return retval;
432 return rpcb_getport_sync(&sin, program, version, proto); 277out_nomem:
278 printk(KERN_ERR "Root-NFS: could not allocate memory\n");
279 goto out;
280out_optionstoolong:
281 printk(KERN_ERR "Root-NFS: mount options string too long\n");
282 goto out;
283out_devnametoolong:
284 printk(KERN_ERR "Root-NFS: root device name too long.\n");
285 goto out;
433} 286}
434 287
435 288/**
436/* 289 * nfs_root_data - Return prepared 'data' for NFSROOT mount
437 * Use portmapper to find mountd and nfsd port numbers if not overriden 290 * @root_device: OUT: address of string containing NFSROOT device
438 * by the user. Use defaults if portmapper is not available. 291 * @root_data: OUT: address of string containing NFSROOT mount options
439 * XXX: Is there any nfs server with no portmapper? 292 *
293 * Returns zero and sets @root_device and @root_data if successful,
294 * otherwise -1 is returned.
440 */ 295 */
441static int __init root_nfs_ports(void) 296int __init nfs_root_data(char **root_device, char **root_data)
442{ 297{
443 int port; 298 servaddr = root_server_addr;
444 int nfsd_ver, mountd_ver; 299 if (servaddr == htonl(INADDR_NONE)) {
445 int nfsd_port, mountd_port; 300 printk(KERN_ERR "Root-NFS: no NFS server address\n");
446 int proto; 301 return -1;
447
448 if (nfs_data.flags & NFS_MOUNT_VER3) {
449 nfsd_ver = NFS3_VERSION;
450 mountd_ver = NFS_MNT3_VERSION;
451 nfsd_port = NFS_PORT;
452 mountd_port = NFS_MNT_PORT;
453 } else {
454 nfsd_ver = NFS2_VERSION;
455 mountd_ver = NFS_MNT_VERSION;
456 nfsd_port = NFS_PORT;
457 mountd_port = NFS_MNT_PORT;
458 }
459
460 proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
461
462 if (nfs_port < 0) {
463 if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
464 printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
465 "number from server, using default\n");
466 port = nfsd_port;
467 }
468 nfs_port = port;
469 dprintk("Root-NFS: Portmapper on server returned %d "
470 "as nfsd port\n", port);
471 } 302 }
472 303
473 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { 304 if (root_nfs_data(nfs_root_parms) < 0)
474 printk(KERN_ERR "Root-NFS: Unable to get mountd port " 305 return -1;
475 "number from server, using default\n");
476 port = mountd_port;
477 }
478 mount_port = port;
479 dprintk("Root-NFS: mountd port is %d\n", port);
480 306
307 *root_device = nfs_root_device;
308 *root_data = nfs_root_options;
481 return 0; 309 return 0;
482} 310}
483
484
485/*
486 * Get a file handle from the server for the directory which is to be
487 * mounted.
488 */
489static int __init root_nfs_get_handle(void)
490{
491 struct sockaddr_in sin;
492 unsigned int auth_flav_len = 0;
493 struct nfs_mount_request request = {
494 .sap = (struct sockaddr *)&sin,
495 .salen = sizeof(sin),
496 .dirpath = nfs_export_path,
497 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
501 .auth_flav_len = &auth_flav_len,
502 };
503 int status = -ENOMEM;
504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
508 set_sockaddr(&sin, servaddr, htons(mount_port));
509 status = nfs_mount(&request);
510 if (status < 0)
511 printk(KERN_ERR "Root-NFS: Server returned error %d "
512 "while mounting %s\n", status, nfs_export_path);
513 else {
514 nfs_data.root.size = request.fh->size;
515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
516 }
517 nfs_free_fhandle(request.fh);
518out:
519 return status;
520}
521
522/*
523 * Get the NFS port numbers and file handle, and return the prepared 'data'
524 * argument for mount() if everything went OK. Return NULL otherwise.
525 */
526void * __init nfs_root_data(void)
527{
528 if (root_nfs_init() < 0
529 || root_nfs_ports() < 0
530 || root_nfs_get_handle() < 0)
531 return NULL;
532 set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
533 return (void*)&nfs_data;
534}
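To make the new nfsroot flow concrete, here is a hedged walk-through with an invented configuration. Given a command line of

	nfsroot=192.168.17.5:/export/%s,tcp,nfsvers=3

and a nodename of "client1" assigned by ipconfig, root_nfs_parse_addr() peels the server address off nfs_root_parms, root_nfs_parse_options() splits the export path from the option list, and root_nfs_data() ends up with approximately:

	nfs_export_path  = "/export/client1"
	nfs_root_device  = "192.168.17.5:/export/client1"
	nfs_root_options = "tcp,nfsvers=3,nolock,addr=192.168.17.5"

mount_root() then passes these two strings to the NFS client's regular text-based mount interface via sys_mount(), instead of filling in the old binary struct nfs_mount_data and probing portmapper and mountd by hand.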
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 919490232e17..e1164e3f9e69 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
26static inline struct nfs_page * 26static inline struct nfs_page *
27nfs_page_alloc(void) 27nfs_page_alloc(void)
28{ 28{
29 struct nfs_page *p; 29 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
30 p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); 30 if (p)
31 if (p) {
32 memset(p, 0, sizeof(*p));
33 INIT_LIST_HEAD(&p->wb_list); 31 INIT_LIST_HEAD(&p->wb_list);
34 }
35 return p; 32 return p;
36} 33}
37 34
@@ -65,6 +62,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
65 if (req == NULL) 62 if (req == NULL)
66 return ERR_PTR(-ENOMEM); 63 return ERR_PTR(-ENOMEM);
67 64
65 /* get lock context early so we can deal with alloc failures */
66 req->wb_lock_context = nfs_get_lock_context(ctx);
67 if (req->wb_lock_context == NULL) {
68 nfs_page_free(req);
69 return ERR_PTR(-ENOMEM);
70 }
71
68 /* Initialize the request struct. Initially, we assume a 72 /* Initialize the request struct. Initially, we assume a
69 * long write-back delay. This will be adjusted in 73 * long write-back delay. This will be adjusted in
70 * update_nfs_request below if the region is not locked. */ 74 * update_nfs_request below if the region is not locked. */
@@ -79,7 +83,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
79 req->wb_pgbase = offset; 83 req->wb_pgbase = offset;
80 req->wb_bytes = count; 84 req->wb_bytes = count;
81 req->wb_context = get_nfs_open_context(ctx); 85 req->wb_context = get_nfs_open_context(ctx);
82 req->wb_lock_context = nfs_get_lock_context(ctx);
83 kref_init(&req->wb_kref); 86 kref_init(&req->wb_kref);
84 return req; 87 return req;
85} 88}
@@ -109,7 +112,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
109{ 112{
110 if (!nfs_lock_request_dontget(req)) 113 if (!nfs_lock_request_dontget(req))
111 return 0; 114 return 0;
112 if (req->wb_page != NULL) 115 if (test_bit(PG_MAPPED, &req->wb_flags))
113 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 116 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
114 return 1; 117 return 1;
115} 118}
@@ -119,7 +122,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
119 */ 122 */
120void nfs_clear_page_tag_locked(struct nfs_page *req) 123void nfs_clear_page_tag_locked(struct nfs_page *req)
121{ 124{
122 if (req->wb_page != NULL) { 125 if (test_bit(PG_MAPPED, &req->wb_flags)) {
123 struct inode *inode = req->wb_context->path.dentry->d_inode; 126 struct inode *inode = req->wb_context->path.dentry->d_inode;
124 struct nfs_inode *nfsi = NFS_I(inode); 127 struct nfs_inode *nfsi = NFS_I(inode);
125 128
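The nfs_create_request() change above is a standard fail-early reordering: the allocation that can fail, nfs_get_lock_context(), is taken before any other references, so the error path is a plain nfs_page_free() instead of a partial teardown. The shape of the pattern, with invented names:

	/* Sketch: acquire the fallible resource first, while the new
	 * object holds no other references, so cleanup on failure is
	 * a single free. */
	struct request *request_alloc(struct context *ctx)
	{
		struct request *req = kzalloc(sizeof(*req), GFP_KERNEL);

		if (!req)
			return ERR_PTR(-ENOMEM);
		req->res = get_resource(ctx);	/* the call that may fail */
		if (!req->res) {
			kfree(req);
			return ERR_PTR(-ENOMEM);
		}
		/* further reference counts are taken only after this point */
		return req;
	}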
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..1b1bc1a0fb0a
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,965 @@
1/*
2 * pNFS functions to call and manage layout drivers.
3 *
4 * Copyright (c) 2002 [year of first publication]
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#include <linux/nfs_fs.h>
31#include "internal.h"
32#include "pnfs.h"
33
34#define NFSDBG_FACILITY NFSDBG_PNFS
35
36/* Locking:
37 *
38 * pnfs_spinlock:
39 * protects pnfs_modules_tbl.
40 */
41static DEFINE_SPINLOCK(pnfs_spinlock);
42
43/*
44 * pnfs_modules_tbl holds all pnfs modules
45 */
46static LIST_HEAD(pnfs_modules_tbl);
47
48/* Return the registered pnfs layout driver module matching given id */
49static struct pnfs_layoutdriver_type *
50find_pnfs_driver_locked(u32 id)
51{
52 struct pnfs_layoutdriver_type *local;
53
54 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
55 if (local->id == id)
56 goto out;
57 local = NULL;
58out:
59 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
60 return local;
61}
62
63static struct pnfs_layoutdriver_type *
64find_pnfs_driver(u32 id)
65{
66 struct pnfs_layoutdriver_type *local;
67
68 spin_lock(&pnfs_spinlock);
69 local = find_pnfs_driver_locked(id);
70 spin_unlock(&pnfs_spinlock);
71 return local;
72}
73
74void
75unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{
77 if (nfss->pnfs_curr_ld) {
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL;
82}
83
84/*
85 * Try to set the server's pnfs module to the pnfs layout type specified by id.
86 * Currently only one pNFS layout driver per filesystem is supported.
87 *
88 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
89 */
90void
91set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
92{
93 struct pnfs_layoutdriver_type *ld_type = NULL;
94
95 if (id == 0)
96 goto out_no_driver;
97 if (!(server->nfs_client->cl_exchange_flags &
98 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
99 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
100 id, server->nfs_client->cl_exchange_flags);
101 goto out_no_driver;
102 }
103 ld_type = find_pnfs_driver(id);
104 if (!ld_type) {
105 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
106 ld_type = find_pnfs_driver(id);
107 if (!ld_type) {
108 dprintk("%s: No pNFS module found for %u.\n",
109 __func__, id);
110 goto out_no_driver;
111 }
112 }
113 if (!try_module_get(ld_type->owner)) {
114 dprintk("%s: Could not grab reference on module\n", __func__);
115 goto out_no_driver;
116 }
117 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) {
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return;
127
128out_no_driver:
129 dprintk("%s: Using NFSv4 I/O\n", __func__);
130 server->pnfs_curr_ld = NULL;
131}
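set_pnfs_layoutdriver() uses the usual lookup, request_module(), re-lookup sequence for on-demand driver loading. A condensed sketch of just that idiom (the helper name is invented; the real function also records the driver in the nfs_server and calls its set_layoutdriver hook):

	/* Sketch: demand-load a layout driver module. The second lookup
	 * is needed because request_module() only loads the module; the
	 * module's own init code performs the registration. */
	static struct pnfs_layoutdriver_type *pnfs_get_driver(u32 id)
	{
		struct pnfs_layoutdriver_type *ld = find_pnfs_driver(id);

		if (!ld) {
			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
			ld = find_pnfs_driver(id);	/* retry after load */
		}
		if (ld && !try_module_get(ld->owner))
			ld = NULL;			/* module is on its way out */
		return ld;
	}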
132
133int
134pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
135{
136 int status = -EINVAL;
137 struct pnfs_layoutdriver_type *tmp;
138
139 if (ld_type->id == 0) {
140 printk(KERN_ERR "%s id 0 is reserved\n", __func__);
141 return status;
142 }
143 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
144 printk(KERN_ERR "%s Layout driver must provide "
145 "alloc_lseg and free_lseg.\n", __func__);
146 return status;
147 }
148
149 spin_lock(&pnfs_spinlock);
150 tmp = find_pnfs_driver_locked(ld_type->id);
151 if (!tmp) {
152 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
153 status = 0;
154 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
155 ld_type->name);
156 } else {
157 printk(KERN_ERR "%s Module with id %d already loaded!\n",
158 __func__, ld_type->id);
159 }
160 spin_unlock(&pnfs_spinlock);
161
162 return status;
163}
164EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
165
166void
167pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
168{
169 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
170 spin_lock(&pnfs_spinlock);
171 list_del(&ld_type->pnfs_tblid);
172 spin_unlock(&pnfs_spinlock);
173}
174EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
175
176/*
177 * pNFS client layout cache
178 */
179
180/* Need to hold i_lock if caller does not already hold reference */
181void
182get_layout_hdr(struct pnfs_layout_hdr *lo)
183{
184 atomic_inc(&lo->plh_refcount);
185}
186
187static void
188destroy_layout_hdr(struct pnfs_layout_hdr *lo)
189{
190 dprintk("%s: freeing layout cache %p\n", __func__, lo);
191 BUG_ON(!list_empty(&lo->plh_layouts));
192 NFS_I(lo->plh_inode)->layout = NULL;
193 kfree(lo);
194}
195
196static void
197put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
198{
199 if (atomic_dec_and_test(&lo->plh_refcount))
200 destroy_layout_hdr(lo);
201}
202
203void
204put_layout_hdr(struct pnfs_layout_hdr *lo)
205{
206 struct inode *inode = lo->plh_inode;
207
208 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
209 destroy_layout_hdr(lo);
210 spin_unlock(&inode->i_lock);
211 }
212}
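put_layout_hdr() leans on atomic_dec_and_lock(), which acquires the spinlock only when the reference count is about to reach zero; the common put therefore stays lock-free while teardown still runs under i_lock. In sketch form, with invented names:

	/* Sketch: refcount drop with locked teardown.
	 * atomic_dec_and_lock() returns nonzero, with the lock held,
	 * only when the counter hit zero; otherwise no lock is taken. */
	static void obj_put(struct obj *o)
	{
		if (atomic_dec_and_lock(&o->refcount, &o->owner_lock)) {
			unhook_from_owner(o);	/* object was visible until now */
			spin_unlock(&o->owner_lock);
			kfree(o);
		}
	}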
213
214static void
215init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
216{
217 INIT_LIST_HEAD(&lseg->pls_list);
218 atomic_set(&lseg->pls_refcount, 1);
219 smp_mb();
220 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
221 lseg->pls_layout = lo;
222}
223
224static void free_lseg(struct pnfs_layout_segment *lseg)
225{
226 struct inode *ino = lseg->pls_layout->plh_inode;
227
228 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
229 /* Matched by get_layout_hdr in pnfs_insert_layout */
230 put_layout_hdr(NFS_I(ino)->layout);
231}
232
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
234 * could sleep, so must be called outside of the lock.
235 * Returns 1 if object was removed, otherwise returns 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) {
245 struct inode *ino = lseg->pls_layout->plh_inode;
246
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
248 list_del(&lseg->pls_list);
249 if (list_empty(&lseg->pls_layout->plh_segs)) {
250 struct nfs_client *clp;
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 }
263 return 0;
264}
265
266static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
268{
269 return (recall_iomode == IOMODE_ANY ||
270 lseg_iomode == recall_iomode);
271}
272
273/* Returns 1 if lseg is removed from list, 0 otherwise */
274static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
275 struct list_head *tmp_list)
276{
277 int rv = 0;
278
279 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
280 /* Remove the reference keeping the lseg in the
281 * list. It will now be removed when all
282 * outstanding io is finished.
283 */
284 rv = put_lseg_locked(lseg, tmp_list);
285 }
286 return rv;
287}
288
289/* Returns the number of matching invalid lsegs remaining in the list
290 * after the call.
291 */
292int
293mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
294 struct list_head *tmp_list,
295 u32 iomode)
296{
297 struct pnfs_layout_segment *lseg, *next;
298 int invalid = 0, removed = 0;
299
300 dprintk("%s:Begin lo %p\n", __func__, lo);
301
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d "
305 "offset %llu length %llu\n", __func__,
306 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
307 lseg->pls_range.length);
308 invalid++;
309 removed += mark_lseg_invalid(lseg, tmp_list);
310 }
311 dprintk("%s:Return %i\n", __func__, invalid - removed);
312 return invalid - removed;
313}
314
315void
316pnfs_free_lseg_list(struct list_head *free_me)
317{
318 struct pnfs_layout_segment *lseg, *tmp;
319
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list);
322 free_lseg(lseg);
323 }
324}
325
326void
327pnfs_destroy_layout(struct nfs_inode *nfsi)
328{
329 struct pnfs_layout_hdr *lo;
330 LIST_HEAD(tmp_list);
331
332 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout;
334 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 }
340 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list);
342}
343
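pnfs_destroy_layout() is the collect-then-free idiom that the tmp_list comment above describes: segments are unhooked onto a private list under i_lock and destroyed only after the lock is dropped, because the layout driver's free_lseg callback may sleep. A simplified sketch that deliberately ignores the refcounting the real code layers on top:

	/* Sketch: defer sleeping destructors past a spinlock. */
	static void destroy_all_segs(struct inode *inode)
	{
		struct pnfs_layout_segment *lseg, *next;
		LIST_HEAD(tmp_list);

		spin_lock(&inode->i_lock);
		list_splice_init(&NFS_I(inode)->layout->plh_segs, &tmp_list);
		spin_unlock(&inode->i_lock);

		list_for_each_entry_safe(lseg, next, &tmp_list, pls_list)
			free_lseg(lseg);	/* may sleep */
	}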
344/*
346 * Called by the state manager to remove all layouts established under an
346 * expired lease.
347 */
348void
349pnfs_destroy_all_layouts(struct nfs_client *clp)
350{
351 struct pnfs_layout_hdr *lo;
352 LIST_HEAD(tmp_list);
353
354 spin_lock(&clp->cl_lock);
355 list_splice_init(&clp->cl_layouts, &tmp_list);
356 spin_unlock(&clp->cl_lock);
357
358 while (!list_empty(&tmp_list)) {
359 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
360 plh_layouts);
361 dprintk("%s freeing layout for inode %lu\n", __func__,
362 lo->plh_inode->i_ino);
363 pnfs_destroy_layout(NFS_I(lo->plh_inode));
364 }
365}
366
367/* update lo->plh_stateid with new if is more recent */
368void
369pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
370 bool update_barrier)
371{
372 u32 oldseq, newseq;
373
374 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
375 newseq = be32_to_cpu(new->stateid.seqid);
376 if ((int)(newseq - oldseq) > 0) {
377 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
378 if (update_barrier) {
379 u32 new_barrier = be32_to_cpu(new->stateid.seqid);
380
381 if ((int)(new_barrier - lo->plh_barrier))
382 lo->plh_barrier = new_barrier;
383 } else {
384 /* Because of wraparound, we want to keep the barrier
385 * "close" to the current seqids. It needs to be
386 * within 2**31 to count as "behind", so if it
387 * gets too near that limit, give us a little leeway
388 * and bring it to within 2**30.
389 * NOTE - and yes, this is all unsigned arithmetic.
390 */
391 if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
392 lo->plh_barrier = newseq - (1 << 30);
393 }
394 }
395}
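The (int)(newseq - oldseq) > 0 test is serial-number arithmetic: unsigned subtraction followed by a signed cast treats any value less than 2^31 ahead as newer, which is what lets seqids wrap without special cases. A self-contained check (the helper is invented):

	/* Sketch: serial-number comparison as used for layout seqids.
	 * True when a is "after" b modulo 2^32, assuming the two are
	 * within 2^31 of each other. */
	static inline bool seqid_after(u32 a, u32 b)
	{
		return (int)(a - b) > 0;
	}

	/* Example: seqid_after(0x00000002, 0xfffffffe) is true; the
	 * counter wrapped, yet 2 is still seen as 4 steps ahead. */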
396
397/* lget is set to 1 if called from inside send_layoutget call chain */
398static bool
399pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
400 int lget)
401{
402 if ((stateid) &&
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true;
405 return lo->plh_block_lgets ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget));
409}
410
411int
412pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
413 struct nfs4_state *open_state)
414{
415 int status = 0;
416
417 dprintk("--> %s\n", __func__);
418 spin_lock(&lo->plh_inode->i_lock);
419 if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
420 status = -EAGAIN;
421 } else if (list_empty(&lo->plh_segs)) {
422 int seq;
423
424 do {
425 seq = read_seqbegin(&open_state->seqlock);
426 memcpy(dst->data, open_state->stateid.data,
427 sizeof(open_state->stateid.data));
428 } while (read_seqretry(&open_state->seqlock, seq));
429 } else
430 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
431 spin_unlock(&lo->plh_inode->i_lock);
432 dprintk("<-- %s\n", __func__);
433 return status;
434}
435
436/*
437 * Get layout from server.
438 * For now, assume that whole file layouts are requested.
439 * arg->offset: 0
440 * arg->length: all ones
441 */
442static struct pnfs_layout_segment *
443send_layoutget(struct pnfs_layout_hdr *lo,
444 struct nfs_open_context *ctx,
445 u32 iomode)
446{
447 struct inode *ino = lo->plh_inode;
448 struct nfs_server *server = NFS_SERVER(ino);
449 struct nfs4_layoutget *lgp;
450 struct pnfs_layout_segment *lseg = NULL;
451
452 dprintk("--> %s\n", __func__);
453
454 BUG_ON(ctx == NULL);
455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
456 if (lgp == NULL)
457 return NULL;
458 lgp->args.minlength = NFS4_MAX_UINT64;
459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
460 lgp->args.range.iomode = iomode;
461 lgp->args.range.offset = 0;
462 lgp->args.range.length = NFS4_MAX_UINT64;
463 lgp->args.type = server->pnfs_curr_ld->id;
464 lgp->args.inode = ino;
465 lgp->args.ctx = get_nfs_open_context(ctx);
466 lgp->lsegpp = &lseg;
467
468 /* Synchronously retrieve layout information from server and
469 * store in lseg.
470 */
471 nfs4_proc_layoutget(lgp);
472 if (!lseg) {
473 /* remember that LAYOUTGET failed and suspend trying */
474 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
475 }
476 return lseg;
477}
478
479bool pnfs_roc(struct inode *ino)
480{
481 struct pnfs_layout_hdr *lo;
482 struct pnfs_layout_segment *lseg, *tmp;
483 LIST_HEAD(tmp_list);
484 bool found = false;
485
486 spin_lock(&ino->i_lock);
487 lo = NFS_I(ino)->layout;
488 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
489 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
490 goto out_nolayout;
491 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
492 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
493 mark_lseg_invalid(lseg, &tmp_list);
494 found = true;
495 }
496 if (!found)
497 goto out_nolayout;
498 lo->plh_block_lgets++;
499 get_layout_hdr(lo); /* matched in pnfs_roc_release */
500 spin_unlock(&ino->i_lock);
501 pnfs_free_lseg_list(&tmp_list);
502 return true;
503
504out_nolayout:
505 spin_unlock(&ino->i_lock);
506 return false;
507}
508
509void pnfs_roc_release(struct inode *ino)
510{
511 struct pnfs_layout_hdr *lo;
512
513 spin_lock(&ino->i_lock);
514 lo = NFS_I(ino)->layout;
515 lo->plh_block_lgets--;
516 put_layout_hdr_locked(lo);
517 spin_unlock(&ino->i_lock);
518}
519
520void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
521{
522 struct pnfs_layout_hdr *lo;
523
524 spin_lock(&ino->i_lock);
525 lo = NFS_I(ino)->layout;
526 if ((int)(barrier - lo->plh_barrier) > 0)
527 lo->plh_barrier = barrier;
528 spin_unlock(&ino->i_lock);
529}
530
531bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
532{
533 struct nfs_inode *nfsi = NFS_I(ino);
534 struct pnfs_layout_segment *lseg;
535 bool found = false;
536
537 spin_lock(&ino->i_lock);
538 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
539 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
540 found = true;
541 break;
542 }
543 if (!found) {
544 struct pnfs_layout_hdr *lo = nfsi->layout;
545 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
546
547 /* Since close does not return a layout stateid for use as
548 * a barrier, we choose the worst-case barrier.
549 */
550 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
551 }
552 spin_unlock(&ino->i_lock);
553 return found;
554}
555
556/*
557 * Compare two layout segments for sorting into layout cache.
558 * We want to preferentially return RW over RO layouts, so ensure those
559 * are seen first.
560 */
561static s64
562cmp_layout(u32 iomode1, u32 iomode2)
563{
564 /* read > read/write */
565 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
566}
567
568static void
569pnfs_insert_layout(struct pnfs_layout_hdr *lo,
570 struct pnfs_layout_segment *lseg)
571{
572 struct pnfs_layout_segment *lp;
573 int found = 0;
574
575 dprintk("%s:Begin\n", __func__);
576
577 assert_spin_locked(&lo->plh_inode->i_lock);
578 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
579 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
580 continue;
581 list_add_tail(&lseg->pls_list, &lp->pls_list);
582 dprintk("%s: inserted lseg %p "
583 "iomode %d offset %llu length %llu before "
584 "lp %p iomode %d offset %llu length %llu\n",
585 __func__, lseg, lseg->pls_range.iomode,
586 lseg->pls_range.offset, lseg->pls_range.length,
587 lp, lp->pls_range.iomode, lp->pls_range.offset,
588 lp->pls_range.length);
589 found = 1;
590 break;
591 }
592 if (!found) {
593 list_add_tail(&lseg->pls_list, &lo->plh_segs);
594 dprintk("%s: inserted lseg %p "
595 "iomode %d offset %llu length %llu at tail\n",
596 __func__, lseg, lseg->pls_range.iomode,
597 lseg->pls_range.offset, lseg->pls_range.length);
598 }
599 get_layout_hdr(lo);
600
601 dprintk("%s:Return\n", __func__);
602}
603
604static struct pnfs_layout_hdr *
605alloc_init_layout_hdr(struct inode *ino)
606{
607 struct pnfs_layout_hdr *lo;
608
609 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
610 if (!lo)
611 return NULL;
612 atomic_set(&lo->plh_refcount, 1);
613 INIT_LIST_HEAD(&lo->plh_layouts);
614 INIT_LIST_HEAD(&lo->plh_segs);
615 INIT_LIST_HEAD(&lo->plh_bulk_recall);
616 lo->plh_inode = ino;
617 return lo;
618}
619
620static struct pnfs_layout_hdr *
621pnfs_find_alloc_layout(struct inode *ino)
622{
623 struct nfs_inode *nfsi = NFS_I(ino);
624 struct pnfs_layout_hdr *new = NULL;
625
626 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
627
628 assert_spin_locked(&ino->i_lock);
629 if (nfsi->layout) {
630 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
631 return NULL;
632 else
633 return nfsi->layout;
634 }
635 spin_unlock(&ino->i_lock);
636 new = alloc_init_layout_hdr(ino);
637 spin_lock(&ino->i_lock);
638
639 if (likely(nfsi->layout == NULL)) /* Won the race? */
640 nfsi->layout = new;
641 else
642 kfree(new);
643 return nfsi->layout;
644}
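pnfs_find_alloc_layout() drops i_lock around the GFP_KERNEL allocation and re-checks nfsi->layout once the lock is retaken, since another task may have installed a header in the window; the loser just frees its copy. The idiom in isolation, with invented names:

	/* Sketch: optimistic allocation outside a spinlock, followed by
	 * a re-check for a racing installer once the lock is retaken. */
	static struct obj *find_or_alloc(struct parent *p)
	{
		struct obj *new;

		assert_spin_locked(&p->lock);
		if (p->obj)
			return p->obj;

		spin_unlock(&p->lock);
		new = kzalloc(sizeof(*new), GFP_KERNEL);	/* may sleep */
		spin_lock(&p->lock);

		if (p->obj == NULL)	/* won the race? */
			p->obj = new;
		else
			kfree(new);	/* a racer beat us; discard ours */
		return p->obj;
	}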
645
646/*
647 * iomode matching rules:
648 * iomode lseg match
649 * ----- ----- -----
650 * ANY READ true
651 * ANY RW true
652 * RW READ false
653 * RW RW true
654 * READ READ true
655 * READ RW true
656 */
657static int
658is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
659{
660 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
661}
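/*
 * Editorial self-check of the table above (sketch, not part of this
 * patch; assumes IOMODE_ANY from enum pnfs_iomode). The expression in
 * is_matching_lseg() rejects exactly one row: an RW request against a
 * READ-only segment.
 */
static inline void is_matching_lseg_demo(void)
{
	struct pnfs_layout_segment ro = { .pls_range.iomode = IOMODE_READ };
	struct pnfs_layout_segment rw = { .pls_range.iomode = IOMODE_RW };

	WARN_ON(!is_matching_lseg(&ro, IOMODE_ANY));	/* ANY/READ: true */
	WARN_ON(!is_matching_lseg(&rw, IOMODE_ANY));	/* ANY/RW: true */
	WARN_ON(is_matching_lseg(&ro, IOMODE_RW));	/* RW/READ: false */
	WARN_ON(!is_matching_lseg(&rw, IOMODE_RW));	/* RW/RW: true */
	WARN_ON(!is_matching_lseg(&ro, IOMODE_READ));	/* READ/READ: true */
	WARN_ON(!is_matching_lseg(&rw, IOMODE_READ));	/* READ/RW: true */
}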
662
663/*
664 * lookup range in layout
665 */
666static struct pnfs_layout_segment *
667pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
668{
669 struct pnfs_layout_segment *lseg, *ret = NULL;
670
671 dprintk("%s:Begin\n", __func__);
672
673 assert_spin_locked(&lo->plh_inode->i_lock);
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) {
677 ret = lseg;
678 break;
679 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
681 break;
682 }
683
684 dprintk("%s:Return lseg %p ref %d\n",
685 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
686 return ret;
687}
688
689/*
690 * Layout segment is retrieved from the server if not cached.
691 * The appropriate layout segment is referenced and returned to the caller.
692 */
693struct pnfs_layout_segment *
694pnfs_update_layout(struct inode *ino,
695 struct nfs_open_context *ctx,
696 enum pnfs_iomode iomode)
697{
698 struct nfs_inode *nfsi = NFS_I(ino);
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL;
702
703 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL;
705 spin_lock(&ino->i_lock);
706 lo = pnfs_find_alloc_layout(ino);
707 if (lo == NULL) {
708 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
709 goto out_unlock;
710 }
711
712 /* Do we even need to bother with this? */
713 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
714 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
715 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock;
717 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722
723 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock;
726
727 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding);
730
731 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) {
733 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */
736 spin_lock(&clp->cl_lock);
737 BUG_ON(!list_empty(&lo->plh_layouts));
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock);
740 }
741 spin_unlock(&ino->i_lock);
742
743 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) {
745 spin_lock(&ino->i_lock);
746 if (list_empty(&lo->plh_segs)) {
747 spin_lock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 }
754 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo);
756out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg);
759 return lseg;
760out_unlock:
761 spin_unlock(&ino->i_lock);
762 goto out;
763}
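/*
 * Editorial usage note (not part of this patch): the callers added in
 * fs/nfs/read.c below simply prime the layout cache and ignore the
 * result for now, e.g.:
 *
 *	pnfs_update_layout(inode, ctx, IOMODE_READ);
 *	new = nfs_create_request(ctx, inode, page, 0, len);
 *
 * A NULL return just means I/O goes through the MDS as usual.
 */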
764
765int
766pnfs_layout_process(struct nfs4_layoutget *lgp)
767{
768 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
769 struct nfs4_layoutget_res *res = &lgp->res;
770 struct pnfs_layout_segment *lseg;
771 struct inode *ino = lo->plh_inode;
772 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
773 int status = 0;
774
775 /* Verify we got what we asked for.
776 * Note that because the xdr parsing only accepts a single
777 * element array, this can fail even if the server is behaving
778 * correctly.
779 */
780 if (lgp->args.range.iomode > res->range.iomode ||
781 res->range.offset != 0 ||
782 res->range.length != NFS4_MAX_UINT64) {
783 status = -EINVAL;
784 goto out;
785 }
786 /* Inject layout blob into I/O device driver */
787 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
788 if (!lseg || IS_ERR(lseg)) {
789 if (!lseg)
790 status = -ENOMEM;
791 else
792 status = PTR_ERR(lseg);
793 dprintk("%s: Could not allocate layout: error %d\n",
794 __func__, status);
795 goto out;
796 }
797
798 spin_lock(&ino->i_lock);
799 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
800 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
801 dprintk("%s forget reply due to recall\n", __func__);
802 goto out_forget_reply;
803 }
804
805 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
806 dprintk("%s forget reply due to state\n", __func__);
807 goto out_forget_reply;
808 }
809 init_lseg(lo, lseg);
810 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg;
812 pnfs_insert_layout(lo, lseg);
813
814 if (res->return_on_close) {
815 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
816 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
817 }
818
819 /* Done processing layoutget. Set the layout stateid */
820 pnfs_set_layout_stateid(lo, &res->stateid, false);
821 spin_unlock(&ino->i_lock);
822out:
823 return status;
824
825out_forget_reply:
826 spin_unlock(&ino->i_lock);
827 lseg->pls_layout = lo;
828 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
829 goto out;
830}
831
832/*
833 * Device ID cache. Currently supports one layout type per struct nfs_client.
834 * Add layout type to the lookup key to expand to support multiple types.
835 */
836int
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
838 void (*free_callback)(struct pnfs_deviceid_node *))
839{
840 struct pnfs_deviceid_cache *c;
841
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
843 if (!c)
844 return -ENOMEM;
845 spin_lock(&clp->cl_lock);
846 if (clp->cl_devid_cache != NULL) {
847 atomic_inc(&clp->cl_devid_cache->dc_ref);
848 dprintk("%s [kref [%d]]\n", __func__,
849 atomic_read(&clp->cl_devid_cache->dc_ref));
850 kfree(c);
851 } else {
852 /* kzalloc initializes hlists */
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 }
859 spin_unlock(&clp->cl_lock);
860 return 0;
861}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
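/*
 * Editorial usage sketch (not part of this patch): a layout driver would
 * typically embed pnfs_deviceid_node in its own structure and set the
 * cache up from its ->set_layoutdriver hook. The "foo" names are
 * hypothetical.
 */
struct foo_deviceid {
	struct pnfs_deviceid_node fd_node;	/* must be embedded */
	/* driver-private device mapping would follow */
};

static void foo_free_deviceid_node(struct pnfs_deviceid_node *d)
{
	kfree(container_of(d, struct foo_deviceid, fd_node));
}

static int foo_set_layoutdriver(struct nfs_server *server)
{
	return pnfs_alloc_init_deviceid_cache(server->nfs_client,
					      foo_free_deviceid_node);
}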
863
864/*
865 * Called from pnfs_layoutdriver_type->free_lseg;
866 * the last layout segment reference frees the deviceid.
867 */
868void
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
870 struct pnfs_deviceid_node *devid)
871{
872 struct nfs4_deviceid *id = &devid->de_id;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
879 return;
880
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
882 if (!memcmp(&d->de_id, id, sizeof(*id))) {
883 hlist_del_rcu(&d->de_node);
884 spin_unlock(&c->dc_lock);
885 synchronize_rcu();
886 c->dc_free_callback(devid);
887 return;
888 }
889 spin_unlock(&c->dc_lock);
890 /* Why wasn't it found in the list? */
891 BUG();
892}
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
894
895/* Find and reference a deviceid */
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 }
915fail:
916 rcu_read_unlock();
917 return NULL;
918}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
920
921/*
922 * Add a deviceid to the cache.
923 * GETDEVICEINFOs for the same deviceid can race; if the deviceid is already cached, the new node is discarded.
924 */
925struct pnfs_deviceid_node *
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
927{
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
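/*
 * Editorial sketch of the race pattern described above (hypothetical
 * driver code, not part of this patch; struct foo_deviceid as in the
 * earlier sketch). The caller must always continue with the returned
 * pointer: if a concurrent GETDEVICEINFO won the race, "new" has already
 * been freed through the cache's free callback.
 */
static struct pnfs_deviceid_node *
foo_insert_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
{
	struct foo_deviceid *fd;

	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
	if (!fd)
		return NULL;
	fd->fd_node.de_id = *id;
	/* may return &fd->fd_node or an already-cached node */
	return pnfs_add_deviceid(c, &fd->fd_node);
}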
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
953
954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
956 int i;
957 /* Verify cache is empty */
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
960 clp->cl_devid_cache = NULL;
961 spin_unlock(&clp->cl_lock);
962 kfree(local);
963 }
964}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..e2612ea0cbed
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,235 @@
1/*
2 * pNFS client data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H
32
33enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */
36};
37
38struct pnfs_layout_segment {
39 struct list_head pls_list;
40 struct pnfs_layout_range pls_range;
41 atomic_t pls_refcount;
42 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout;
44};
45
46#ifdef CONFIG_NFS_V4_1
47
48#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
49
50enum {
51 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed, stop trying */
52 NFS_LAYOUT_RW_FAILED, /* get rw layout failed, stop trying */
53 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
54 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
55 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
56};
57
58/* Per-layout driver specific registration structure */
59struct pnfs_layoutdriver_type {
60 struct list_head pnfs_tblid;
61 const u32 id;
62 const char *name;
63 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg);
68};
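/*
 * Editorial sketch, not part of this patch: what a minimal driver
 * registration against the structure above might look like. All
 * "foolayout" names are hypothetical; LAYOUT4_NFSV4_1_FILES is the
 * RFC 5661 layout type a files-layout driver would use.
 */
static int foolayout_set_layoutdriver(struct nfs_server *server);
static int foolayout_clear_layoutdriver(struct nfs_server *server);
static struct pnfs_layout_segment *foolayout_alloc_lseg(
		struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr);
static void foolayout_free_lseg(struct pnfs_layout_segment *lseg);

static struct pnfs_layoutdriver_type foolayout_type = {
	.id			= LAYOUT4_NFSV4_1_FILES,
	.name			= "foolayout",
	.owner			= THIS_MODULE,
	.set_layoutdriver	= foolayout_set_layoutdriver,
	.clear_layoutdriver	= foolayout_clear_layoutdriver,
	.alloc_lseg		= foolayout_alloc_lseg,
	.free_lseg		= foolayout_free_lseg,
};

static int __init foolayout_init(void)
{
	return pnfs_register_layoutdriver(&foolayout_type);
}

static void __exit foolayout_exit(void)
{
	pnfs_unregister_layoutdriver(&foolayout_type);
}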
69
70struct pnfs_layout_hdr {
71 atomic_t plh_refcount;
72 struct list_head plh_layouts; /* other client layouts */
73 struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
74 struct list_head plh_segs; /* layout segments list */
75 nfs4_stateid plh_stateid;
76 atomic_t plh_outstanding; /* number of RPCs out */
77 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
78 u32 plh_barrier; /* ignore lower seqids */
79 unsigned long plh_flags;
80 struct inode *plh_inode;
81};
82
83struct pnfs_device {
84 struct nfs4_deviceid dev_id;
85 unsigned int layout_type;
86 unsigned int mincount;
87 struct page **pages;
88 void *area;
89 unsigned int pgbase;
90 unsigned int pglen;
91};
92
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
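/*
 * Editorial standalone demo of the hash above (userspace sketch, not
 * part of this patch); the constants mirror NFS4_DEVICEID4_SIZE (16)
 * and NFS4_DEVICE_ID_HASH_MASK (0x1f).
 */
#include <stdio.h>

static unsigned int demo_deviceid_hash(const unsigned char *id)
{
	unsigned int nbytes = 16;	/* NFS4_DEVICEID4_SIZE */
	unsigned int x = 0;

	while (nbytes--) {
		x *= 37;
		x += *id++;
	}
	return x & 0x1f;		/* NFS4_DEVICE_ID_HASH_MASK */
}

int main(void)
{
	unsigned char id[16] = { 0x01, 0x02 };	/* remaining bytes zero */

	printf("bucket %u of 32\n", demo_deviceid_hash(id));
	return 0;
}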
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141
142/* nfs4proc.c */
143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
144 struct pnfs_device *dev);
145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146
147/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo);
149struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *);
157void pnfs_destroy_all_layouts(struct nfs_client *);
158void put_layout_hdr(struct pnfs_layout_hdr *lo);
159void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
160 const nfs4_stateid *new,
161 bool update_barrier);
162int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
163 struct pnfs_layout_hdr *lo,
164 struct nfs4_state *open_state);
165int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
166 struct list_head *tmp_list,
167 u32 iomode);
168bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
172
173
174static inline int lo_fail_bit(u32 iomode)
175{
176 return iomode == IOMODE_RW ?
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178}
179
180/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{
183 return nfss->pnfs_curr_ld != NULL;
184}
185
186#else /* CONFIG_NFS_V4_1 */
187
188static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
189{
190}
191
192static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
193{
194}
195
196static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type)
199{
200 return NULL;
201}
202
203static inline bool
204pnfs_roc(struct inode *ino)
205{
206 return false;
207}
208
209static inline void
210pnfs_roc_release(struct inode *ino)
211{
212}
213
214static inline void
215pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
216{
217}
218
219static inline bool
220pnfs_roc_drain(struct inode *ino, u32 *barrier)
221{
222 return false;
223}
224
225static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
226{
227}
228
229static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{
231}
232
233#endif /* CONFIG_NFS_V4_1 */
234
235#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..77d5e21c4ad6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)

 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		int flags, struct nameidata *nd)
+		int flags, struct nfs_open_context *ctx)
 {
	struct nfs_createdata *data;
	struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
	return 1;
 }

+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		     struct inode *new_dir)
+{
+	if (nfs_async_handle_expired_key(task))
+		return 0;
+	nfs_mark_for_revalidate(old_dir);
+	nfs_mark_for_revalidate(new_dir);
+	return 1;
+}
+
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
		struct inode *new_dir, struct qstr *new_name)
 {
	struct nfs_renameargs arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
	};
	struct rpc_message msg = {
		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME],
@@ -443,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
	fattr = nfs_alloc_fattr();
	status = -ENOMEM;
	if (fh == NULL || fattr == NULL)
-		goto out;
+		goto out_free;

	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
	nfs_mark_for_revalidate(dir);
@@ -456,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
	if (status == 0)
		status = nfs_instantiate(dentry, fh, fattr);

+out_free:
	nfs_free_fattr(fattr);
	nfs_free_fhandle(fh);
 out:
@@ -519,14 +535,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
  */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		 u64 cookie, struct page *page, unsigned int count, int plus)
+		 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
	struct inode *dir = dentry->d_inode;
	struct nfs_readdirargs arg = {
		.fh		= NFS_FH(dir),
		.cookie		= cookie,
		.count		= count,
-		.pages		= &page,
+		.pages		= pages,
	};
	struct rpc_message msg = {
		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +721,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
	.unlink_setup	= nfs_proc_unlink_setup,
	.unlink_done	= nfs_proc_unlink_done,
	.rename		= nfs_proc_rename,
+	.rename_setup	= nfs_proc_rename_setup,
+	.rename_done	= nfs_proc_rename_done,
	.link		= nfs_proc_link,
	.symlink	= nfs_proc_symlink,
	.mkdir		= nfs_proc_mkdir,
@@ -714,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
	.statfs		= nfs_proc_statfs,
	.fsinfo		= nfs_proc_fsinfo,
	.pathconf	= nfs_proc_pathconf,
-	.decode_dirent	= nfs_decode_dirent,
+	.decode_dirent	= nfs2_decode_dirent,
	.read_setup	= nfs_proc_read_setup,
	.read_done	= nfs_read_done,
	.write_setup	= nfs_proc_write_setup,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..aedcaa7f291f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"

 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE

@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
		memset(p, 0, sizeof(*p));
		INIT_LIST_HEAD(&p->pages);
		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
		if (pagecount <= ARRAY_SIZE(p->page_array))
			p->pagevec = p->page_array;
		else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
	len = nfs_page_length(page);
	if (len == 0)
		return nfs_return_empty_page(page);
+	pnfs_update_layout(inode, ctx, IOMODE_READ);
	new = nfs_create_request(ctx, inode, page, 0, len);
	if (IS_ERR(new)) {
		unlock_page(page);
@@ -151,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
		(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
		req->wb_bytes,
		(long long)req_offset(req));
-	nfs_clear_request(req);
	nfs_release_request(req);
 }

@@ -625,6 +625,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
	if (ret == 0)
		goto read_complete; /* all pages were read */

+	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
	if (rsize < PAGE_CACHE_SIZE)
		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
	else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f4cbf0c306c6..b68c8607770f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@

 #define NFSDBG_FACILITY		NFSDBG_VFS

+#ifdef CONFIG_NFS_V3
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
+
 enum {
	/* Mount options that take no arguments */
	Opt_soft, Opt_hard,
@@ -100,6 +105,7 @@ enum {
	Opt_addr, Opt_mountaddr, Opt_clientaddr,
	Opt_lookupcache,
	Opt_fscache_uniq,
+	Opt_local_lock,

	/* Special mount options */
	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +177,7 @@ static const match_table_t nfs_mount_option_tokens = {

	{ Opt_lookupcache, "lookupcache=%s" },
	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_local_lock, "local_lock=%s" },

	{ Opt_err, NULL }
 };
@@ -236,14 +243,30 @@ static match_table_t nfs_lookupcache_tokens = {
	{ Opt_lookupcache_err, NULL }
 };

+enum {
+	Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+	Opt_local_lock_none,
+
+	Opt_local_lock_err
+};
+
+static match_table_t nfs_local_lock_tokens = {
+	{ Opt_local_lock_all, "all" },
+	{ Opt_local_lock_flock, "flock" },
+	{ Opt_local_lock_posix, "posix" },
+	{ Opt_local_lock_none, "none" },
+
+	{ Opt_local_lock_err, NULL }
+};
+

 static void nfs_umount_begin(struct super_block *);
 static int nfs_statfs(struct dentry *, struct kstatfs *);
 static int nfs_show_options(struct seq_file *, struct vfsmount *);
 static int nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +282,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs",
-	.get_sb		= nfs_xdev_get_sb,
+	.mount		= nfs_xdev_mount,
	.kill_sb	= nfs_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -284,14 +307,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);

 static struct file_system_type nfs4_fs_type = {
@@ -305,7 +328,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs4",
-	.get_sb		= nfs4_remote_get_sb,
+	.mount		= nfs4_remote_mount,
	.kill_sb	= nfs4_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -313,7 +336,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs4",
-	.get_sb		= nfs4_xdev_get_sb,
+	.mount		= nfs4_xdev_mount,
	.kill_sb	= nfs4_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -321,7 +344,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs4",
-	.get_sb		= nfs4_remote_referral_get_sb,
+	.mount		= nfs4_remote_referral_mount,
	.kill_sb	= nfs4_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -575,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,

	if (nfss->mountd_version || showdefaults)
		seq_printf(m, ",mountvers=%u", nfss->mountd_version);
-	if (nfss->mountd_port || showdefaults)
+	if ((nfss->mountd_port &&
+		nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+			showdefaults)
		seq_printf(m, ",mountport=%u", nfss->mountd_port);

	nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -622,6 +647,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
	const struct proc_nfs_info *nfs_infop;
	struct nfs_client *clp = nfss->nfs_client;
	u32 version = clp->rpc_ops->version;
+	int local_flock, local_fcntl;

	seq_printf(m, ",vers=%u", version);
	seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -670,6 +696,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
		else
			seq_printf(m, ",lookupcache=pos");
	}
+
+	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+	if (!local_flock && !local_fcntl)
+		seq_printf(m, ",local_lock=none");
+	else if (local_flock && local_fcntl)
+		seq_printf(m, ",local_lock=all");
+	else if (local_flock)
+		seq_printf(m, ",local_lock=flock");
+	else
+		seq_printf(m, ",local_lock=posix");
 }

 /*
@@ -1017,9 +1055,13 @@ static int nfs_parse_mount_options(char *raw,
			break;
		case Opt_lock:
			mnt->flags &= ~NFS_MOUNT_NONLM;
+			mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+					NFS_MOUNT_LOCAL_FCNTL);
			break;
		case Opt_nolock:
			mnt->flags |= NFS_MOUNT_NONLM;
+			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+				       NFS_MOUNT_LOCAL_FCNTL);
			break;
		case Opt_v2:
			mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1029,12 +1071,10 @@ static int nfs_parse_mount_options(char *raw,
			mnt->flags |= NFS_MOUNT_VER3;
			mnt->version = 3;
			break;
-#ifdef CONFIG_NFS_V4
		case Opt_v4:
			mnt->flags &= ~NFS_MOUNT_VER3;
			mnt->version = 4;
			break;
-#endif
		case Opt_udp:
			mnt->flags &= ~NFS_MOUNT_TCP;
			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1246,12 +1286,10 @@ static int nfs_parse_mount_options(char *raw,
			mnt->flags |= NFS_MOUNT_VER3;
			mnt->version = 3;
			break;
-#ifdef CONFIG_NFS_V4
		case NFS4_VERSION:
			mnt->flags &= ~NFS_MOUNT_VER3;
			mnt->version = 4;
			break;
-#endif
		default:
			goto out_invalid_value;
		}
@@ -1420,6 +1458,34 @@ static int nfs_parse_mount_options(char *raw,
			mnt->fscache_uniq = string;
			mnt->options |= NFS_OPTION_FSCACHE;
			break;
+		case Opt_local_lock:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string, nfs_local_lock_tokens,
+					args);
+			kfree(string);
+			switch (token) {
+			case Opt_local_lock_all:
+				mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+					       NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			case Opt_local_lock_flock:
+				mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+				break;
+			case Opt_local_lock_posix:
+				mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+				break;
+			case Opt_local_lock_none:
+				mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS: invalid "
+						"local_lock argument\n");
+				return 0;
+			};
+			break;

		/*
		 * Special options
@@ -1825,6 +1891,12 @@ static int nfs_validate_mount_data(void *options,
		if (!args->nfs_server.hostname)
			goto out_nomem;

+		if (!(data->flags & NFS_MOUNT_NONLM))
+			args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+					 NFS_MOUNT_LOCAL_FCNTL);
+		else
+			args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+					NFS_MOUNT_LOCAL_FCNTL);
		/*
		 * The legacy version 6 binary mount data from userspace has a
		 * field used only to transport selinux information into the
@@ -2130,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)

	s->s_flags = sb_mntdata->mntflags;
	s->s_fs_info = server;
+	s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
	ret = set_anon_super(s, server);
	if (ret == 0)
		server->s_dev = s->s_dev;
@@ -2208,7 +2281,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
	};
	int error = -ENOMEM;

-	data = nfs_alloc_parsed_mount_data(3);
+	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
	mntfh = nfs_alloc_fhandle();
	if (data == NULL || mntfh == NULL)
		goto out_free_fh;
@@ -2328,9 +2401,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
  * Clone an NFS2/3 server record on xdev traversal (FSID-change)
  */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			   const char *dev_name, void *raw_data,
-			   struct vfsmount *mnt)
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *raw_data)
 {
	struct nfs_clone_mount *data = raw_data;
	struct super_block *s;
@@ -2342,7 +2415,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
	};
	int error;

-	dprintk("--> nfs_xdev_get_sb()\n");
+	dprintk("--> nfs_xdev_mount()\n");

	/* create a new volume representation */
	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2389,28 +2462,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
	}

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;

	/* clone any lsm security options from the parent to the new sb */
	security_sb_clone_mnt_opts(data->sb, s);

-	dprintk("<-- nfs_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs_xdev_mount() = 0\n");
+	return mntroot;

 out_err_nosb:
	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);

 error_splat_super:
	if (server && !s->s_root)
		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }

 #ifdef CONFIG_NFS_V4
@@ -2426,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb,
	sb->s_maxbytes = old_sb->s_maxbytes;
	sb->s_time_gran = 1;
	sb->s_op = old_sb->s_op;
-	nfs_initialise_sb(sb);
+	/*
+	 * The VFS shouldn't apply the umask to mode bits. We will do
+	 * so ourselves when necessary.
+	 */
+	sb->s_flags |= MS_POSIXACL;
+	sb->s_xattr = old_sb->s_xattr;
+	nfs_initialise_sb(sb);
 }

 /*
@@ -2436,12 +2513,19 @@ static void nfs4_fill_super(struct super_block *sb)
 {
	sb->s_time_gran = 1;
	sb->s_op = &nfs4_sops;
+	/*
+	 * The VFS shouldn't apply the umask to mode bits. We will do
+	 * so ourselves when necessary.
+	 */
+	sb->s_flags |= MS_POSIXACL;
+	sb->s_xattr = nfs4_xattr_handlers;
	nfs_initialise_sb(sb);
 }

 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }

 static int nfs4_validate_text_mount_data(void *options,
@@ -2579,8 +2663,9 @@ out_no_address:
 /*
  * Get the superblock for the NFS4 root partition
  */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+		  const char *dev_name, void *raw_data)
 {
	struct nfs_parsed_mount_data *data = raw_data;
	struct super_block *s;
@@ -2644,15 +2729,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
		goto error_splat_root;

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
-	error = 0;
+
+	security_free_mnt_opts(&data->lsm_opts);
+	nfs_free_fhandle(mntfh);
+	return mntroot;

 out:
	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
	nfs_free_fhandle(mntfh);
-	return error;
+	return ERR_PTR(error);

 out_free:
	nfs_free_server(server);
@@ -2898,9 +2984,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
  * Clone an NFS4 server record on xdev traversal (FSID-change)
  */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *raw_data,
-			    struct vfsmount *mnt)
+static struct dentry *
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
 {
	struct nfs_clone_mount *data = raw_data;
	struct super_block *s;
@@ -2912,7 +2998,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
	};
	int error;

-	dprintk("--> nfs4_xdev_get_sb()\n");
+	dprintk("--> nfs4_xdev_mount()\n");

	/* create a new volume representation */
	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2959,32 +3045,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
	}

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;

	security_sb_clone_mnt_opts(data->sb, s);

-	dprintk("<-- nfs4_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs4_xdev_mount() = 0\n");
+	return mntroot;

 out_err_nosb:
	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);

 error_splat_super:
	if (server && !s->s_root)
		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }

-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data,
-	struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data)
 {
	struct nfs_clone_mount *data = raw_data;
	struct super_block *s;
@@ -3048,14 +3132,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
	}

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;

	security_sb_clone_mnt_opts(data->sb, s);

	nfs_free_fhandle(mntfh);
	dprintk("<-- nfs4_referral_get_sb() = 0\n");
-	return 0;
+	return mntroot;

 out_err_nosb:
	nfs_free_server(server);
@@ -3063,7 +3145,7 @@ out_err_noserver:
	nfs_free_fhandle(mntfh);
 out_err_nofh:
	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return error;
+	return ERR_PTR(error);

 error_splat_super:
	if (server && !s->s_root)
@@ -3072,7 +3154,7 @@ error_splat_bdi:
	deactivate_locked_super(s);
	nfs_free_fhandle(mntfh);
	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return error;
+	return ERR_PTR(error);

 }

 /*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
		.extra1 = (int *)&nfs_set_port_min,
		.extra2 = (int *)&nfs_set_port_max,
	},
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
	{
		.procname = "idmap_cache_timeout",
		.data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
	{
		.procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..e313a51acdd1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>

 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"

 struct nfs_unlinkdata {
	struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
	struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
		status = PTR_ERR(data->cred);
		goto out_free;
	}
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
	data->res.dir_attr = &data->dir_attr;

	status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
		nfs_free_unlinkdata(data);
 }
308
309/* Cancel a queued async unlink. Called when a sillyrename run fails. */
310static void
311nfs_cancel_async_unlink(struct dentry *dentry)
312{
313 spin_lock(&dentry->d_lock);
314 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
315 struct nfs_unlinkdata *data = dentry->d_fsdata;
316
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
318 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data);
320 return;
321 }
322 spin_unlock(&dentry->d_lock);
323}
324
325struct nfs_renamedata {
326 struct nfs_renameargs args;
327 struct nfs_renameres res;
328 struct rpc_cred *cred;
329 struct inode *old_dir;
330 struct dentry *old_dentry;
331 struct nfs_fattr old_fattr;
332 struct inode *new_dir;
333 struct dentry *new_dentry;
334 struct nfs_fattr new_fattr;
335};
336
337/**
338 * nfs_async_rename_done - Sillyrename post-processing
339 * @task: rpc_task of the sillyrename
340 * @calldata: nfs_renamedata for the sillyrename
341 *
342 * Do the directory attribute updates and the d_move
343 */
344static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
345{
346 struct nfs_renamedata *data = calldata;
347 struct inode *old_dir = data->old_dir;
348 struct inode *new_dir = data->new_dir;
349
350 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
351 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
352 return;
353 }
354
355 if (task->tk_status != 0) {
356 nfs_cancel_async_unlink(data->old_dentry);
357 return;
358 }
359
360 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
361 d_move(data->old_dentry, data->new_dentry);
362}
363
364/**
365 * nfs_async_rename_release - Release the sillyrename data.
366 * @calldata: the struct nfs_renamedata to be released
367 */
368static void nfs_async_rename_release(void *calldata)
369{
370 struct nfs_renamedata *data = calldata;
371 struct super_block *sb = data->old_dir->i_sb;
372
373 if (data->old_dentry->d_inode)
374 nfs_mark_for_revalidate(data->old_dentry->d_inode);
375
376 dput(data->old_dentry);
377 dput(data->new_dentry);
378 iput(data->old_dir);
379 iput(data->new_dir);
380 nfs_sb_deactive(sb);
381 put_rpccred(data->cred);
382 kfree(data);
383}
384
385#if defined(CONFIG_NFS_V4_1)
386static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
387{
388 struct nfs_renamedata *data = calldata;
389 struct nfs_server *server = NFS_SERVER(data->old_dir);
390
391 if (nfs4_setup_sequence(server, &data->args.seq_args,
392 &data->res.seq_res, 1, task))
393 return;
394 rpc_call_start(task);
395}
396#endif /* CONFIG_NFS_V4_1 */
397
398static const struct rpc_call_ops nfs_rename_ops = {
399 .rpc_call_done = nfs_async_rename_done,
400 .rpc_release = nfs_async_rename_release,
401#if defined(CONFIG_NFS_V4_1)
402 .rpc_call_prepare = nfs_rename_prepare,
403#endif /* CONFIG_NFS_V4_1 */
404};
405
406/**
407 * nfs_async_rename - perform an asynchronous rename operation
408 * @old_dir: directory that currently holds the dentry to be renamed
409 * @new_dir: target directory for the rename
410 * @old_dentry: original dentry to be renamed
411 * @new_dentry: dentry to which the old_dentry should be renamed
412 *
413 * It's expected that valid references to the dentries and inodes are held
414 */
415static struct rpc_task *
416nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
417 struct dentry *old_dentry, struct dentry *new_dentry)
418{
419 struct nfs_renamedata *data;
420 struct rpc_message msg = { };
421 struct rpc_task_setup task_setup_data = {
422 .rpc_message = &msg,
423 .callback_ops = &nfs_rename_ops,
424 .workqueue = nfsiod_workqueue,
425 .rpc_client = NFS_CLIENT(old_dir),
426 .flags = RPC_TASK_ASYNC,
427 };
428
429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL)
431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data;
433
434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) {
436 struct rpc_task *task = ERR_CAST(data->cred);
437 kfree(data);
438 return task;
439 }
440
441 msg.rpc_argp = &data->args;
442 msg.rpc_resp = &data->res;
443 msg.rpc_cred = data->cred;
444
445 /* set up nfs_renamedata */
446 data->old_dir = old_dir;
447 ihold(old_dir);
448 data->new_dir = new_dir;
449 ihold(new_dir);
450 data->old_dentry = dget(old_dentry);
451 data->new_dentry = dget(new_dentry);
452 nfs_fattr_init(&data->old_fattr);
453 nfs_fattr_init(&data->new_fattr);
454
455 /* set up nfs_renameargs */
456 data->args.old_dir = NFS_FH(old_dir);
457 data->args.old_name = &old_dentry->d_name;
458 data->args.new_dir = NFS_FH(new_dir);
459 data->args.new_name = &new_dentry->d_name;
460
461 /* set up nfs_renameres */
462 data->res.old_fattr = &data->old_fattr;
463 data->res.new_fattr = &data->new_fattr;
464
465 nfs_sb_active(old_dir->i_sb);
466
467 NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
468
469 return rpc_run_task(&task_setup_data);
470}
471
472/**
473 * nfs_sillyrename - Perform a silly-rename of a dentry
474 * @dir: inode of directory that contains dentry
475 * @dentry: dentry to be sillyrenamed
476 *
477 * NFSv2/3 is stateless and the server doesn't know when the client is
478 * holding a file open. To prevent application problems when a file is
479 * unlinked while it's still open, the client performs a "silly-rename".
480 * That is, it renames the file to a hidden file in the same directory,
481 * and only performs the unlink once the last reference to it is put.
482 *
483 * The final cleanup is done during dentry_iput.
484 */
485int
486nfs_sillyrename(struct inode *dir, struct dentry *dentry)
487{
488 static unsigned int sillycounter;
489 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
490 const int countersize = sizeof(sillycounter)*2;
491 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
492 char silly[slen+1];
493 struct dentry *sdentry;
494 struct rpc_task *task;
495 int error = -EIO;
496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 dentry->d_count);
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501
502 /*
503 * We don't allow a dentry to be silly-renamed twice.
504 */
505 error = -EBUSY;
506 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
507 goto out;
508
509 sprintf(silly, ".nfs%*.*Lx",
510 fileidsize, fileidsize,
511 (unsigned long long)NFS_FILEID(dentry->d_inode));
512
513 /* Return delegation in anticipation of the rename */
514 nfs_inode_return_delegation(dentry->d_inode);
515
516 sdentry = NULL;
517 do {
518 char *suffix = silly + slen - countersize;
519
520 dput(sdentry);
521 sillycounter++;
522 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
523
524 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
525 dentry->d_name.name, silly);
526
527 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
528 /*
529 * N.B. Better to return EBUSY here ... it could be
530 * dangerous to delete the file while it's in use.
531 */
532 if (IS_ERR(sdentry))
533 goto out;
534 } while (sdentry->d_inode != NULL); /* need negative lookup */
535
536 /* queue unlink first. Can't do this from rpc_release as it
537 * has to allocate memory
538 */
539 error = nfs_async_unlink(dir, dentry);
540 if (error)
541 goto out_dput;
542
543 /* run the rename task, undo unlink if it fails */
544 task = nfs_async_rename(dir, dir, dentry, sdentry);
545 if (IS_ERR(task)) {
546 error = -EBUSY;
547 nfs_cancel_async_unlink(dentry);
548 goto out_dput;
549 }
550
551 /* wait for the RPC task to complete, unless a SIGKILL intervenes */
552 error = rpc_wait_for_completion_task(task);
553 if (error == 0)
554 error = task->tk_status;
555 rpc_put_task(task);
556out_dput:
557 dput(sdentry);
558out:
559 return error;
560}
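
The generated name is deterministic: ".nfs" followed by the file ID and a retry counter, each rendered as fixed-width hex. A user-space sketch of the same construction (illustrative only; make_silly_name is not the kernel code, just a restatement of the format used above):

#include <stdio.h>

/* Sketch of the silly-name format: ".nfs" + fileid + counter, each
 * zero-padded to twice its byte size in hex digits. */
static void make_silly_name(char *buf, size_t buflen,
			    unsigned long long fileid, unsigned int counter)
{
	const int fileidsize = sizeof(fileid) * 2;	/* 16 hex digits */
	const int countersize = sizeof(counter) * 2;	/* 8 hex digits */

	snprintf(buf, buflen, ".nfs%*.*llx%*.*x",
		 fileidsize, fileidsize, fileid,
		 countersize, countersize, counter);
}

/* make_silly_name(buf, sizeof(buf), 0x1234, 7) yields
 * ".nfs000000000000123400000007". */
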
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..c8278f4046cb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
55 if (p) { 55 if (p) {
56 memset(p, 0, sizeof(*p)); 56 memset(p, 0, sizeof(*p));
57 INIT_LIST_HEAD(&p->pages); 57 INIT_LIST_HEAD(&p->pages);
58 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
59 } 58 }
60 return p; 59 return p;
61} 60}
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
75 memset(p, 0, sizeof(*p)); 74 memset(p, 0, sizeof(*p));
76 INIT_LIST_HEAD(&p->pages); 75 INIT_LIST_HEAD(&p->pages);
77 p->npages = pagecount; 76 p->npages = pagecount;
78 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
79 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
80 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
81 else { 79 else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 290 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
293 291
294 nfs_pageio_cond_complete(pgio, page->index); 292 nfs_pageio_cond_complete(pgio, page->index);
295 ret = nfs_page_async_flush(pgio, page, 293 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) { 294 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 295 redirty_page_for_writepage(wbc, page);
300 ret = 0; 296 ret = 0;
@@ -394,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
394 if (nfs_have_delegation(inode, FMODE_WRITE)) 390 if (nfs_have_delegation(inode, FMODE_WRITE))
395 nfsi->change_attr++; 391 nfsi->change_attr++;
396 } 392 }
393 set_bit(PG_MAPPED, &req->wb_flags);
397 SetPagePrivate(req->wb_page); 394 SetPagePrivate(req->wb_page);
398 set_page_private(req->wb_page, (unsigned long)req); 395 set_page_private(req->wb_page, (unsigned long)req);
399 nfsi->npages++; 396 nfsi->npages++;
@@ -419,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
419 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
420 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
421 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
419 clear_bit(PG_MAPPED, &req->wb_flags);
422 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
423 nfsi->npages--; 421 nfsi->npages--;
424 if (!nfsi->npages) { 422 if (!nfsi->npages) {
@@ -426,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
426 iput(inode); 424 iput(inode);
427 } else 425 } else
428 spin_unlock(&inode->i_lock); 426 spin_unlock(&inode->i_lock);
429 nfs_clear_request(req);
430 nfs_release_request(req); 427 nfs_release_request(req);
431} 428}
432 429
@@ -935,7 +932,7 @@ out_bad:
935 while (!list_empty(&list)) { 932 while (!list_empty(&list)) {
936 data = list_entry(list.next, struct nfs_write_data, pages); 933 data = list_entry(list.next, struct nfs_write_data, pages);
937 list_del(&data->pages); 934 list_del(&data->pages);
938 nfs_writedata_release(data); 935 nfs_writedata_free(data);
939 } 936 }
940 nfs_redirty_request(req); 937 nfs_redirty_request(req);
941 return -ENOMEM; 938 return -ENOMEM;
@@ -1433,15 +1430,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1433 int flags = FLUSH_SYNC; 1430 int flags = FLUSH_SYNC;
1434 int ret = 0; 1431 int ret = 0;
1435 1432
1436 /* Don't commit yet if this is a non-blocking flush and there are 1433 if (wbc->sync_mode == WB_SYNC_NONE) {
1437 * lots of outstanding writes for this mapping. 1434 /* Don't commit yet if this is a non-blocking flush and there
1438 */ 1435 * are a lot of outstanding writes for this mapping.
1439 if (wbc->sync_mode == WB_SYNC_NONE && 1436 */
1440 nfsi->ncommit <= (nfsi->npages >> 1)) 1437 if (nfsi->ncommit <= (nfsi->npages >> 1))
1441 goto out_mark_dirty; 1438 goto out_mark_dirty;
1442 1439
1443 if (wbc->nonblocking || wbc->for_background) 1440 /* don't wait for the COMMIT response */
1444 flags = 0; 1441 flags = 0;
1442 }
1443
1445 ret = nfs_commit_inode(inode, flags); 1444 ret = nfs_commit_inode(inode, flags);
1446 if (ret >= 0) { 1445 if (ret >= 0) {
1447 if (wbc->sync_mode == WB_SYNC_NONE) { 1446 if (wbc->sync_mode == WB_SYNC_NONE) {
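
The net effect of the rewritten block is a three-way policy: synchronous flushes always COMMIT and wait for the reply, while non-blocking flushes COMMIT without waiting, and only when at least half of the inode's cached pages are awaiting commit. A standalone sketch of that decision (the counter parameters stand in for the nfs_inode fields used above):

/* Decision sketch only; not the kernel function. */
enum commit_action { COMMIT_SKIP, COMMIT_ASYNC, COMMIT_SYNC };

static enum commit_action commit_policy(int nonblocking,
					unsigned long ncommit,
					unsigned long npages)
{
	if (!nonblocking)
		return COMMIT_SYNC;	/* FLUSH_SYNC: wait for the reply */
	if (ncommit <= (npages >> 1))
		return COMMIT_SKIP;	/* too few; just mark dirty again */
	return COMMIT_ASYNC;		/* send COMMIT, don't wait */
}
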
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c03..84c27d69d421 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
42 gid_t gid; 42 gid_t gid;
43}; 43};
44 44
45struct nfsacl_simple_acl {
46 struct posix_acl acl;
47 struct posix_acl_entry ace[4];
48};
49
45static int 50static int
46xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) 51xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
47{ 52{
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
72 return 0; 77 return 0;
73} 78}
74 79
75unsigned int 80/**
76nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, 81 * nfsacl_encode - Encode an NFSv3 ACL
77 struct posix_acl *acl, int encode_entries, int typeflag) 82 *
83 * @buf: destination xdr_buf to contain XDR encoded ACL
84 * @base: byte offset in xdr_buf where XDR'd ACL begins
85 * @inode: inode of file whose ACL this is
86 * @acl: posix_acl to encode
87 * @encode_entries: whether to encode ACEs as well
88 * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
89 *
90 * Returns size of encoded ACL in bytes or a negative errno value.
91 */
92int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
93 struct posix_acl *acl, int encode_entries, int typeflag)
78{ 94{
79 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; 95 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
80 struct nfsacl_encode_desc nfsacl_desc = { 96 struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
88 .uid = inode->i_uid, 104 .uid = inode->i_uid,
89 .gid = inode->i_gid, 105 .gid = inode->i_gid,
90 }; 106 };
107 struct nfsacl_simple_acl aclbuf;
91 int err; 108 int err;
92 struct posix_acl *acl2 = NULL;
93 109
94 if (entries > NFS_ACL_MAX_ENTRIES || 110 if (entries > NFS_ACL_MAX_ENTRIES ||
95 xdr_encode_word(buf, base, entries)) 111 xdr_encode_word(buf, base, entries))
96 return -EINVAL; 112 return -EINVAL;
97 if (encode_entries && acl && acl->a_count == 3) { 113 if (encode_entries && acl && acl->a_count == 3) {
98 /* Fake up an ACL_MASK entry. */ 114 struct posix_acl *acl2 = &aclbuf.acl;
99 acl2 = posix_acl_alloc(4, GFP_KERNEL); 115
100 if (!acl2) 116 /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
101 return -ENOMEM; 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */
 120 memset(acl2, 0, sizeof(*acl2));
121 posix_acl_init(acl2, 4);
122
102 /* Insert entries in canonical order: other orders seem 123 /* Insert entries in canonical order: other orders seem
103 to confuse Solaris VxFS. */ 124 to confuse Solaris VxFS. */
104 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ 125 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
109 nfsacl_desc.acl = acl2; 130 nfsacl_desc.acl = acl2;
110 } 131 }
111 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); 132 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
112 if (acl2)
113 posix_acl_release(acl2);
114 if (!err) 133 if (!err)
115 err = 8 + nfsacl_desc.desc.elem_size * 134 err = 8 + nfsacl_desc.desc.elem_size *
116 nfsacl_desc.desc.array_len; 135 nfsacl_desc.desc.array_len;
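
The hunk elides the remaining assignments, but the canonical ordering the comment refers to is USER_OBJ, GROUP_OBJ, MASK, OTHER, with the faked mask inheriting the group entry's permissions. A sketch of that four-entry fill (a restatement of the code above; assumes the source ACL holds exactly USER_OBJ, GROUP_OBJ, OTHER in that order):

#include <linux/posix_acl.h>

/* Sketch: build the four-entry ACL in canonical order, faking the
 * ACL_MASK entry from the group entry as nfsacl_encode() does. */
static void fake_mask_entry(struct posix_acl *dst,
			    const struct posix_acl *src)
{
	dst->a_entries[0] = src->a_entries[0];	/* ACL_USER_OBJ */
	dst->a_entries[1] = src->a_entries[1];	/* ACL_GROUP_OBJ */
	dst->a_entries[2] = src->a_entries[1];	/* ACL_MASK inherits */
	dst->a_entries[2].e_tag = ACL_MASK;	/*   the group perms */
	dst->a_entries[3] = src->a_entries[2];	/* ACL_OTHER */
}
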
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
224 return 0; 243 return 0;
225} 244}
226 245
227unsigned int 246/**
228nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, 247 * nfsacl_decode - Decode an NFSv3 ACL
229 struct posix_acl **pacl) 248 *
249 * @buf: xdr_buf containing XDR'd ACL data to decode
250 * @base: byte offset in xdr_buf where XDR'd ACL begins
251 * @aclcnt: count of ACEs in decoded posix_acl
252 * @pacl: buffer in which to place decoded posix_acl
253 *
254 * Returns the length of the decoded ACL in bytes, or a negative errno value.
255 */
256int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
257 struct posix_acl **pacl)
230{ 258{
231 struct nfsacl_decode_desc nfsacl_desc = { 259 struct nfsacl_decode_desc nfsacl_desc = {
232 .desc = { 260 .desc = {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
28 28
29 If unsure, say N. 29 If unsure, say N.
30 30
31config NFSD_DEPRECATED
32 bool "Include support for deprecated syscall interface to NFSD"
33 depends on NFSD
34 default y
35 help
36 The syscall interface to nfsd was obsoleted in 2.6.0 by a new
 37 filesystem-based interface. The old interface is due for removal
 38 in 2.6.40. If you wish to remove the interface before then,
 39 say N.
 40
 41 If unsure, say Y.
42
31config NFSD_V2_ACL 43config NFSD_V2_ACL
32 bool 44 bool
33 depends on NFSD 45 depends on NFSD
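
With the option above merged, a server build that keeps the legacy syscall interface selects it in .config, for example:

CONFIG_NFSD=m
CONFIG_NFSD_DEPRECATED=y
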
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..34e5c40af5ef
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
1/*
2 * Common NFSv4 ACL handling definitions.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFS4_ACL_H
36#define LINUX_NFS4_ACL_H
37
38#include <linux/posix_acl.h>
39
40/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
41 * fit in a page: */
42#define NFS4_ACL_MAX 170
43
44struct nfs4_acl *nfs4_acl_new(int);
45int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
48 uid_t who, u32 mask);
49
50#define NFS4_ACL_TYPE_DEFAULT 0x01
51#define NFS4_ACL_DIR 0x02
52#define NFS4_ACL_OWNER 0x04
53
54struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
55 struct posix_acl *, unsigned int flags);
56int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
57 struct posix_acl **, unsigned int flags);
58
59#endif /* LINUX_NFS4_ACL_H */
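
A hypothetical caller sketch for the helpers declared above; NFS4_ACE_READ_DATA comes from linux/nfs4.h, and the usual convention assumed here is that nfs4_acl_permission() returns zero when access is allowed:

#include <linux/nfs4.h>
#include "acl.h"

/* Hypothetical sketch, not nfsd code: may 'who' read this object? */
static int example_can_read(struct nfs4_acl *acl, uid_t owner,
			    gid_t group, uid_t who)
{
	return nfs4_acl_permission(acl, owner, group, who,
				   NFS4_ACE_READ_DATA);
}
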
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * NFS exporting and validation. 2 * NFS exporting and validation.
4 * 3 *
@@ -28,9 +27,6 @@
28typedef struct auth_domain svc_client; 27typedef struct auth_domain svc_client;
29typedef struct svc_export svc_export; 28typedef struct svc_export svc_export;
30 29
31static void exp_do_unexport(svc_export *unexp);
32static int exp_verify_string(char *cp, int max);
33
34/* 30/*
35 * We have two caches. 31 * We have two caches.
36 * One maps client+vfsmnt+dentry to export options - the export map 32 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +798,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
802 return ek; 798 return ek;
803} 799}
804 800
801#ifdef CONFIG_NFSD_DEPRECATED
805static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, 802static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
806 struct svc_export *exp) 803 struct svc_export *exp)
807{ 804{
@@ -852,6 +849,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
852 849
853 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 850 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
854} 851}
852#endif
855 853
856static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, 854static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
857 struct cache_req *reqp) 855 struct cache_req *reqp)
@@ -893,6 +891,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
893 return exp; 891 return exp;
894} 892}
895 893
894#ifdef CONFIG_NFSD_DEPRECATED
896/* 895/*
897 * Hashtable locking. Write locks are placed only by user processes 896 * Hashtable locking. Write locks are placed only by user processes
898 * wanting to modify export information. 897 * wanting to modify export information.
@@ -925,6 +924,19 @@ exp_writeunlock(void)
925{ 924{
926 up_write(&hash_sem); 925 up_write(&hash_sem);
927} 926}
927#else
928
929/* hash_sem not needed once deprecated interface is removed */
930void exp_readlock(void) {}
 931static inline void exp_writelock(void) {}
 932void exp_readunlock(void) {}
 933static inline void exp_writeunlock(void) {}
934
935#endif
936
937#ifdef CONFIG_NFSD_DEPRECATED
938static void exp_do_unexport(svc_export *unexp);
939static int exp_verify_string(char *cp, int max);
928 940
929static void exp_fsid_unhash(struct svc_export *exp) 941static void exp_fsid_unhash(struct svc_export *exp)
930{ 942{
@@ -935,10 +947,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
935 947
936 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); 948 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
937 if (!IS_ERR(ek)) { 949 if (!IS_ERR(ek)) {
938 ek->h.expiry_time = get_seconds()-1; 950 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
939 cache_put(&ek->h, &svc_expkey_cache); 951 cache_put(&ek->h, &svc_expkey_cache);
940 } 952 }
941 svc_expkey_cache.nextcheck = get_seconds();
942} 953}
943 954
944static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) 955static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +984,9 @@ static void exp_unhash(struct svc_export *exp)
973 984
974 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); 985 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
975 if (!IS_ERR(ek)) { 986 if (!IS_ERR(ek)) {
976 ek->h.expiry_time = get_seconds()-1; 987 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
977 cache_put(&ek->h, &svc_expkey_cache); 988 cache_put(&ek->h, &svc_expkey_cache);
978 } 989 }
979 svc_expkey_cache.nextcheck = get_seconds();
980} 990}
981 991
982/* 992/*
@@ -1097,8 +1107,7 @@ out:
1097static void 1107static void
1098exp_do_unexport(svc_export *unexp) 1108exp_do_unexport(svc_export *unexp)
1099{ 1109{
1100 unexp->h.expiry_time = get_seconds()-1; 1110 sunrpc_invalidate(&unexp->h, &svc_export_cache);
1101 svc_export_cache.nextcheck = get_seconds();
1102 exp_unhash(unexp); 1111 exp_unhash(unexp);
1103 exp_fsid_unhash(unexp); 1112 exp_fsid_unhash(unexp);
1104} 1113}
@@ -1150,6 +1159,7 @@ out_unlock:
1150 exp_writeunlock(); 1159 exp_writeunlock();
1151 return err; 1160 return err;
1152} 1161}
1162#endif /* CONFIG_NFSD_DEPRECATED */
1153 1163
1154/* 1164/*
1155 * Obtain the root fh on behalf of a client. 1165 * Obtain the root fh on behalf of a client.
@@ -1433,9 +1443,6 @@ static struct flags {
1433 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1443 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1434 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1444 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1435 { NFSEXP_V4ROOT, {"v4root", ""}}, 1445 { NFSEXP_V4ROOT, {"v4root", ""}},
1436#ifdef MSNFS
1437 { NFSEXP_MSNFS, {"msnfs", ""}},
1438#endif
1439 { 0, {"", ""}} 1446 { 0, {"", ""}}
1440}; 1447};
1441 1448
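
For orientation, the strings in this table are the export option names shown in /proc/fs/nfs/exports and written in /etc/exports; a typical line using two of them might be:

/srv/export    *(rw,no_subtree_check,insecure_locks)
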
@@ -1459,25 +1466,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
1459 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); 1466 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
1460} 1467}
1461 1468
1469static bool secinfo_flags_equal(int f, int g)
1470{
1471 f &= NFSEXP_SECINFO_FLAGS;
1472 g &= NFSEXP_SECINFO_FLAGS;
1473 return f == g;
1474}
1475
1476static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
1477{
1478 int flags;
1479
1480 flags = (*fp)->flags;
1481 seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
1482 (*fp)++;
1483 while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
1484 seq_printf(m, ":%d", (*fp)->pseudoflavor);
1485 (*fp)++;
1486 }
1487 return flags;
1488}
1489
1462static void show_secinfo(struct seq_file *m, struct svc_export *exp) 1490static void show_secinfo(struct seq_file *m, struct svc_export *exp)
1463{ 1491{
1464 struct exp_flavor_info *f; 1492 struct exp_flavor_info *f;
1465 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; 1493 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1466 int lastflags = 0, first = 0; 1494 int flags;
1467 1495
1468 if (exp->ex_nflavors == 0) 1496 if (exp->ex_nflavors == 0)
1469 return; 1497 return;
1470 for (f = exp->ex_flavors; f < end; f++) { 1498 f = exp->ex_flavors;
1471 if (first || f->flags != lastflags) { 1499 flags = show_secinfo_run(m, &f, end);
1472 if (!first) 1500 if (!secinfo_flags_equal(flags, exp->ex_flags))
1473 show_secinfo_flags(m, lastflags); 1501 show_secinfo_flags(m, flags);
1474 seq_printf(m, ",sec=%d", f->pseudoflavor); 1502 while (f != end) {
1475 lastflags = f->flags; 1503 flags = show_secinfo_run(m, &f, end);
1476 } else { 1504 show_secinfo_flags(m, flags);
1477 seq_printf(m, ":%d", f->pseudoflavor);
1478 }
1479 } 1505 }
1480 show_secinfo_flags(m, lastflags);
1481} 1506}
1482 1507
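
The refactored show_secinfo() now prints one "sec=" group per run of consecutive flavors that share the same flag bits. A simplified user-space model of that run-grouping idiom (types and names are illustrative):

#include <stdio.h>

/* Model of the grouping in show_secinfo(): consecutive items with
 * equal flags print as one colon-separated "sec=" group. */
struct flavor { int flags; int pseudoflavor; };

static void show_runs(const struct flavor *f, const struct flavor *end)
{
	while (f != end) {
		int flags = f->flags;

		printf(",sec=%d", f->pseudoflavor);
		for (f++; f != end && f->flags == flags; f++)
			printf(":%d", f->pseudoflavor);
		/* show_secinfo_flags() would print 'flags' here */
	}
}
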
1483static void exp_flags(struct seq_file *m, int flag, int fsid, 1508static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1557,7 @@ const struct seq_operations nfs_exports_op = {
1532 .show = e_show, 1557 .show = e_show,
1533}; 1558};
1534 1559
1560#ifdef CONFIG_NFSD_DEPRECATED
1535/* 1561/*
1536 * Add or modify a client. 1562 * Add or modify a client.
1537 * Change requests may involve the list of host addresses. The list of 1563 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1589,7 @@ exp_addclient(struct nfsctl_client *ncp)
1563 /* Insert client into hashtable. */ 1589 /* Insert client into hashtable. */
1564 for (i = 0; i < ncp->cl_naddr; i++) { 1590 for (i = 0; i < ncp->cl_naddr; i++) {
1565 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); 1591 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
1566 auth_unix_add_addr(&addr6, dom); 1592 auth_unix_add_addr(&init_net, &addr6, dom);
1567 } 1593 }
1568 auth_unix_forget_old(dom); 1594 auth_unix_forget_old(dom);
1569 auth_domain_put(dom); 1595 auth_domain_put(dom);
@@ -1621,6 +1647,7 @@ exp_verify_string(char *cp, int max)
1621 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); 1647 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
1622 return 0; 1648 return 0;
1623} 1649}
1650#endif /* CONFIG_NFSD_DEPRECATED */
1624 1651
1625/* 1652/*
1626 * Initialize the exports module. 1653 * Initialize the exports module.
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..2f3be1321534
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
1/*
2 * Mapping of UID to name and vice versa.
3 *
4 * Copyright (c) 2002, 2003 The Regents of the University of
5 * Michigan. All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFSD_IDMAP_H
36#define LINUX_NFSD_IDMAP_H
37
38#include <linux/in.h>
39#include <linux/sunrpc/svc.h>
40
41/* XXX from linux/nfs_idmap.h */
42#define IDMAP_NAMESZ 128
43
44#ifdef CONFIG_NFSD_V4
45int nfsd_idmap_init(void);
46void nfsd_idmap_shutdown(void);
47#else
48static inline int nfsd_idmap_init(void)
49{
50 return 0;
51}
52static inline void nfsd_idmap_shutdown(void)
53{
54}
55#endif
56
57__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
58__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
59int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
60int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
61
62#endif /* LINUX_NFSD_IDMAP_H */
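
A hypothetical caller sketch for the helpers above; IDMAP_NAMESZ bounds the buffer, and the "length or negative errno" return convention for nfsd_map_uid_to_name() is an assumption, not something this header documents:

#include "idmap.h"

/* Hypothetical sketch: map a uid to its "user@domain" name for an
 * NFSv4 reply. Not nfsd code. */
static int example_uid_to_name(struct svc_rqst *rqstp, __u32 uid)
{
	char name[IDMAP_NAMESZ];
	int len;

	len = nfsd_map_uid_to_name(rqstp, uid, name);
	if (len < 0)
		return len;	/* mapping failed */
	/* XDR-encode 'len' bytes of 'name' into the reply here */
	return 0;
}
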
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
151 __be32 nfserr; 151 __be32 nfserr;
152 u32 max_blocksize = svc_max_payload(rqstp); 152 u32 max_blocksize = svc_max_payload(rqstp);
153 153
154 dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", 154 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
155 SVCFH_fmt(&argp->fh), 155 SVCFH_fmt(&argp->fh),
156 (unsigned long) argp->count, 156 (unsigned long) argp->count,
157 (unsigned long) argp->offset); 157 (unsigned long long) argp->offset);
158 158
159 /* Obtain buffer pointer for payload. 159 /* Obtain buffer pointer for payload.
160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
191 __be32 nfserr; 191 __be32 nfserr;
192 unsigned long cnt = argp->len; 192 unsigned long cnt = argp->len;
193 193
194 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 194 dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
195 SVCFH_fmt(&argp->fh), 195 SVCFH_fmt(&argp->fh),
196 argp->len, 196 argp->len,
197 (unsigned long) argp->offset, 197 (unsigned long long) argp->offset,
198 argp->stable? " stable" : ""); 198 argp->stable? " stable" : "");
199 199
200 fh_copy(&resp->fh, &argp->fh); 200 fh_copy(&resp->fh, &argp->fh);
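
The dprintk changes above are the standard fix for printing a 64-bit offset: cast to unsigned long long and use a matching conversion, since the value truncates when pushed through unsigned long on 32-bit kernels. A minimal illustration (show_offset is illustrative, not a kernel helper):

#include <linux/kernel.h>

/* For offset = 1ULL << 32, the first line prints 0 on a 32-bit
 * build while the second prints 4294967296. */
static void show_offset(u64 offset)
{
	printk(KERN_DEBUG "truncated: %lu\n", (unsigned long) offset);
	printk(KERN_DEBUG "correct:   %llu\n", (unsigned long long) offset);
}
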
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a9..7e84a852cdae 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
261 &fhp->fh_post_attr); 261 &fhp->fh_post_attr);
262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; 262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
263 if (err) 263 if (err) {
264 fhp->fh_post_saved = 0; 264 fhp->fh_post_saved = 0;
265 else 265 /* Grab the ctime anyway - set_change_info might use it */
266 fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
267 } else
266 fhp->fh_post_saved = 1; 268 fhp->fh_post_saved = 1;
267} 269}
268 270
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/nfs4_acl.h> 39#include "acl.h"
40 40
41 41
42/* mode bit translations: */ 42/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..3be975e18919 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
41 41
42#define NFSPROC4_CB_NULL 0 42#define NFSPROC4_CB_NULL 0
43#define NFSPROC4_CB_COMPOUND 1 43#define NFSPROC4_CB_COMPOUND 1
44#define NFS4_STATEID_SIZE 16
45 44
46/* Index of predefined Linux callback client operations */ 45/* Index of predefined Linux callback client operations */
47 46
@@ -51,11 +50,6 @@ enum {
51 NFSPROC4_CLNT_CB_SEQUENCE, 50 NFSPROC4_CLNT_CB_SEQUENCE,
52}; 51};
53 52
54enum nfs_cb_opnum4 {
55 OP_CB_RECALL = 4,
56 OP_CB_SEQUENCE = 11,
57};
58
59#define NFS4_MAXTAGLEN 20 53#define NFS4_MAXTAGLEN 20
60 54
61#define NFS4_enc_cb_null_sz 0 55#define NFS4_enc_cb_null_sz 0
@@ -80,61 +74,6 @@ enum nfs_cb_opnum4 {
80 cb_sequence_dec_sz + \ 74 cb_sequence_dec_sz + \
81 op_dec_sz) 75 op_dec_sz)
82 76
83/*
84* Generic encode routines from fs/nfs/nfs4xdr.c
85*/
86static inline __be32 *
87xdr_writemem(__be32 *p, const void *ptr, int nbytes)
88{
89 int tmp = XDR_QUADLEN(nbytes);
90 if (!tmp)
91 return p;
92 p[tmp-1] = 0;
93 memcpy(p, ptr, nbytes);
94 return p + tmp;
95}
96
97#define WRITE32(n) *p++ = htonl(n)
98#define WRITEMEM(ptr,nbytes) do { \
99 p = xdr_writemem(p, ptr, nbytes); \
100} while (0)
101#define RESERVE_SPACE(nbytes) do { \
102 p = xdr_reserve_space(xdr, nbytes); \
103 if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
104 BUG_ON(!p); \
105} while (0)
106
107/*
108 * Generic decode routines from fs/nfs/nfs4xdr.c
109 */
110#define DECODE_TAIL \
111 status = 0; \
112out: \
113 return status; \
114xdr_error: \
115 dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
116 status = -EIO; \
117 goto out
118
119#define READ32(x) (x) = ntohl(*p++)
120#define READ64(x) do { \
121 (x) = (u64)ntohl(*p++) << 32; \
122 (x) |= ntohl(*p++); \
123} while (0)
124#define READTIME(x) do { \
125 p++; \
126 (x.tv_sec) = ntohl(*p++); \
127 (x.tv_nsec) = ntohl(*p++); \
128} while (0)
129#define READ_BUF(nbytes) do { \
130 p = xdr_inline_decode(xdr, nbytes); \
131 if (!p) { \
132 dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
133 __func__, __LINE__); \
134 return -EIO; \
135 } \
136} while (0)
137
138struct nfs4_cb_compound_hdr { 77struct nfs4_cb_compound_hdr {
139 /* args */ 78 /* args */
140 u32 ident; /* minorversion 0 only */ 79 u32 ident; /* minorversion 0 only */
@@ -145,294 +84,513 @@ struct nfs4_cb_compound_hdr {
145 int status; 84 int status;
146}; 85};
147 86
148static struct { 87/*
149 int stat; 88 * Handle decode buffer overflows out-of-line.
150 int errno; 89 */
151} nfs_cb_errtbl[] = { 90static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
152 { NFS4_OK, 0 }, 91{
153 { NFS4ERR_PERM, EPERM }, 92 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
154 { NFS4ERR_NOENT, ENOENT }, 93 "Remaining buffer length is %tu words.\n",
155 { NFS4ERR_IO, EIO }, 94 func, xdr->end - xdr->p);
156 { NFS4ERR_NXIO, ENXIO }, 95}
157 { NFS4ERR_ACCESS, EACCES },
158 { NFS4ERR_EXIST, EEXIST },
159 { NFS4ERR_XDEV, EXDEV },
160 { NFS4ERR_NOTDIR, ENOTDIR },
161 { NFS4ERR_ISDIR, EISDIR },
162 { NFS4ERR_INVAL, EINVAL },
163 { NFS4ERR_FBIG, EFBIG },
164 { NFS4ERR_NOSPC, ENOSPC },
165 { NFS4ERR_ROFS, EROFS },
166 { NFS4ERR_MLINK, EMLINK },
167 { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
168 { NFS4ERR_NOTEMPTY, ENOTEMPTY },
169 { NFS4ERR_DQUOT, EDQUOT },
170 { NFS4ERR_STALE, ESTALE },
171 { NFS4ERR_BADHANDLE, EBADHANDLE },
172 { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
173 { NFS4ERR_NOTSUPP, ENOTSUPP },
174 { NFS4ERR_TOOSMALL, ETOOSMALL },
175 { NFS4ERR_SERVERFAULT, ESERVERFAULT },
176 { NFS4ERR_BADTYPE, EBADTYPE },
177 { NFS4ERR_LOCKED, EAGAIN },
178 { NFS4ERR_RESOURCE, EREMOTEIO },
179 { NFS4ERR_SYMLINK, ELOOP },
180 { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
181 { NFS4ERR_DEADLOCK, EDEADLK },
182 { -1, EIO }
183};
184 96
185static int 97static __be32 *xdr_encode_empty_array(__be32 *p)
186nfs_cb_stat_to_errno(int stat)
187{ 98{
188 int i; 99 *p++ = xdr_zero;
189 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { 100 return p;
190 if (nfs_cb_errtbl[i].stat == stat)
191 return nfs_cb_errtbl[i].errno;
192 }
193 /* If we cannot translate the error, the recovery routines should
194 * handle it.
195 * Note: remaining NFSv4 error codes have values > 10000, so should
196 * not conflict with native Linux error codes.
197 */
198 return stat;
199} 101}
200 102
201/* 103/*
202 * XDR encode 104 * Encode/decode NFSv4 CB basic data types
105 *
106 * Basic NFSv4 callback data types are defined in section 15 of RFC
107 * 3530: "Network File System (NFS) version 4 Protocol" and section
108 * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
109 * 1 Protocol"
203 */ 110 */
204 111
205static void 112/*
206encode_stateid(struct xdr_stream *xdr, stateid_t *sid) 113 * nfs_cb_opnum4
114 *
115 * enum nfs_cb_opnum4 {
116 * OP_CB_GETATTR = 3,
117 * ...
118 * };
119 */
120enum nfs_cb_opnum4 {
121 OP_CB_GETATTR = 3,
122 OP_CB_RECALL = 4,
123 OP_CB_LAYOUTRECALL = 5,
124 OP_CB_NOTIFY = 6,
125 OP_CB_PUSH_DELEG = 7,
126 OP_CB_RECALL_ANY = 8,
127 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
128 OP_CB_RECALL_SLOT = 10,
129 OP_CB_SEQUENCE = 11,
130 OP_CB_WANTS_CANCELLED = 12,
131 OP_CB_NOTIFY_LOCK = 13,
132 OP_CB_NOTIFY_DEVICEID = 14,
133 OP_CB_ILLEGAL = 10044
134};
135
136static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
207{ 137{
208 __be32 *p; 138 __be32 *p;
209 139
210 RESERVE_SPACE(sizeof(stateid_t)); 140 p = xdr_reserve_space(xdr, 4);
211 WRITE32(sid->si_generation); 141 *p = cpu_to_be32(op);
212 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
213} 142}
214 143
215static void 144/*
216encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 145 * nfs_fh4
146 *
147 * typedef opaque nfs_fh4<NFS4_FHSIZE>;
148 */
149static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
217{ 150{
218 __be32 * p; 151 u32 length = fh->fh_size;
152 __be32 *p;
219 153
220 RESERVE_SPACE(16); 154 BUG_ON(length > NFS4_FHSIZE);
221 WRITE32(0); /* tag length is always 0 */ 155 p = xdr_reserve_space(xdr, 4 + length);
222 WRITE32(hdr->minorversion); 156 xdr_encode_opaque(p, &fh->fh_base, length);
223 WRITE32(hdr->ident);
224 hdr->nops_p = p;
225 WRITE32(hdr->nops);
226} 157}
227 158
228static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) 159/*
160 * stateid4
161 *
162 * struct stateid4 {
163 * uint32_t seqid;
164 * opaque other[12];
165 * };
166 */
167static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
229{ 168{
230 *hdr->nops_p = htonl(hdr->nops); 169 __be32 *p;
170
171 p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
172 *p++ = cpu_to_be32(sid->si_generation);
173 xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
231} 174}
232 175
233static void 176/*
234encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, 177 * sessionid4
235 struct nfs4_cb_compound_hdr *hdr) 178 *
179 * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
180 */
181static void encode_sessionid4(struct xdr_stream *xdr,
182 const struct nfsd4_session *session)
236{ 183{
237 __be32 *p; 184 __be32 *p;
238 int len = dp->dl_fh.fh_size; 185
239 186 p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
240 RESERVE_SPACE(4); 187 xdr_encode_opaque_fixed(p, session->se_sessionid.data,
241 WRITE32(OP_CB_RECALL); 188 NFS4_MAX_SESSIONID_LEN);
242 encode_stateid(xdr, &dp->dl_stateid);
243 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
244 WRITE32(0); /* truncate optimization not implemented */
245 WRITE32(len);
246 WRITEMEM(&dp->dl_fh.fh_base, len);
247 hdr->nops++;
248} 189}
249 190
250static void 191/*
251encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, 192 * nfsstat4
252 struct nfs4_cb_compound_hdr *hdr) 193 */
253{ 194static const struct {
254 __be32 *p; 195 int stat;
196 int errno;
197} nfs_cb_errtbl[] = {
198 { NFS4_OK, 0 },
199 { NFS4ERR_PERM, -EPERM },
200 { NFS4ERR_NOENT, -ENOENT },
201 { NFS4ERR_IO, -EIO },
202 { NFS4ERR_NXIO, -ENXIO },
203 { NFS4ERR_ACCESS, -EACCES },
204 { NFS4ERR_EXIST, -EEXIST },
205 { NFS4ERR_XDEV, -EXDEV },
206 { NFS4ERR_NOTDIR, -ENOTDIR },
207 { NFS4ERR_ISDIR, -EISDIR },
208 { NFS4ERR_INVAL, -EINVAL },
209 { NFS4ERR_FBIG, -EFBIG },
210 { NFS4ERR_NOSPC, -ENOSPC },
211 { NFS4ERR_ROFS, -EROFS },
212 { NFS4ERR_MLINK, -EMLINK },
213 { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
214 { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
215 { NFS4ERR_DQUOT, -EDQUOT },
216 { NFS4ERR_STALE, -ESTALE },
217 { NFS4ERR_BADHANDLE, -EBADHANDLE },
218 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
219 { NFS4ERR_NOTSUPP, -ENOTSUPP },
220 { NFS4ERR_TOOSMALL, -ETOOSMALL },
221 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
222 { NFS4ERR_BADTYPE, -EBADTYPE },
223 { NFS4ERR_LOCKED, -EAGAIN },
224 { NFS4ERR_RESOURCE, -EREMOTEIO },
225 { NFS4ERR_SYMLINK, -ELOOP },
226 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
227 { NFS4ERR_DEADLOCK, -EDEADLK },
228 { -1, -EIO }
229};
255 230
256 if (hdr->minorversion == 0) 231/*
257 return; 232 * If we cannot translate the error, the recovery routines should
233 * handle it.
234 *
235 * Note: remaining NFSv4 error codes have values > 10000, so should
236 * not conflict with native Linux error codes.
237 */
238static int nfs_cb_stat_to_errno(int status)
239{
240 int i;
258 241
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 242 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
243 if (nfs_cb_errtbl[i].stat == status)
244 return nfs_cb_errtbl[i].errno;
245 }
260 246
261 WRITE32(OP_CB_SEQUENCE); 247 dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
262 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); 248 return -status;
263 WRITE32(args->cbs_clp->cl_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */
267 WRITE32(0); /* FIXME: support referring_call_lists */
268 hdr->nops++;
269} 249}
270 250
271static int 251static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
272nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 252 enum nfsstat4 *status)
273{ 253{
274 struct xdr_stream xdrs, *xdr = &xdrs; 254 __be32 *p;
255 u32 op;
275 256
276 xdr_init_encode(&xdrs, &req->rq_snd_buf, p); 257 p = xdr_inline_decode(xdr, 4 + 4);
277 RESERVE_SPACE(0); 258 if (unlikely(p == NULL))
259 goto out_overflow;
260 op = be32_to_cpup(p++);
261 if (unlikely(op != expected))
262 goto out_unexpected;
263 *status = be32_to_cpup(p);
278 return 0; 264 return 0;
265out_overflow:
266 print_overflow_msg(__func__, xdr);
267 return -EIO;
268out_unexpected:
269 dprintk("NFSD: Callback server returned operation %d but "
270 "we issued a request for %d\n", op, expected);
271 return -EIO;
279} 272}
280 273
281static int 274/*
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 275 * CB_COMPOUND4args
283 struct nfs4_rpc_args *rpc_args) 276 *
277 * struct CB_COMPOUND4args {
278 * utf8str_cs tag;
279 * uint32_t minorversion;
280 * uint32_t callback_ident;
281 * nfs_cb_argop4 argarray<>;
282 * };
283*/
284static void encode_cb_compound4args(struct xdr_stream *xdr,
285 struct nfs4_cb_compound_hdr *hdr)
284{ 286{
285 struct xdr_stream xdr; 287 __be32 * p;
286 struct nfs4_delegation *args = rpc_args->args_op;
287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = args->dl_ident,
289 .minorversion = rpc_args->args_seq.cbs_minorversion,
290 };
291 288
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 289 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
293 encode_cb_compound_hdr(&xdr, &hdr); 290 p = xdr_encode_empty_array(p); /* empty tag */
294 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); 291 *p++ = cpu_to_be32(hdr->minorversion);
295 encode_cb_recall(&xdr, args, &hdr); 292 *p++ = cpu_to_be32(hdr->ident);
296 encode_cb_nops(&hdr); 293
294 hdr->nops_p = p;
295 *p = cpu_to_be32(hdr->nops); /* argarray element count */
296}
297
298/*
299 * Update argarray element count
300 */
301static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
302{
303 BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
304 *hdr->nops_p = cpu_to_be32(hdr->nops);
305}
306
307/*
308 * CB_COMPOUND4res
309 *
310 * struct CB_COMPOUND4res {
311 * nfsstat4 status;
312 * utf8str_cs tag;
313 * nfs_cb_resop4 resarray<>;
314 * };
315 */
316static int decode_cb_compound4res(struct xdr_stream *xdr,
317 struct nfs4_cb_compound_hdr *hdr)
318{
319 u32 length;
320 __be32 *p;
321
322 p = xdr_inline_decode(xdr, 4 + 4);
323 if (unlikely(p == NULL))
324 goto out_overflow;
325 hdr->status = be32_to_cpup(p++);
326 /* Ignore the tag */
327 length = be32_to_cpup(p++);
328 p = xdr_inline_decode(xdr, length + 4);
329 if (unlikely(p == NULL))
330 goto out_overflow;
331 hdr->nops = be32_to_cpup(p);
297 return 0; 332 return 0;
333out_overflow:
334 print_overflow_msg(__func__, xdr);
335 return -EIO;
298} 336}
299 337
338/*
339 * CB_RECALL4args
340 *
341 * struct CB_RECALL4args {
342 * stateid4 stateid;
343 * bool truncate;
344 * nfs_fh4 fh;
345 * };
346 */
347static void encode_cb_recall4args(struct xdr_stream *xdr,
348 const struct nfs4_delegation *dp,
349 struct nfs4_cb_compound_hdr *hdr)
350{
351 __be32 *p;
352
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid);
300 355
301static int 356 p = xdr_reserve_space(xdr, 4);
302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 357 *p++ = xdr_zero; /* truncate */
303 __be32 *p;
304 u32 taglen;
305 358
306 READ_BUF(8); 359 encode_nfs_fh4(xdr, &dp->dl_fh);
307 READ32(hdr->status); 360
308 /* We've got no use for the tag; ignore it: */ 361 hdr->nops++;
309 READ32(taglen);
310 READ_BUF(taglen + 4);
311 p += XDR_QUADLEN(taglen);
312 READ32(hdr->nops);
313 return 0;
314} 362}
315 363
316static int 364/*
317decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 365 * CB_SEQUENCE4args
366 *
367 * struct CB_SEQUENCE4args {
368 * sessionid4 csa_sessionid;
369 * sequenceid4 csa_sequenceid;
370 * slotid4 csa_slotid;
371 * slotid4 csa_highest_slotid;
372 * bool csa_cachethis;
373 * referring_call_list4 csa_referring_call_lists<>;
374 * };
375 */
376static void encode_cb_sequence4args(struct xdr_stream *xdr,
377 const struct nfsd4_callback *cb,
378 struct nfs4_cb_compound_hdr *hdr)
318{ 379{
380 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
319 __be32 *p; 381 __be32 *p;
320 u32 op; 382
321 int32_t nfserr; 383 if (hdr->minorversion == 0)
322 384 return;
323 READ_BUF(8); 385
324 READ32(op); 386 encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
325 if (op != expected) { 387 encode_sessionid4(xdr, session);
326 dprintk("NFSD: decode_cb_op_hdr: Callback server returned " 388
327 " operation %d but we issued a request for %d\n", 389 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
328 op, expected); 390 *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */
329 return -EIO; 391 *p++ = xdr_zero; /* csa_slotid */
330 } 392 *p++ = xdr_zero; /* csa_highest_slotid */
331 READ32(nfserr); 393 *p++ = xdr_zero; /* csa_cachethis */
332 if (nfserr != NFS_OK) 394 xdr_encode_empty_array(p); /* csa_referring_call_lists */
333 return -nfs_cb_stat_to_errno(nfserr); 395
334 return 0; 396 hdr->nops++;
335} 397}
336 398
337/* 399/*
400 * CB_SEQUENCE4resok
401 *
402 * struct CB_SEQUENCE4resok {
403 * sessionid4 csr_sessionid;
404 * sequenceid4 csr_sequenceid;
405 * slotid4 csr_slotid;
406 * slotid4 csr_highest_slotid;
407 * slotid4 csr_target_highest_slotid;
408 * };
409 *
410 * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
411 * case NFS4_OK:
412 * CB_SEQUENCE4resok csr_resok4;
413 * default:
414 * void;
415 * };
416 *
338 * Our current back channel implementation supports a single backchannel 417 * Our current back channel implementation supports a single backchannel
339 * with a single slot. 418 * with a single slot.
340 */ 419 */
341static int 420static int decode_cb_sequence4resok(struct xdr_stream *xdr,
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, 421 struct nfsd4_callback *cb)
343 struct rpc_rqst *rqstp)
344{ 422{
423 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
345 struct nfs4_sessionid id; 424 struct nfs4_sessionid id;
346 int status; 425 int status;
347 u32 dummy;
348 __be32 *p; 426 __be32 *p;
427 u32 dummy;
349 428
350 if (res->cbs_minorversion == 0) 429 status = -ESERVERFAULT;
351 return 0;
352
353 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
354 if (status)
355 return status;
356 430
357 /* 431 /*
358 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
359 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
360 */ 434 */
361 status = -ESERVERFAULT; 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
362 436 if (unlikely(p == NULL))
363 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 437 goto out_overflow;
364 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
365 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 439 if (memcmp(id.data, session->se_sessionid.data,
366 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, 440 NFS4_MAX_SESSIONID_LEN) != 0) {
367 NFS4_MAX_SESSIONID_LEN)) { 441 dprintk("NFS: %s Invalid session id\n", __func__);
368 dprintk("%s Invalid session id\n", __func__);
369 goto out; 442 goto out;
370 } 443 }
371 READ32(dummy); 444 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
372 if (dummy != res->cbs_clp->cl_cb_seq_nr) { 445
373 dprintk("%s Invalid sequence number\n", __func__); 446 dummy = be32_to_cpup(p++);
447 if (dummy != session->se_cb_seq_nr) {
448 dprintk("NFS: %s Invalid sequence number\n", __func__);
374 goto out; 449 goto out;
375 } 450 }
376 READ32(dummy); /* slotid must be 0 */ 451
452 dummy = be32_to_cpup(p++);
377 if (dummy != 0) { 453 if (dummy != 0) {
378 dprintk("%s Invalid slotid\n", __func__); 454 dprintk("NFS: %s Invalid slotid\n", __func__);
379 goto out; 455 goto out;
380 } 456 }
381 /* FIXME: process highest slotid and target highest slotid */ 457
458 /*
459 * FIXME: process highest slotid and target highest slotid
460 */
382 status = 0; 461 status = 0;
383out: 462out:
384 return status; 463 return status;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
467}
468
469static int decode_cb_sequence4res(struct xdr_stream *xdr,
470 struct nfsd4_callback *cb)
471{
472 enum nfsstat4 nfserr;
473 int status;
474
475 if (cb->cb_minorversion == 0)
476 return 0;
477
478 status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
479 if (unlikely(status))
480 goto out;
481 if (unlikely(nfserr != NFS4_OK))
482 goto out_default;
483 status = decode_cb_sequence4resok(xdr, cb);
484out:
485 return status;
486out_default:
 487 return nfs_cb_stat_to_errno(nfserr);
488}
489
490/*
491 * NFSv4.0 and NFSv4.1 XDR encode functions
492 *
493 * NFSv4.0 callback argument types are defined in section 15 of RFC
494 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
495 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
496 * Protocol".
497 */
498
499/*
500 * NB: Without this zero space reservation, callbacks over krb5p fail
501 */
502static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
503 void *__unused)
504{
505 xdr_reserve_space(xdr, 0);
506}
507
508/*
509 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
510 */
511static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
512 const struct nfsd4_callback *cb)
513{
514 const struct nfs4_delegation *args = cb->cb_op;
515 struct nfs4_cb_compound_hdr hdr = {
516 .ident = cb->cb_clp->cl_cb_ident,
517 .minorversion = cb->cb_minorversion,
518 };
519
520 encode_cb_compound4args(xdr, &hdr);
521 encode_cb_sequence4args(xdr, cb, &hdr);
522 encode_cb_recall4args(xdr, args, &hdr);
523 encode_cb_nops(&hdr);
385} 524}
386 525
387 526
388static int 527/*
389nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 528 * NFSv4.0 and NFSv4.1 XDR decode functions
529 *
530 * NFSv4.0 callback result types are defined in section 15 of RFC
531 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
532 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
533 * Protocol".
534 */
535
536static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
537 void *__unused)
390{ 538{
391 return 0; 539 return 0;
392} 540}
393 541
394static int 542/*
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 543 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
396 struct nfsd4_cb_sequence *seq) 544 */
545static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
546 struct xdr_stream *xdr,
547 struct nfsd4_callback *cb)
397{ 548{
398 struct xdr_stream xdr;
399 struct nfs4_cb_compound_hdr hdr; 549 struct nfs4_cb_compound_hdr hdr;
550 enum nfsstat4 nfserr;
400 int status; 551 int status;
401 552
402 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 553 status = decode_cb_compound4res(xdr, &hdr);
403 status = decode_cb_compound_hdr(&xdr, &hdr); 554 if (unlikely(status))
404 if (status)
405 goto out; 555 goto out;
406 if (seq) { 556
407 status = decode_cb_sequence(&xdr, seq, rqstp); 557 if (cb != NULL) {
408 if (status) 558 status = decode_cb_sequence4res(xdr, cb);
559 if (unlikely(status))
409 goto out; 560 goto out;
410 } 561 }
411 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 562
563 status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
564 if (unlikely(status))
565 goto out;
566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default;
412out: 568out:
413 return status; 569 return status;
570out_default:
 571 return nfs_cb_stat_to_errno(nfserr);
414} 572}
415 573
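
All of the encoders above follow the same xdr_stream discipline: reserve buffer space first, then store big-endian words through the returned pointer. A distilled sketch (encode_u32 is an illustrative name, and the check for a failed reservation is omitted, as in most of the callers above):

#include <linux/sunrpc/xdr.h>

/* Distilled form of the encode pattern used throughout this file. */
static void encode_u32(struct xdr_stream *xdr, u32 value)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4);	/* one 4-byte XDR word */
	*p = cpu_to_be32(value);	/* host to wire byte order */
}
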
416/* 574/*
417 * RPC procedure tables 575 * RPC procedure tables
418 */ 576 */
419#define PROC(proc, call, argtype, restype) \ 577#define PROC(proc, call, argtype, restype) \
420[NFSPROC4_CLNT_##proc] = { \ 578[NFSPROC4_CLNT_##proc] = { \
421 .p_proc = NFSPROC4_CB_##call, \ 579 .p_proc = NFSPROC4_CB_##call, \
422 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 580 .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \
423 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 581 .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \
424 .p_arglen = NFS4_##argtype##_sz, \ 582 .p_arglen = NFS4_enc_##argtype##_sz, \
425 .p_replen = NFS4_##restype##_sz, \ 583 .p_replen = NFS4_dec_##restype##_sz, \
426 .p_statidx = NFSPROC4_CB_##call, \ 584 .p_statidx = NFSPROC4_CB_##call, \
427 .p_name = #proc, \ 585 .p_name = #proc, \
428} 586}
429 587
430static struct rpc_procinfo nfs4_cb_procedures[] = { 588static struct rpc_procinfo nfs4_cb_procedures[] = {
431 PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), 589 PROC(CB_NULL, NULL, cb_null, cb_null),
432 PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), 590 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
433}; 591};
434 592
435static struct rpc_version nfs_cb_version4 = { 593static struct rpc_version nfs_cb_version4 = {
436/* 594/*
437 * Note on the callback rpc program version number: despite language in rfc 595 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the 596 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = {
440 * in practice that appears to be what implementations use. The section 598 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum. 599 * 18.36.3 language is expected to be fixed in an erratum.
442 */ 600 */
443 .number = 1, 601 .number = 1,
444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 602 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
445 .procs = nfs4_cb_procedures 603 .procs = nfs4_cb_procedures
446}; 604};
447 605
448static struct rpc_version * nfs_cb_version[] = { 606static struct rpc_version *nfs_cb_version[] = {
449 &nfs_cb_version4, 607 &nfs_cb_version4,
450}; 608};
451 609
452static struct rpc_program cb_program; 610static struct rpc_program cb_program;
453 611
454static struct rpc_stat cb_stats = { 612static struct rpc_stat cb_stats = {
455 .program = &cb_program 613 .program = &cb_program
456}; 614};
457 615
458#define NFS4_CALLBACK 0x40000000 616#define NFS4_CALLBACK 0x40000000
459static struct rpc_program cb_program = { 617static struct rpc_program cb_program = {
460 .name = "nfs4_cb", 618 .name = "nfs4_cb",
461 .number = NFS4_CALLBACK, 619 .number = NFS4_CALLBACK,
462 .nrvers = ARRAY_SIZE(nfs_cb_version), 620 .nrvers = ARRAY_SIZE(nfs_cb_version),
463 .version = nfs_cb_version, 621 .version = nfs_cb_version,
464 .stats = &cb_stats, 622 .stats = &cb_stats,
465 .pipe_dir_name = "/nfsd4_cb", 623 .pipe_dir_name = "/nfsd4_cb",
466}; 624};
467 625
468static int max_cb_time(void) 626static int max_cb_time(void)
@@ -470,33 +628,40 @@ static int max_cb_time(void)
470 return max(nfsd4_lease/10, (time_t)1) * HZ; 628 return max(nfsd4_lease/10, (time_t)1) * HZ;
471} 629}
472 630
473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */
475 631
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 632static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
477{ 633{
478 struct rpc_timeout timeparms = { 634 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 635 .to_initval = max_cb_time(),
480 .to_retries = 0, 636 .to_retries = 0,
481 }; 637 };
482 struct rpc_create_args args = { 638 struct rpc_create_args args = {
483 .protocol = XPRT_TRANSPORT_TCP, 639 .net = &init_net,
484 .address = (struct sockaddr *) &cb->cb_addr, 640 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = cb->cb_addrlen, 641 .addrsize = conn->cb_addrlen,
642 .saddress = (struct sockaddr *) &conn->cb_saddr,
486 .timeout = &timeparms, 643 .timeout = &timeparms,
487 .program = &cb_program, 644 .program = &cb_program,
488 .prognumber = cb->cb_prog,
489 .version = 0, 645 .version = 0,
490 .authflavor = clp->cl_flavor, 646 .authflavor = clp->cl_flavor,
491 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 647 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
492 .client_name = clp->cl_principal,
493 }; 648 };
494 struct rpc_clnt *client; 649 struct rpc_clnt *client;
495 650
496 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 651 if (clp->cl_minorversion == 0) {
497 return -EINVAL; 652 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
498 if (cb->cb_minorversion) { 653 return -EINVAL;
499 args.bc_xprt = cb->cb_xprt; 654 args.client_name = clp->cl_principal;
 655 args.prognumber = conn->cb_prog;
656 args.protocol = XPRT_TRANSPORT_TCP;
657 clp->cl_cb_ident = conn->cb_ident;
658 } else {
659 if (!conn->cb_xprt)
660 return -EINVAL;
661 clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
662 clp->cl_cb_session = ses;
663 args.bc_xprt = conn->cb_xprt;
664 args.prognumber = clp->cl_cb_session->se_cb_prog;
500 args.protocol = XPRT_TRANSPORT_BC_TCP; 665 args.protocol = XPRT_TRANSPORT_BC_TCP;
501 } 666 }
502 /* Create RPC client */ 667 /* Create RPC client */
@@ -506,7 +671,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
506 PTR_ERR(client)); 671 PTR_ERR(client));
507 return PTR_ERR(client); 672 return PTR_ERR(client);
508 } 673 }
509 nfsd4_set_callback_client(clp, client); 674 clp->cl_cb_client = client;
510 return 0; 675 return 0;
511 676
512} 677}
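
The branch above picks the transport per minor version: for NFSv4.0 the server dials a fresh TCP connection to the client's advertised callback address, while for NFSv4.1 it reuses the client's existing connection as a backchannel. A decision sketch (illustrative helper, not kernel code):

#include <linux/types.h>

/* Sketch of the transport choice made in setup_callback_client(). */
static const char *cb_transport_choice(u32 minorversion, bool have_bc_xprt)
{
	if (minorversion == 0)
		return "XPRT_TRANSPORT_TCP";	/* dial the client back */
	if (!have_bc_xprt)
		return "-EINVAL";		/* no backchannel bound */
	return "XPRT_TRANSPORT_BC_TCP";		/* reuse the fore channel */
}
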
@@ -517,17 +682,25 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
517 (int)clp->cl_name.len, clp->cl_name.data, reason); 682 (int)clp->cl_name.len, clp->cl_name.data, reason);
518} 683}
519 684
685static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686{
687 clp->cl_cb_state = NFSD4_CB_DOWN;
688 warn_no_callback_path(clp, reason);
689}
690
520static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 691static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
521{ 692{
522 struct nfs4_client *clp = calldata; 693 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
523 694
524 if (task->tk_status) 695 if (task->tk_status)
525 warn_no_callback_path(clp, task->tk_status); 696 nfsd4_mark_cb_down(clp, task->tk_status);
526 else 697 else
527 atomic_set(&clp->cl_cb_set, 1); 698 clp->cl_cb_state = NFSD4_CB_UP;
528} 699}
529 700
530static const struct rpc_call_ops nfsd4_cb_probe_ops = { 701static const struct rpc_call_ops nfsd4_cb_probe_ops = {
702 /* XXX: release method to ensure we set the cb channel down if
703 * necessary on early failure? */
531 .rpc_call_done = nfsd4_cb_probe_done, 704 .rpc_call_done = nfsd4_cb_probe_done,
532}; 705};
533 706
@@ -543,38 +716,54 @@ int set_callback_cred(void)
543 return 0; 716 return 0;
544} 717}
545 718
719static struct workqueue_struct *callback_wq;
546 720
547void do_probe_callback(struct nfs4_client *clp) 721static void run_nfsd4_cb(struct nfsd4_callback *cb)
548{ 722{
549 struct rpc_message msg = { 723 queue_work(callback_wq, &cb->cb_work);
550 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 724}
551 .rpc_argp = clp, 725
552 .rpc_cred = callback_cred 726static void do_probe_callback(struct nfs4_client *clp)
553 }; 727{
554 int status; 728 struct nfsd4_callback *cb = &clp->cl_cb_null;
729
730 cb->cb_op = NULL;
731 cb->cb_clp = clp;
732
733 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
734 cb->cb_msg.rpc_argp = NULL;
735 cb->cb_msg.rpc_resp = NULL;
736 cb->cb_msg.rpc_cred = callback_cred;
555 737
556 status = rpc_call_async(clp->cl_cb_client, &msg, 738 cb->cb_ops = &nfsd4_cb_probe_ops;
557 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 739
558 &nfsd4_cb_probe_ops, (void *)clp); 740 run_nfsd4_cb(cb);
559 if (status)
560 warn_no_callback_path(clp, status);
561} 741}
562 742
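run_nfsd4_cb() above just queues the callback's work item; a single-threaded workqueue then serializes all callback processing, which is what lets later code touch per-client callback state without extra locking. A minimal pthread model of that enqueue/serialize split (userspace sketch, not the kernel workqueue API):

#include <pthread.h>
#include <stdio.h>

struct work {
	void (*fn)(struct work *);
	struct work *next;
};

static struct work *queue_head;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcond = PTHREAD_COND_INITIALIZER;

/* Like queue_work(): hand off and return immediately. */
static void queue_work_item(struct work *w)
{
	pthread_mutex_lock(&qlock);
	w->next = queue_head;
	queue_head = w;
	pthread_cond_signal(&qcond);
	pthread_mutex_unlock(&qlock);
}

/* The lone worker thread: items never run concurrently. */
static void *worker(void *unused)
{
	struct work *w;

	(void)unused;
	pthread_mutex_lock(&qlock);
	while (!queue_head)
		pthread_cond_wait(&qcond, &qlock);
	w = queue_head;
	queue_head = w->next;
	pthread_mutex_unlock(&qlock);
	w->fn(w);			/* runs outside the lock */
	return NULL;
}

static void probe(struct work *w) { (void)w; puts("CB_NULL probe sent"); }

int main(void)
{
	pthread_t t;
	struct work probe_work = { .fn = probe };

	pthread_create(&t, NULL, worker, NULL);
	queue_work_item(&probe_work);	/* caller does not wait */
	pthread_join(t, NULL);		/* crude stand-in for flush_workqueue() */
	return 0;
}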
563/* 743/*
564 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 744 * Poke the callback thread to process any updates to the callback
745 * parameters, and send a null probe.
565 */ 746 */
566void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 747void nfsd4_probe_callback(struct nfs4_client *clp)
567{ 748{
568 int status; 749 /* XXX: atomicity? Also, should we be using cl_cb_flags? */
750 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
751 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
752 do_probe_callback(clp);
753}
569 754
570 BUG_ON(atomic_read(&clp->cl_cb_set)); 755void nfsd4_probe_callback_sync(struct nfs4_client *clp)
756{
757 nfsd4_probe_callback(clp);
758 flush_workqueue(callback_wq);
759}
571 760
572 status = setup_callback_client(clp, cb); 761void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
573 if (status) { 762{
574 warn_no_callback_path(clp, status); 763 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
575 return; 764 spin_lock(&clp->cl_lock);
576 } 765 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
577 do_probe_callback(clp); 766 spin_unlock(&clp->cl_lock);
578} 767}
579 768
580/* 769/*
@@ -582,33 +771,14 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
582 * If the slot is available, then mark it busy. Otherwise, set the 771 * If the slot is available, then mark it busy. Otherwise, set the
583 * thread for sleeping on the callback RPC wait queue. 772 * thread for sleeping on the callback RPC wait queue.
584 */ 773 */
585static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 774static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
586 struct rpc_task *task)
587{ 775{
588 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
589 u32 *ptr = (u32 *)clp->cl_sessionid.data;
590 int status = 0;
591
592 dprintk("%s: %u:%u:%u:%u\n", __func__,
593 ptr[0], ptr[1], ptr[2], ptr[3]);
594
595 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { 776 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
596 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); 777 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
597 dprintk("%s slot is busy\n", __func__); 778 dprintk("%s slot is busy\n", __func__);
598 status = -EAGAIN; 779 return false;
599 goto out;
600 } 780 }
601 781 return true;
602 /*
603 * We'll need the clp during XDR encoding and decoding,
604 * and the sequence during decoding to verify the reply
605 */
606 args->args_seq.cbs_clp = clp;
607 task->tk_msg.rpc_resp = &args->args_seq;
608
609out:
610 dprintk("%s status=%d\n", __func__, status);
611 return status;
612} 782}
613 783
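With the sequence bookkeeping moved into the XDR layer, nfsd41_cb_get_slot() is nothing but a test_and_set_bit() gate on the single backchannel slot: either claim it, or park the task to be woken when it frees. The same gate expressed with C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag cb_slot_busy = ATOMIC_FLAG_INIT;

static bool cb_get_slot(void)
{
	if (atomic_flag_test_and_set(&cb_slot_busy)) {
		/* busy: the real code sleeps on cl_cb_waitq here */
		return false;
	}
	return true;
}

/* Mirrors clear_bit() + rpc_wake_up_next() in nfsd4_cb_done(). */
static void cb_put_slot(void)
{
	atomic_flag_clear(&cb_slot_busy);
}

int main(void)
{
	printf("first claim: %s\n", cb_get_slot() ? "got slot" : "busy");
	printf("second claim: %s\n", cb_get_slot() ? "got slot" : "busy");
	cb_put_slot();
	printf("after release: %s\n", cb_get_slot() ? "got slot" : "busy");
	return 0;
}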
614/* 784/*
@@ -617,42 +787,42 @@ out:
617 */ 787 */
618static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) 788static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
619{ 789{
620 struct nfs4_delegation *dp = calldata; 790 struct nfsd4_callback *cb = calldata;
791 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
621 struct nfs4_client *clp = dp->dl_client; 792 struct nfs4_client *clp = dp->dl_client;
622 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 793 u32 minorversion = clp->cl_minorversion;
623 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
624 int status = 0;
625 794
626 args->args_seq.cbs_minorversion = minorversion; 795 cb->cb_minorversion = minorversion;
627 if (minorversion) { 796 if (minorversion) {
628 status = nfsd41_cb_setup_sequence(clp, task); 797 if (!nfsd41_cb_get_slot(clp, task))
629 if (status) {
630 if (status != -EAGAIN) {
631 /* terminate rpc task */
632 task->tk_status = status;
633 task->tk_action = NULL;
634 }
635 return; 798 return;
636 }
637 } 799 }
800 spin_lock(&clp->cl_lock);
801 if (list_empty(&cb->cb_per_client)) {
802 /* This is the first call, not a restart */
803 cb->cb_done = false;
804 list_add(&cb->cb_per_client, &clp->cl_callbacks);
805 }
806 spin_unlock(&clp->cl_lock);
638 rpc_call_start(task); 807 rpc_call_start(task);
639} 808}
640 809
641static void nfsd4_cb_done(struct rpc_task *task, void *calldata) 810static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
642{ 811{
643 struct nfs4_delegation *dp = calldata; 812 struct nfsd4_callback *cb = calldata;
813 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
644 struct nfs4_client *clp = dp->dl_client; 814 struct nfs4_client *clp = dp->dl_client;
645 815
646 dprintk("%s: minorversion=%d\n", __func__, 816 dprintk("%s: minorversion=%d\n", __func__,
647 clp->cl_cb_conn.cb_minorversion); 817 clp->cl_minorversion);
648 818
649 if (clp->cl_cb_conn.cb_minorversion) { 819 if (clp->cl_minorversion) {
650 /* No need for lock, access serialized in nfsd4_cb_prepare */ 820 /* No need for lock, access serialized in nfsd4_cb_prepare */
651 ++clp->cl_cb_seq_nr; 821 ++clp->cl_cb_session->se_cb_seq_nr;
652 clear_bit(0, &clp->cl_cb_slot_busy); 822 clear_bit(0, &clp->cl_cb_slot_busy);
653 rpc_wake_up_next(&clp->cl_cb_waitq); 823 rpc_wake_up_next(&clp->cl_cb_waitq);
654 dprintk("%s: freed slot, new seqid=%d\n", __func__, 824 dprintk("%s: freed slot, new seqid=%d\n", __func__,
655 clp->cl_cb_seq_nr); 825 clp->cl_cb_session->se_cb_seq_nr);
656 826
657 /* We're done looking into the sequence information */ 827 /* We're done looking into the sequence information */
658 task->tk_msg.rpc_resp = NULL; 828 task->tk_msg.rpc_resp = NULL;
@@ -662,21 +832,25 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
662 832
663static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 833static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
664{ 834{
665 struct nfs4_delegation *dp = calldata; 835 struct nfsd4_callback *cb = calldata;
836 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
666 struct nfs4_client *clp = dp->dl_client; 837 struct nfs4_client *clp = dp->dl_client;
667 struct rpc_clnt *current_rpc_client = clp->cl_cb_client; 838 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
668 839
669 nfsd4_cb_done(task, calldata); 840 nfsd4_cb_done(task, calldata);
670 841
671 if (current_rpc_client == NULL) { 842 if (current_rpc_client != task->tk_client) {
672 /* We're shutting down; give up. */ 843 /* We're shutting down or changing cl_cb_client; leave
673 /* XXX: err, or is it ok just to fall through 844 * it to nfsd4_process_cb_update to restart the call if
674 * and rpc_restart_call? */ 845 * necessary. */
675 return; 846 return;
676 } 847 }
677 848
849 if (cb->cb_done)
850 return;
678 switch (task->tk_status) { 851 switch (task->tk_status) {
679 case 0: 852 case 0:
853 cb->cb_done = true;
680 return; 854 return;
681 case -EBADHANDLE: 855 case -EBADHANDLE:
682 case -NFS4ERR_BAD_STATEID: 856 case -NFS4ERR_BAD_STATEID:
@@ -685,31 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
685 break; 859 break;
686 default: 860 default:
687 /* Network partition? */ 861 /* Network partition? */
688 atomic_set(&clp->cl_cb_set, 0); 862 nfsd4_mark_cb_down(clp, task->tk_status);
689 warn_no_callback_path(clp, task->tk_status);
690 if (current_rpc_client != task->tk_client) {
691 /* queue a callback on the new connection: */
692 atomic_inc(&dp->dl_count);
693 nfsd4_cb_recall(dp);
694 return;
695 }
696 } 863 }
697 if (dp->dl_retries--) { 864 if (dp->dl_retries--) {
698 rpc_delay(task, 2*HZ); 865 rpc_delay(task, 2*HZ);
699 task->tk_status = 0; 866 task->tk_status = 0;
700 rpc_restart_call_prepare(task); 867 rpc_restart_call_prepare(task);
701 return; 868 return;
702 } else {
703 atomic_set(&clp->cl_cb_set, 0);
704 warn_no_callback_path(clp, task->tk_status);
705 } 869 }
870 nfsd4_mark_cb_down(clp, task->tk_status);
871 cb->cb_done = true;
706} 872}
707 873
708static void nfsd4_cb_recall_release(void *calldata) 874static void nfsd4_cb_recall_release(void *calldata)
709{ 875{
710 struct nfs4_delegation *dp = calldata; 876 struct nfsd4_callback *cb = calldata;
711 877 struct nfs4_client *clp = cb->cb_clp;
712 nfs4_put_delegation(dp); 878 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
879
880 if (cb->cb_done) {
881 spin_lock(&clp->cl_lock);
882 list_del(&cb->cb_per_client);
883 spin_unlock(&clp->cl_lock);
884 nfs4_put_delegation(dp);
885 }
713} 886}
714 887
715static const struct rpc_call_ops nfsd4_cb_recall_ops = { 888static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -718,8 +891,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
718 .rpc_release = nfsd4_cb_recall_release, 891 .rpc_release = nfsd4_cb_recall_release,
719}; 892};
720 893
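nfsd4_cb_recall_ops is the usual ops-table idiom: a generic engine (here the sunrpc task machinery) drives prepare/done/release hooks it knows nothing about. A self-contained sketch of that dispatch; the engine and hook names below are invented, not the sunrpc API:

#include <stdio.h>

struct call;

struct call_ops {
	void (*prepare)(struct call *);
	void (*done)(struct call *);
	void (*release)(struct call *);
};

struct call {
	const struct call_ops *ops;
	int status;
};

/* Generic engine: invokes whatever hooks the table provides. */
static void run_call(struct call *c)
{
	if (c->ops->prepare)
		c->ops->prepare(c);
	if (c->ops->done)
		c->ops->done(c);
	if (c->ops->release)
		c->ops->release(c);
}

static void recall_prepare(struct call *c) { printf("prepare (status %d)\n", c->status); }
static void recall_done(struct call *c)    { printf("done (status %d)\n", c->status); }
static void recall_release(struct call *c) { printf("release (status %d)\n", c->status); }

static const struct call_ops recall_ops = {
	.prepare = recall_prepare,
	.done    = recall_done,
	.release = recall_release,
};

int main(void)
{
	struct call c = { .ops = &recall_ops, .status = 0 };

	run_call(&c);
	return 0;
}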
721static struct workqueue_struct *callback_wq;
722
723int nfsd4_create_callback_queue(void) 894int nfsd4_create_callback_queue(void)
724{ 895{
725 callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); 896 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -734,57 +905,123 @@ void nfsd4_destroy_callback_queue(void)
734} 905}
735 906
736/* must be called under the state lock */ 907/* must be called under the state lock */
737void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) 908void nfsd4_shutdown_callback(struct nfs4_client *clp)
738{ 909{
739 struct rpc_clnt *old = clp->cl_cb_client; 910 set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
740
741 clp->cl_cb_client = new;
742 /* 911 /*
743 * After this, any work that saw the old value of cl_cb_client will 912 * Note this won't actually result in a null callback;
744 * be gone: 913 * instead, nfsd4_do_callback_rpc() will detect the killed
914 * client, destroy the rpc client, and stop:
745 */ 915 */
916 do_probe_callback(clp);
746 flush_workqueue(callback_wq); 917 flush_workqueue(callback_wq);
747 /* So we can safely shut it down: */
748 if (old)
749 rpc_shutdown_client(old);
750} 918}
751 919
752/* 920static void nfsd4_release_cb(struct nfsd4_callback *cb)
753 * called with dp->dl_count inc'ed.
754 */
755static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
756{ 921{
757 struct nfs4_client *clp = dp->dl_client; 922 if (cb->cb_ops->rpc_release)
758 struct rpc_clnt *clnt = clp->cl_cb_client; 923 cb->cb_ops->rpc_release(cb);
759 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; 924}
760 struct rpc_message msg = {
761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
762 .rpc_cred = callback_cred
763 };
764 925
765 if (clnt == NULL) { 926/* requires cl_lock: */
766 nfs4_put_delegation(dp); 927static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
767 return; /* Client is shutting down; give up. */ 928{
929 struct nfsd4_session *s;
930 struct nfsd4_conn *c;
931
932 list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
933 list_for_each_entry(c, &s->se_conns, cn_persession) {
934 if (c->cn_flags & NFS4_CDFC4_BACK)
935 return c;
936 }
768 } 937 }
938 return NULL;
939}
769 940
770 args->args_op = dp; 941static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
771 msg.rpc_argp = args; 942{
772 dp->dl_retries = 1; 943 struct nfs4_cb_conn conn;
773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); 944 struct nfs4_client *clp = cb->cb_clp;
945 struct nfsd4_session *ses = NULL;
946 struct nfsd4_conn *c;
947 int err;
948
949 /*
950 * This is either an update, or the client dying; in either case,
951 * kill the old client:
952 */
953 if (clp->cl_cb_client) {
954 rpc_shutdown_client(clp->cl_cb_client);
955 clp->cl_cb_client = NULL;
956 }
957 if (clp->cl_cb_conn.cb_xprt) {
958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
959 clp->cl_cb_conn.cb_xprt = NULL;
960 }
961 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
962 return;
963 spin_lock(&clp->cl_lock);
964 /*
965 * Only serialized callback code is allowed to clear these
966 * flags; main nfsd code can only set them:
967 */
968 BUG_ON(!clp->cl_cb_flags);
969 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
970 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
971 c = __nfsd4_find_backchannel(clp);
972 if (c) {
973 svc_xprt_get(c->cn_xprt);
974 conn.cb_xprt = c->cn_xprt;
975 ses = c->cn_session;
976 }
977 spin_unlock(&clp->cl_lock);
978
979 err = setup_callback_client(clp, &conn, ses);
980 if (err) {
981 warn_no_callback_path(clp, err);
982 return;
983 }
984 /* Yay, the callback channel's back! Restart any callbacks: */
985 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
986 run_nfsd4_cb(cb);
774} 987}
775 988
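The shape of nfsd4_process_cb_update() is worth noting: tear down the old client unconditionally, snapshot cl_cb_conn into a stack copy while holding cl_lock, then do the slow client setup with the lock dropped. A userspace model of that snapshot-then-work pattern (every name below is a stand-in):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct conn { char addr[32]; };

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct conn cl_cb_conn = { .addr = "192.0.2.7" };

static int setup_client(const struct conn *c)
{
	/* slow work (socket setup etc.) done without the lock held */
	printf("connecting callback client to %s\n", c->addr);
	return 0;
}

static void process_update(void)
{
	struct conn snapshot;

	pthread_mutex_lock(&cl_lock);
	memcpy(&snapshot, &cl_cb_conn, sizeof(snapshot));	/* cheap, locked */
	pthread_mutex_unlock(&cl_lock);

	if (setup_client(&snapshot))			/* slow, unlocked */
		fprintf(stderr, "no callback path\n");
}

int main(void)
{
	process_update();
	return 0;
}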
776void nfsd4_do_callback_rpc(struct work_struct *w) 989void nfsd4_do_callback_rpc(struct work_struct *w)
777{ 990{
778 /* XXX: for now, just send off delegation recall. */ 991 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
779 /* In future, generalize to handle any sort of callback. */ 992 struct nfs4_client *clp = cb->cb_clp;
780 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); 993 struct rpc_clnt *clnt;
781 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
782 994
783 _nfsd4_cb_recall(dp); 995 if (clp->cl_cb_flags)
784} 996 nfsd4_process_cb_update(cb);
785 997
998 clnt = clp->cl_cb_client;
999 if (!clnt) {
1000 /* Callback channel broken, or client killed; give up: */
1001 nfsd4_release_cb(cb);
1002 return;
1003 }
1004 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1005 cb->cb_ops, cb);
1006}
786 1007
787void nfsd4_cb_recall(struct nfs4_delegation *dp) 1008void nfsd4_cb_recall(struct nfs4_delegation *dp)
788{ 1009{
789 queue_work(callback_wq, &dp->dl_recall.cb_work); 1010 struct nfsd4_callback *cb = &dp->dl_recall;
1011 struct nfs4_client *clp = dp->dl_client;
1012
1013 dp->dl_retries = 1;
1014 cb->cb_op = dp;
1015 cb->cb_clp = clp;
1016 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
1017 cb->cb_msg.rpc_argp = cb;
1018 cb->cb_msg.rpc_resp = cb;
1019 cb->cb_msg.rpc_cred = callback_cred;
1020
1021 cb->cb_ops = &nfsd4_cb_recall_ops;
 1022
 1023 INIT_LIST_HEAD(&cb->cb_per_client);
 1024 cb->cb_done = true;
 1025
 1026 run_nfsd4_cb(&dp->dl_recall);
 790} 1027}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
33 */ 33 */
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 36#include <linux/seq_file.h>
38#include <linux/sched.h> 37#include <linux/sched.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
39#include "idmap.h"
40#include "nfsd.h"
40 41
41/* 42/*
42 * Cache entry 43 * Cache entry
@@ -482,109 +483,26 @@ nfsd_idmap_shutdown(void)
482 cache_unregister(&nametoid_cache); 483 cache_unregister(&nametoid_cache);
483} 484}
484 485
485/*
486 * Deferred request handling
487 */
488
489struct idmap_defer_req {
490 struct cache_req req;
491 struct cache_deferred_req deferred_req;
492 wait_queue_head_t waitq;
493 atomic_t count;
494};
495
496static inline void
497put_mdr(struct idmap_defer_req *mdr)
498{
499 if (atomic_dec_and_test(&mdr->count))
500 kfree(mdr);
501}
502
503static inline void
504get_mdr(struct idmap_defer_req *mdr)
505{
506 atomic_inc(&mdr->count);
507}
508
509static void
510idmap_revisit(struct cache_deferred_req *dreq, int toomany)
511{
512 struct idmap_defer_req *mdr =
513 container_of(dreq, struct idmap_defer_req, deferred_req);
514
515 wake_up(&mdr->waitq);
516 put_mdr(mdr);
517}
518
519static struct cache_deferred_req *
520idmap_defer(struct cache_req *req)
521{
522 struct idmap_defer_req *mdr =
523 container_of(req, struct idmap_defer_req, req);
524
525 mdr->deferred_req.revisit = idmap_revisit;
526 get_mdr(mdr);
527 return (&mdr->deferred_req);
528}
529
530static inline int
531do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
532 struct cache_detail *detail, struct ent **item,
533 struct idmap_defer_req *mdr)
534{
535 *item = lookup_fn(key);
536 if (!*item)
537 return -ENOMEM;
538 return cache_check(detail, &(*item)->h, &mdr->req);
539}
540
541static inline int
542do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
543 struct ent *key, struct cache_detail *detail,
544 struct ent **item)
545{
546 int ret = -ENOMEM;
547
548 *item = lookup_fn(key);
549 if (!*item)
550 goto out_err;
551 ret = -ETIMEDOUT;
552 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
553 || (*item)->h.expiry_time < get_seconds()
554 || detail->flush_time > (*item)->h.last_refresh)
555 goto out_put;
556 ret = -ENOENT;
557 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
558 goto out_put;
559 return 0;
560out_put:
561 cache_put(&(*item)->h, detail);
562out_err:
563 *item = NULL;
564 return ret;
565}
566
567static int 486static int
568idmap_lookup(struct svc_rqst *rqstp, 487idmap_lookup(struct svc_rqst *rqstp,
569 struct ent *(*lookup_fn)(struct ent *), struct ent *key, 488 struct ent *(*lookup_fn)(struct ent *), struct ent *key,
570 struct cache_detail *detail, struct ent **item) 489 struct cache_detail *detail, struct ent **item)
571{ 490{
572 struct idmap_defer_req *mdr;
573 int ret; 491 int ret;
574 492
575 mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); 493 *item = lookup_fn(key);
576 if (!mdr) 494 if (!*item)
577 return -ENOMEM; 495 return -ENOMEM;
578 atomic_set(&mdr->count, 1); 496 retry:
579 init_waitqueue_head(&mdr->waitq); 497 ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
580 mdr->req.defer = idmap_defer; 498
581 ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr); 499 if (ret == -ETIMEDOUT) {
582 if (ret == -EAGAIN) { 500 struct ent *prev_item = *item;
583 wait_event_interruptible_timeout(mdr->waitq, 501 *item = lookup_fn(key);
584 test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ); 502 if (*item != prev_item)
585 ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item); 503 goto retry;
504 cache_put(&(*item)->h, detail);
586 } 505 }
587 put_mdr(mdr);
588 return ret; 506 return ret;
589} 507}
590 508
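The rewritten idmap_lookup() drops the private deferral machinery and leans on cache_check() plus one rule: on -ETIMEDOUT, look the key up again and retry the check only if the lookup produced a different (fresher) entry, so the loop terminates. A toy model of that retry rule (all helpers invented for illustration):

#include <stdio.h>

#define ERR_TIMEDOUT (-110)	/* stand-in for -ETIMEDOUT */

struct ent { int valid; };

static struct ent stale = { 0 }, fresh = { 1 };
static int refreshed;

/* Pretend an upcall replaced the cache entry between calls. */
static struct ent *lookup_fn(void)
{
	return refreshed ? &fresh : &stale;
}

static int cache_check(struct ent *e)
{
	return e->valid ? 0 : ERR_TIMEDOUT;
}

int main(void)
{
	struct ent *item = lookup_fn();
	int ret;
retry:
	ret = cache_check(item);
	if (ret == ERR_TIMEDOUT) {
		struct ent *prev_item = item;

		refreshed = 1;
		item = lookup_fn();
		if (item != prev_item)	/* only retry on a fresher entry */
			goto retry;
	}
	printf("final ret=%d\n", ret);
	return 0;
}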
@@ -597,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
597 return clp->name; 515 return clp->name;
598} 516}
599 517
600static int 518static __be32
601idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 519idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
602 uid_t *id) 520 uid_t *id)
603{ 521{
@@ -607,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
607 int ret; 525 int ret;
608 526
609 if (namelen + 1 > sizeof(key.name)) 527 if (namelen + 1 > sizeof(key.name))
610 return -EINVAL; 528 return nfserr_badowner;
611 memcpy(key.name, name, namelen); 529 memcpy(key.name, name, namelen);
612 key.name[namelen] = '\0'; 530 key.name[namelen] = '\0';
613 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); 531 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
614 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 532 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
615 if (ret == -ENOENT) 533 if (ret == -ENOENT)
616 ret = -ESRCH; /* nfserr_badname */ 534 return nfserr_badowner;
617 if (ret) 535 if (ret)
618 return ret; 536 return nfserrno(ret);
619 *id = item->id; 537 *id = item->id;
620 cache_put(&item->h, &nametoid_cache); 538 cache_put(&item->h, &nametoid_cache);
621 return 0; 539 return 0;
@@ -643,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
643 return ret; 561 return ret;
644} 562}
645 563
646int 564__be32
647nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, 565nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
648 __u32 *id) 566 __u32 *id)
649{ 567{
650 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); 568 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
651} 569}
652 570
653int 571__be32
654nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, 572nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
655 __u32 *id) 573 __u32 *id)
656{ 574{
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
604 return status; 604 return status;
605} 605}
606 606
607static __be32 607static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
608nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
609 void *arg)
610{ 608{
611 struct svc_fh tmp_fh; 609 struct svc_fh tmp_fh;
612 __be32 ret; 610 __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 ret = exp_pseudoroot(rqstp, &tmp_fh); 613 ret = exp_pseudoroot(rqstp, &tmp_fh);
616 if (ret) 614 if (ret)
617 return ret; 615 return ret;
618 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 616 if (tmp_fh.fh_dentry == fh->fh_dentry) {
619 fh_put(&tmp_fh); 617 fh_put(&tmp_fh);
620 return nfserr_noent; 618 return nfserr_noent;
621 } 619 }
622 fh_put(&tmp_fh); 620 fh_put(&tmp_fh);
623 return nfsd_lookup(rqstp, &cstate->current_fh, 621 return nfsd_lookup(rqstp, fh, "..", 2, fh);
624 "..", 2, &cstate->current_fh); 622}
623
624static __be32
625nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
626 void *arg)
627{
628 return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
625} 629}
626 630
627static __be32 631static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
769 } else 773 } else
770 secinfo->si_exp = exp; 774 secinfo->si_exp = exp;
771 dput(dentry); 775 dput(dentry);
776 if (cstate->minorversion)
777 /* See rfc 5661 section 2.6.3.1.1.8 */
778 fh_put(&cstate->current_fh);
772 return err; 779 return err;
773} 780}
774 781
775static __be32 782static __be32
783nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
784 struct nfsd4_secinfo_no_name *sin)
785{
786 __be32 err;
787
788 switch (sin->sin_style) {
789 case NFS4_SECINFO_STYLE4_CURRENT_FH:
790 break;
791 case NFS4_SECINFO_STYLE4_PARENT:
792 err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
793 if (err)
794 return err;
795 break;
796 default:
797 return nfserr_inval;
798 }
799 exp_get(cstate->current_fh.fh_export);
800 sin->sin_exp = cstate->current_fh.fh_export;
801 fh_put(&cstate->current_fh);
802 return nfs_ok;
803}
804
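SECINFO_NO_NAME only differs in how the target filehandle is chosen: style CURRENT_FH uses it as-is, style PARENT first walks to ".." via the shared nfsd4_do_lookupp() helper, and anything else is invalid. A compact sketch of that dispatch (constants and helpers are stand-ins, not the nfsd symbols):

#include <stdio.h>

enum { STYLE_CURRENT_FH, STYLE_PARENT };

static int lookup_parent(int *fh)
{
	*fh -= 1;		/* "walk to .." stand-in */
	return 0;
}

static int secinfo_no_name(int style, int *fh)
{
	switch (style) {
	case STYLE_CURRENT_FH:
		break;
	case STYLE_PARENT:
		if (lookup_parent(fh))
			return -1;
		break;
	default:
		return -22;	/* like nfserr_inval */
	}
	printf("report flavors for fh %d\n", *fh);
	return 0;
}

int main(void)
{
	int fh = 5;

	secinfo_no_name(STYLE_PARENT, &fh);
	secinfo_no_name(99, &fh);	/* rejected */
	return 0;
}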
805static __be32
776nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 806nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
777 struct nfsd4_setattr *setattr) 807 struct nfsd4_setattr *setattr)
778{ 808{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
974 * Also note, enforced elsewhere: 1004 * Also note, enforced elsewhere:
975 * - SEQUENCE other than as first op results in 1005 * - SEQUENCE other than as first op results in
976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 1006 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound 1007 * - BIND_CONN_TO_SESSION must be the only op in its compound.
978 * (Will be enforced in nfsd4_bind_conn_to_session().) 1008 * (Enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if 1009 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same. 1010 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().) 1011 * (Enforced in nfsd4_destroy_session().)
@@ -1031,8 +1061,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1031 resp->cstate.session = NULL; 1061 resp->cstate.session = NULL;
1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1062 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1063 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1034 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1064 /*
1035 rqstp->rq_usedeferral = (args->minorversion == 0); 1065 * Don't use the deferral mechanism for NFSv4; compounds make it
1066 * too hard to avoid non-idempotency problems.
1067 */
1068 rqstp->rq_usedeferral = 0;
1036 1069
1037 /* 1070 /*
1038 * According to RFC3010, this takes precedence over all other errors. 1071 * According to RFC3010, this takes precedence over all other errors.
@@ -1123,10 +1156,6 @@ encode_op:
1123 1156
1124 nfsd4_increment_op_stats(op->opnum); 1157 nfsd4_increment_op_stats(op->opnum);
1125 } 1158 }
1126 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1127 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1128 status = nfserr_jukebox;
1129 }
1130 1159
1131 resp->cstate.status = status; 1160 resp->cstate.status = status;
1132 fh_put(&resp->cstate.current_fh); 1161 fh_put(&resp->cstate.current_fh);
@@ -1297,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1297 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1326 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1298 .op_name = "OP_EXCHANGE_ID", 1327 .op_name = "OP_EXCHANGE_ID",
1299 }, 1328 },
1329 [OP_BIND_CONN_TO_SESSION] = {
1330 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1331 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1332 .op_name = "OP_BIND_CONN_TO_SESSION",
1333 },
1300 [OP_CREATE_SESSION] = { 1334 [OP_CREATE_SESSION] = {
1301 .op_func = (nfsd4op_func)nfsd4_create_session, 1335 .op_func = (nfsd4op_func)nfsd4_create_session,
1302 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1336 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1317,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
1317 .op_flags = ALLOWED_WITHOUT_FH, 1351 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE", 1352 .op_name = "OP_RECLAIM_COMPLETE",
1319 }, 1353 },
1354 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1356 .op_name = "OP_SECINFO_NO_NAME",
1357 },
1320}; 1358};
1321 1359
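New operations slot into nfsd4_ops[] with a single designated initializer keyed by opcode; unset opcodes simply have a NULL op_func. The pattern in miniature (opcodes and handlers invented):

#include <stdio.h>

enum { OP_FOO = 3, OP_BAR = 7, OP_MAX = 8 };

struct op_desc {
	int (*op_func)(void);
	const char *op_name;
};

static int do_foo(void) { return 0; }
static int do_bar(void) { return 0; }

static const struct op_desc ops[OP_MAX] = {
	[OP_FOO] = { .op_func = do_foo, .op_name = "OP_FOO" },
	[OP_BAR] = { .op_func = do_bar, .op_name = "OP_BAR" },
};

int main(void)
{
	for (int i = 0; i < OP_MAX; i++)
		if (ops[i].op_func)		/* gaps stay NULL */
			printf("%d -> %s\n", i, ops[i].op_name);
	return 0;
}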
1322static const char *nfsd4_op_name(unsigned opnum) 1360static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
302{ 302{
303 int status; 303 int status;
304 304
305 /* note: we currently use this path only for minorversion 0 */
306 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 305 if (nfs4_has_reclaimed_state(child->d_name.name, false))
307 return 0; 306 return 0;
308 307
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..d98d0213285d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
33*/ 33*/
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/fs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/swap.h> 39#include <linux/swap.h>
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
207{ 207{
208 struct nfs4_delegation *dp; 208 struct nfs4_delegation *dp;
209 struct nfs4_file *fp = stp->st_file; 209 struct nfs4_file *fp = stp->st_file;
210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
211 210
212 dprintk("NFSD alloc_init_deleg\n"); 211 dprintk("NFSD alloc_init_deleg\n");
213 /* 212 /*
@@ -231,10 +230,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
231 dp->dl_client = clp; 230 dp->dl_client = clp;
232 get_nfs4_file(fp); 231 get_nfs4_file(fp);
233 dp->dl_file = fp; 232 dp->dl_file = fp;
234 nfs4_file_get_access(fp, O_RDONLY); 233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
235 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
236 dp->dl_type = type; 236 dp->dl_type = type;
237 dp->dl_ident = cb->cb_ident;
238 dp->dl_stateid.si_boot = boot_time; 237 dp->dl_stateid.si_boot = boot_time;
239 dp->dl_stateid.si_stateownerid = current_delegid++; 238 dp->dl_stateid.si_stateownerid = current_delegid++;
240 dp->dl_stateid.si_fileid = 0; 239 dp->dl_stateid.si_fileid = 0;
@@ -254,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
254 if (atomic_dec_and_test(&dp->dl_count)) { 253 if (atomic_dec_and_test(&dp->dl_count)) {
255 dprintk("NFSD: freeing dp %p\n",dp); 254 dprintk("NFSD: freeing dp %p\n",dp);
256 put_nfs4_file(dp->dl_file); 255 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
257 kmem_cache_free(deleg_slab, dp); 257 kmem_cache_free(deleg_slab, dp);
258 num_delegations--; 258 num_delegations--;
259 } 259 }
@@ -267,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
267static void 267static void
268nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
269{ 269{
270 struct file *filp = find_readable_file(dp->dl_file);
271
272 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
271 /* XXX: do we even need this check?: */
273 if (dp->dl_flock) 272 if (dp->dl_flock)
274 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
275 nfs4_file_put_access(dp->dl_file, O_RDONLY);
276} 274}
277 275
278/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -535,171 +533,278 @@ gen_sessionid(struct nfsd4_session *ses)
535 */ 533 */
536#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) 534#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
537 535
536static void
537free_session_slots(struct nfsd4_session *ses)
538{
539 int i;
540
541 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
542 kfree(ses->se_slots[i]);
543}
544
538/* 545/*
539 * Give the client the number of ca_maxresponsesize_cached slots it 546 * We don't actually need to cache the rpc and session headers, so we
540 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, 547 * can allocate a little less for each slot:
541 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more 548 */
542 * than NFSD_MAX_SLOTS_PER_SESSION. 549static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
543 * 550{
544 * If we run out of reserved DRC memory we should (up to a point) 551 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
552}
553
554static int nfsd4_sanitize_slot_size(u32 size)
555{
556 size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
557 size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
558
559 return size;
560}
561
562/*
563 * XXX: If we run out of reserved DRC memory we could (up to a point)
545 * re-negotiate active sessions and reduce their slot usage to make 564 * re-negotiate active sessions and reduce their slot usage to make
 546 room for new connections. For now we just fail the create session. 565 room for new connections. For now we just fail the create session.
547 */ 566 */
548static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) 567static int nfsd4_get_drc_mem(int slotsize, u32 num)
549{ 568{
550 int mem, size = fchan->maxresp_cached; 569 int avail;
551 570
552 if (fchan->maxreqs < 1) 571 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
553 return nfserr_inval;
554 572
555 if (size < NFSD_MIN_HDR_SEQ_SZ) 573 spin_lock(&nfsd_drc_lock);
556 size = NFSD_MIN_HDR_SEQ_SZ; 574 avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
557 size -= NFSD_MIN_HDR_SEQ_SZ; 575 nfsd_drc_max_mem - nfsd_drc_mem_used);
558 if (size > NFSD_SLOT_CACHE_SIZE) 576 num = min_t(int, num, avail / slotsize);
559 size = NFSD_SLOT_CACHE_SIZE; 577 nfsd_drc_mem_used += num * slotsize;
560 578 spin_unlock(&nfsd_drc_lock);
561 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ 579
562 mem = fchan->maxreqs * size; 580 return num;
563 if (mem > NFSD_MAX_MEM_PER_SESSION) { 581}
564 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
565 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
566 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
567 mem = fchan->maxreqs * size;
568 }
569 582
583static void nfsd4_put_drc_mem(int slotsize, int num)
584{
570 spin_lock(&nfsd_drc_lock); 585 spin_lock(&nfsd_drc_lock);
 571 /* bound the total session drc memory usage */ 586 nfsd_drc_mem_used -= slotsize * num;
572 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
573 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
574 mem = fchan->maxreqs * size;
575 }
576 nfsd_drc_mem_used += mem;
577 spin_unlock(&nfsd_drc_lock); 587 spin_unlock(&nfsd_drc_lock);
588}
578 589
579 if (fchan->maxreqs == 0) 590static struct nfsd4_session *alloc_session(int slotsize, int numslots)
580 return nfserr_jukebox; 591{
592 struct nfsd4_session *new;
593 int mem, i;
581 594
582 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; 595 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
583 return 0; 596 + sizeof(struct nfsd4_session) > PAGE_SIZE);
597 mem = numslots * sizeof(struct nfsd4_slot *);
598
599 new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
600 if (!new)
601 return NULL;
602 /* allocate each struct nfsd4_slot and data cache in one piece */
603 for (i = 0; i < numslots; i++) {
604 mem = sizeof(struct nfsd4_slot) + slotsize;
605 new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
606 if (!new->se_slots[i])
607 goto out_free;
608 }
609 return new;
610out_free:
611 while (i--)
612 kfree(new->se_slots[i]);
613 kfree(new);
614 return NULL;
584} 615}
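alloc_session()'s failure path is the classic partial-allocation unwind: on a mid-loop failure, walk the index back down and free exactly the slots already allocated. The idiom standalone:

#include <stdio.h>
#include <stdlib.h>

struct session {
	int nslots;
	void *slots[16];
};

static struct session *alloc_session_sketch(int numslots, int slotsize)
{
	struct session *s = calloc(1, sizeof(*s));
	int i;

	if (!s)
		return NULL;
	s->nslots = numslots;
	for (i = 0; i < numslots; i++) {
		s->slots[i] = calloc(1, slotsize);
		if (!s->slots[i])
			goto out_free;
	}
	return s;
out_free:
	while (i--)		/* frees only what was allocated */
		free(s->slots[i]);
	free(s);
	return NULL;
}

int main(void)
{
	struct session *s = alloc_session_sketch(4, 128);

	printf("%s\n", s ? "allocated" : "failed");
	if (s) {
		while (s->nslots--)
			free(s->slots[s->nslots]);
		free(s);
	}
	return 0;
}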
585 616
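nfsd4_get_drc_mem()/nfsd4_put_drc_mem() reduce the old inline arithmetic to a global budget: clamp the request against the per-session cap and whatever is left in the pool, charge it under the spinlock, refund it on teardown. A compact userspace equivalent (constants illustrative):

#include <pthread.h>
#include <stdio.h>

#define MAX_MEM_PER_SESSION 4096

static pthread_mutex_t drc_lock = PTHREAD_MUTEX_INITIALIZER;
static int drc_max_mem = 8192;
static int drc_mem_used;

static int get_drc_mem(int slotsize, int num)
{
	int avail;

	pthread_mutex_lock(&drc_lock);
	avail = drc_max_mem - drc_mem_used;
	if (avail > MAX_MEM_PER_SESSION)
		avail = MAX_MEM_PER_SESSION;
	if (num > avail / slotsize)
		num = avail / slotsize;		/* may grant fewer slots */
	drc_mem_used += num * slotsize;
	pthread_mutex_unlock(&drc_lock);
	return num;
}

static void put_drc_mem(int slotsize, int num)
{
	pthread_mutex_lock(&drc_lock);
	drc_mem_used -= slotsize * num;
	pthread_mutex_unlock(&drc_lock);
}

int main(void)
{
	int got = get_drc_mem(512, 100);

	printf("granted %d slots\n", got);
	put_drc_mem(512, got);
	return 0;
}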
586/* 617static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
587 * fchan holds the client values on input, and the server values on output
588 * sv_max_mesg is the maximum payload plus one page for overhead.
589 */
590static int init_forechannel_attrs(struct svc_rqst *rqstp,
591 struct nfsd4_channel_attrs *session_fchan,
592 struct nfsd4_channel_attrs *fchan)
593{ 618{
594 int status = 0; 619 u32 maxrpc = nfsd_serv->sv_max_mesg;
595 __u32 maxcount = nfsd_serv->sv_max_mesg;
596 620
597 /* headerpadsz set to zero in encode routine */ 621 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
626}
598 627
599 /* Use the client's max request and max response size if possible */ 628static void free_conn(struct nfsd4_conn *c)
600 if (fchan->maxreq_sz > maxcount) 629{
601 fchan->maxreq_sz = maxcount; 630 svc_xprt_put(c->cn_xprt);
602 session_fchan->maxreq_sz = fchan->maxreq_sz; 631 kfree(c);
632}
603 633
604 if (fchan->maxresp_sz > maxcount) 634static void nfsd4_conn_lost(struct svc_xpt_user *u)
605 fchan->maxresp_sz = maxcount; 635{
606 session_fchan->maxresp_sz = fchan->maxresp_sz; 636 struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
637 struct nfs4_client *clp = c->cn_session->se_client;
607 638
608 /* Use the client's maxops if possible */ 639 spin_lock(&clp->cl_lock);
609 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 640 if (!list_empty(&c->cn_persession)) {
610 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 641 list_del(&c->cn_persession);
611 session_fchan->maxops = fchan->maxops; 642 free_conn(c);
643 }
644 spin_unlock(&clp->cl_lock);
645 nfsd4_probe_callback(clp);
646}
612 647
613 /* FIXME: Error means no more DRC pages so the server should 648static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
614 * recover pages from existing sessions. For now fail session 649{
615 * creation. 650 struct nfsd4_conn *conn;
616 */
617 status = set_forechannel_drc_size(fchan);
618 651
619 session_fchan->maxresp_cached = fchan->maxresp_cached; 652 conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
620 session_fchan->maxreqs = fchan->maxreqs; 653 if (!conn)
654 return NULL;
655 svc_xprt_get(rqstp->rq_xprt);
656 conn->cn_xprt = rqstp->rq_xprt;
657 conn->cn_flags = flags;
658 INIT_LIST_HEAD(&conn->cn_xpt_user.list);
659 return conn;
660}
621 661
622 dprintk("%s status %d\n", __func__, status); 662static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
623 return status; 663{
664 conn->cn_session = ses;
665 list_add(&conn->cn_persession, &ses->se_conns);
624} 666}
625 667
626static void 668static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
627free_session_slots(struct nfsd4_session *ses)
628{ 669{
629 int i; 670 struct nfs4_client *clp = ses->se_client;
630 671
631 for (i = 0; i < ses->se_fchannel.maxreqs; i++) 672 spin_lock(&clp->cl_lock);
632 kfree(ses->se_slots[i]); 673 __nfsd4_hash_conn(conn, ses);
674 spin_unlock(&clp->cl_lock);
633} 675}
634 676
635/* 677static int nfsd4_register_conn(struct nfsd4_conn *conn)
636 * We don't actually need to cache the rpc and session headers, so we
637 * can allocate a little less for each slot:
638 */
639static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
640{ 678{
641 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; 679 conn->cn_xpt_user.callback = nfsd4_conn_lost;
680 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
642} 681}
643 682
644static int 683static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
645alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
646 struct nfsd4_create_session *cses)
647{ 684{
648 struct nfsd4_session *new, tmp; 685 struct nfsd4_conn *conn;
649 struct nfsd4_slot *sp; 686 int ret;
650 int idx, slotsize, cachesize, i;
651 int status;
652 687
653 memset(&tmp, 0, sizeof(tmp)); 688 conn = alloc_conn(rqstp, dir);
689 if (!conn)
690 return nfserr_jukebox;
691 nfsd4_hash_conn(conn, ses);
692 ret = nfsd4_register_conn(conn);
693 if (ret)
694 /* oops; xprt is already down: */
695 nfsd4_conn_lost(&conn->cn_xpt_user);
696 return nfs_ok;
697}
654 698
655 /* FIXME: For now, we just accept the client back channel attributes. */ 699static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
656 tmp.se_bchannel = cses->back_channel; 700{
657 status = init_forechannel_attrs(rqstp, &tmp.se_fchannel, 701 u32 dir = NFS4_CDFC4_FORE;
658 &cses->fore_channel);
659 if (status)
660 goto out;
661 702
662 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) 703 if (ses->se_flags & SESSION4_BACK_CHAN)
663 + sizeof(struct nfsd4_session) > PAGE_SIZE); 704 dir |= NFS4_CDFC4_BACK;
664 705
665 status = nfserr_jukebox; 706 return nfsd4_new_conn(rqstp, ses, dir);
666 /* allocate struct nfsd4_session and slot table pointers in one piece */ 707}
667 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
668 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
669 if (!new)
670 goto out;
671 708
672 memcpy(new, &tmp, sizeof(*new)); 709/* must be called under client_lock */
710static void nfsd4_del_conns(struct nfsd4_session *s)
711{
712 struct nfs4_client *clp = s->se_client;
713 struct nfsd4_conn *c;
673 714
674 /* allocate each struct nfsd4_slot and data cache in one piece */ 715 spin_lock(&clp->cl_lock);
675 cachesize = slot_bytes(&new->se_fchannel); 716 while (!list_empty(&s->se_conns)) {
676 for (i = 0; i < new->se_fchannel.maxreqs; i++) { 717 c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
677 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); 718 list_del_init(&c->cn_persession);
678 if (!sp) 719 spin_unlock(&clp->cl_lock);
679 goto out_free; 720
680 new->se_slots[i] = sp; 721 unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
722 free_conn(c);
723
724 spin_lock(&clp->cl_lock);
681 } 725 }
726 spin_unlock(&clp->cl_lock);
727}
728
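nfsd4_del_conns() drains the connection list with the standard lock-drop loop: unlink one entry while holding cl_lock, release the lock for the slow teardown (the entry is now private), then retake it for the next. Sketch with userspace stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
	int id;
	struct conn *next;
};

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct conn *conns;

static void slow_teardown(struct conn *c)
{
	/* must not hold cl_lock, like unregister_xpt_user()/free_conn() */
	printf("tearing down conn %d\n", c->id);
	free(c);
}

static void del_conns(void)
{
	pthread_mutex_lock(&cl_lock);
	while (conns) {
		struct conn *c = conns;

		conns = c->next;	/* unlink while locked */
		pthread_mutex_unlock(&cl_lock);
		slow_teardown(c);	/* c is private now */
		pthread_mutex_lock(&cl_lock);
	}
	pthread_mutex_unlock(&cl_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct conn *c = malloc(sizeof(*c));

		c->id = i;
		c->next = conns;
		conns = c;
	}
	del_conns();
	return 0;
}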
729void free_session(struct kref *kref)
730{
731 struct nfsd4_session *ses;
732 int mem;
733
734 ses = container_of(kref, struct nfsd4_session, se_ref);
735 nfsd4_del_conns(ses);
736 spin_lock(&nfsd_drc_lock);
737 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
738 nfsd_drc_mem_used -= mem;
739 spin_unlock(&nfsd_drc_lock);
740 free_session_slots(ses);
741 kfree(ses);
742}
743
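free_session() is a kref release function: it never runs directly, only when the last nfsd4_put_session() drops the embedded refcount to zero. The shape of that contract in a single-threaded toy (the real kref uses atomics):

#include <stdio.h>

struct kref { int count; };

static void kref_init(struct kref *k)
{
	k->count = 1;
}

static void kref_put(struct kref *k, void (*release)(struct kref *))
{
	if (--k->count == 0)	/* last put triggers the release */
		release(k);
}

struct session { struct kref se_ref; };

static void release_session(struct kref *k)
{
	/* real code does container_of(k, struct session, se_ref) */
	puts("tearing down session");
}

int main(void)
{
	struct session s;

	kref_init(&s.se_ref);
	kref_put(&s.se_ref, release_session);
	return 0;
}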
744static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
745{
746 struct nfsd4_session *new;
747 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
748 int numslots, slotsize;
749 int status;
750 int idx;
751
752 /*
753 * Note decreasing slot size below client's request may
754 * make it difficult for client to function correctly, whereas
755 * decreasing the number of slots will (just?) affect
756 * performance. When short on memory we therefore prefer to
757 * decrease number of slots instead of their size.
758 */
759 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
760 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
761 if (numslots < 1)
762 return NULL;
763
764 new = alloc_session(slotsize, numslots);
765 if (!new) {
766 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
767 return NULL;
768 }
769 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
682 770
683 new->se_client = clp; 771 new->se_client = clp;
684 gen_sessionid(new); 772 gen_sessionid(new);
685 idx = hash_sessionid(&new->se_sessionid);
686 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
687 NFS4_MAX_SESSIONID_LEN);
688 773
774 INIT_LIST_HEAD(&new->se_conns);
775
776 new->se_cb_seq_nr = 1;
689 new->se_flags = cses->flags; 777 new->se_flags = cses->flags;
778 new->se_cb_prog = cses->callback_prog;
690 kref_init(&new->se_ref); 779 kref_init(&new->se_ref);
780 idx = hash_sessionid(&new->se_sessionid);
691 spin_lock(&client_lock); 781 spin_lock(&client_lock);
692 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 782 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
783 spin_lock(&clp->cl_lock);
693 list_add(&new->se_perclnt, &clp->cl_sessions); 784 list_add(&new->se_perclnt, &clp->cl_sessions);
785 spin_unlock(&clp->cl_lock);
694 spin_unlock(&client_lock); 786 spin_unlock(&client_lock);
695 787
696 status = nfs_ok; 788 status = nfsd4_new_conn_from_crses(rqstp, new);
697out: 789 /* whoops: benny points out, status is ignored! (err, or bogus) */
698 return status; 790 if (status) {
699out_free: 791 free_session(&new->se_ref);
700 free_session_slots(new); 792 return NULL;
701 kfree(new); 793 }
702 goto out; 794 if (cses->flags & SESSION4_BACK_CHAN) {
795 struct sockaddr *sa = svc_addr(rqstp);
796 /*
797 * This is a little silly; with sessions there's no real
798 * use for the callback address. Use the peer address
799 * as a reasonable default for now, but consider fixing
800 * the rpc client not to require an address in the
801 * future:
802 */
803 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
804 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
805 }
806 nfsd4_probe_callback(clp);
807 return new;
703} 808}
704 809
705/* caller must hold client_lock */ 810/* caller must hold client_lock */
@@ -728,22 +833,9 @@ static void
728unhash_session(struct nfsd4_session *ses) 833unhash_session(struct nfsd4_session *ses)
729{ 834{
730 list_del(&ses->se_hash); 835 list_del(&ses->se_hash);
836 spin_lock(&ses->se_client->cl_lock);
731 list_del(&ses->se_perclnt); 837 list_del(&ses->se_perclnt);
732} 838 spin_unlock(&ses->se_client->cl_lock);
733
734void
735free_session(struct kref *kref)
736{
737 struct nfsd4_session *ses;
738 int mem;
739
740 ses = container_of(kref, struct nfsd4_session, se_ref);
741 spin_lock(&nfsd_drc_lock);
742 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
743 nfsd_drc_mem_used -= mem;
744 spin_unlock(&nfsd_drc_lock);
745 free_session_slots(ses);
746 kfree(ses);
747} 839}
748 840
749/* must be called under the client_lock */ 841/* must be called under the client_lock */
@@ -812,6 +904,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
812static inline void 904static inline void
813free_client(struct nfs4_client *clp) 905free_client(struct nfs4_client *clp)
814{ 906{
907 while (!list_empty(&clp->cl_sessions)) {
908 struct nfsd4_session *ses;
909 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
910 se_perclnt);
911 list_del(&ses->se_perclnt);
912 nfsd4_put_session(ses);
913 }
815 if (clp->cl_cred.cr_group_info) 914 if (clp->cl_cred.cr_group_info)
816 put_group_info(clp->cl_cred.cr_group_info); 915 put_group_info(clp->cl_cred.cr_group_info);
817 kfree(clp->cl_principal); 916 kfree(clp->cl_principal);
@@ -838,15 +937,14 @@ release_session_client(struct nfsd4_session *session)
838static inline void 937static inline void
839unhash_client_locked(struct nfs4_client *clp) 938unhash_client_locked(struct nfs4_client *clp)
840{ 939{
940 struct nfsd4_session *ses;
941
841 mark_client_expired(clp); 942 mark_client_expired(clp);
842 list_del(&clp->cl_lru); 943 list_del(&clp->cl_lru);
843 while (!list_empty(&clp->cl_sessions)) { 944 spin_lock(&clp->cl_lock);
844 struct nfsd4_session *ses; 945 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
845 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 946 list_del_init(&ses->se_hash);
846 se_perclnt); 947 spin_unlock(&clp->cl_lock);
847 unhash_session(ses);
848 nfsd4_put_session(ses);
849 }
850} 948}
851 949
852static void 950static void
@@ -875,7 +973,7 @@ expire_client(struct nfs4_client *clp)
875 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 973 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
876 release_openowner(sop); 974 release_openowner(sop);
877 } 975 }
878 nfsd4_set_callback_client(clp, NULL); 976 nfsd4_shutdown_callback(clp);
879 if (clp->cl_cb_conn.cb_xprt) 977 if (clp->cl_cb_conn.cb_xprt)
880 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 978 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
881 list_del(&clp->cl_idhash); 979 list_del(&clp->cl_idhash);
@@ -960,6 +1058,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
960 if (clp == NULL) 1058 if (clp == NULL)
961 return NULL; 1059 return NULL;
962 1060
1061 INIT_LIST_HEAD(&clp->cl_sessions);
1062
963 princ = svc_gss_principal(rqstp); 1063 princ = svc_gss_principal(rqstp);
964 if (princ) { 1064 if (princ) {
965 clp->cl_principal = kstrdup(princ, GFP_KERNEL); 1065 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -971,13 +1071,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
971 1071
972 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1072 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
973 atomic_set(&clp->cl_refcount, 0); 1073 atomic_set(&clp->cl_refcount, 0);
974 atomic_set(&clp->cl_cb_set, 0); 1074 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
975 INIT_LIST_HEAD(&clp->cl_idhash); 1075 INIT_LIST_HEAD(&clp->cl_idhash);
976 INIT_LIST_HEAD(&clp->cl_strhash); 1076 INIT_LIST_HEAD(&clp->cl_strhash);
977 INIT_LIST_HEAD(&clp->cl_openowners); 1077 INIT_LIST_HEAD(&clp->cl_openowners);
978 INIT_LIST_HEAD(&clp->cl_delegations); 1078 INIT_LIST_HEAD(&clp->cl_delegations);
979 INIT_LIST_HEAD(&clp->cl_sessions);
980 INIT_LIST_HEAD(&clp->cl_lru); 1079 INIT_LIST_HEAD(&clp->cl_lru);
1080 INIT_LIST_HEAD(&clp->cl_callbacks);
1081 spin_lock_init(&clp->cl_lock);
1082 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
981 clp->cl_time = get_seconds(); 1083 clp->cl_time = get_seconds();
982 clear_bit(0, &clp->cl_cb_slot_busy); 1084 clear_bit(0, &clp->cl_cb_slot_busy);
983 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1085 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -986,7 +1088,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
986 clp->cl_flavor = rqstp->rq_flavor; 1088 clp->cl_flavor = rqstp->rq_flavor;
987 copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1089 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
988 gen_confirm(clp); 1090 gen_confirm(clp);
989 1091 clp->cl_cb_session = NULL;
990 return clp; 1092 return clp;
991} 1093}
992 1094
@@ -1051,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
1051 return NULL; 1153 return NULL;
1052} 1154}
1053 1155
1054/* 1156static bool clp_used_exchangeid(struct nfs4_client *clp)
1055 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
 1056 * parameter. Matching is based on the fact that at least one of the
1057 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
1058 *
1059 * FIXME: we need to unify the clientid namespaces for nfsv4.x
1060 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
1061 * and SET_CLIENTID{,_CONFIRM}
1062 */
1063static inline int
1064match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
1065{ 1157{
1066 bool has_exchange_flags = (clp->cl_exchange_flags != 0); 1158 return clp->cl_exchange_flags != 0;
1067 return use_exchange_id == has_exchange_flags; 1159}
1068}
1069 1160
1070static struct nfs4_client * 1161static struct nfs4_client *
1071find_confirmed_client_by_str(const char *dname, unsigned int hashval, 1162find_confirmed_client_by_str(const char *dname, unsigned int hashval)
1072 bool use_exchange_id)
1073{ 1163{
1074 struct nfs4_client *clp; 1164 struct nfs4_client *clp;
1075 1165
1076 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 1166 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1077 if (same_name(clp->cl_recdir, dname) && 1167 if (same_name(clp->cl_recdir, dname))
1078 match_clientid_establishment(clp, use_exchange_id))
1079 return clp; 1168 return clp;
1080 } 1169 }
1081 return NULL; 1170 return NULL;
1082} 1171}
1083 1172
1084static struct nfs4_client * 1173static struct nfs4_client *
1085find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, 1174find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1086 bool use_exchange_id)
1087{ 1175{
1088 struct nfs4_client *clp; 1176 struct nfs4_client *clp;
1089 1177
1090 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 1178 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1091 if (same_name(clp->cl_recdir, dname) && 1179 if (same_name(clp->cl_recdir, dname))
1092 match_clientid_establishment(clp, use_exchange_id))
1093 return clp; 1180 return clp;
1094 } 1181 }
1095 return NULL; 1182 return NULL;
1096} 1183}
1097 1184
1185static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1186{
1187 switch (family) {
1188 case AF_INET:
1189 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1190 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1191 return;
1192 case AF_INET6:
1193 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1194 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1195 return;
1196 }
1197}
1198
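rpc_svcaddr2sockaddr() is a plain family switch that casts the generic sockaddr to the right concrete type before filling it in. The same cast-by-family idiom as a standalone program (standard socket types only):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static void fill_sockaddr(struct sockaddr *sa, unsigned short family,
			  const void *addr)
{
	switch (family) {
	case AF_INET:
		((struct sockaddr_in *)sa)->sin_family = AF_INET;
		memcpy(&((struct sockaddr_in *)sa)->sin_addr, addr,
		       sizeof(struct in_addr));
		return;
	case AF_INET6:
		((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
		memcpy(&((struct sockaddr_in6 *)sa)->sin6_addr, addr,
		       sizeof(struct in6_addr));
		return;
	}
}

int main(void)
{
	struct sockaddr_storage ss;
	struct in_addr v4;
	char buf[INET_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	fill_sockaddr((struct sockaddr *)&ss, AF_INET, &v4);
	inet_ntop(AF_INET, &((struct sockaddr_in *)&ss)->sin_addr, buf, sizeof(buf));
	printf("filled: %s\n", buf);
	return 0;
}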
1098static void 1199static void
1099gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1200gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1100{ 1201{
1101 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 1202 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1203 struct sockaddr *sa = svc_addr(rqstp);
1204 u32 scopeid = rpc_get_scope_id(sa);
1102 unsigned short expected_family; 1205 unsigned short expected_family;
1103 1206
1104 /* Currently, we only support tcp and tcp6 for the callback channel */ 1207 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1214,24 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1111 else 1214 else
1112 goto out_err; 1215 goto out_err;
1113 1216
1114 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, 1217 conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
1115 se->se_callback_addr_len, 1218 se->se_callback_addr_len,
1116 (struct sockaddr *) &cb->cb_addr, 1219 (struct sockaddr *)&conn->cb_addr,
1117 sizeof(cb->cb_addr)); 1220 sizeof(conn->cb_addr));
1118 1221
1119 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) 1222 if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
1120 goto out_err; 1223 goto out_err;
1121 1224
1122 if (cb->cb_addr.ss_family == AF_INET6) 1225 if (conn->cb_addr.ss_family == AF_INET6)
1123 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; 1226 ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
1124 1227
1125 cb->cb_minorversion = 0; 1228 conn->cb_prog = se->se_callback_prog;
1126 cb->cb_prog = se->se_callback_prog; 1229 conn->cb_ident = se->se_callback_ident;
1127 cb->cb_ident = se->se_callback_ident; 1230 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
1128 return; 1231 return;
1129out_err: 1232out_err:
1130 cb->cb_addr.ss_family = AF_UNSPEC; 1233 conn->cb_addr.ss_family = AF_UNSPEC;
1131 cb->cb_addrlen = 0; 1234 conn->cb_addrlen = 0;
1132 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1235 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
1133 "will not receive delegations\n", 1236 "will not receive delegations\n",
1134 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1237 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1264,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1264 case SP4_NONE: 1367 case SP4_NONE:
1265 break; 1368 break;
1266 case SP4_SSV: 1369 case SP4_SSV:
1267 return nfserr_encr_alg_unsupp; 1370 return nfserr_serverfault;
1268 default: 1371 default:
1269 BUG(); /* checked by xdr code */ 1372 BUG(); /* checked by xdr code */
1270 case SP4_MACH_CRED: 1373 case SP4_MACH_CRED:
@@ -1281,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1281 nfs4_lock_state(); 1384 nfs4_lock_state();
1282 status = nfs_ok; 1385 status = nfs_ok;
1283 1386
1284 conf = find_confirmed_client_by_str(dname, strhashval, true); 1387 conf = find_confirmed_client_by_str(dname, strhashval);
1285 if (conf) { 1388 if (conf) {
1389 if (!clp_used_exchangeid(conf)) {
1390 status = nfserr_clid_inuse; /* XXX: ? */
1391 goto out;
1392 }
1286 if (!same_verf(&verf, &conf->cl_verifier)) { 1393 if (!same_verf(&verf, &conf->cl_verifier)) {
1287 /* 18.35.4 case 8 */ 1394 /* 18.35.4 case 8 */
1288 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1395 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1323,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1323 goto out; 1430 goto out;
1324 } 1431 }
1325 1432
1326 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1433 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1327 if (unconf) { 1434 if (unconf) {
1328 /* 1435 /*
1329 * Possible retry or client restart. Per 18.35.4 case 4, 1436 * Possible retry or client restart. Per 18.35.4 case 4,
@@ -1415,7 +1522,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1415{ 1522{
1416 struct sockaddr *sa = svc_addr(rqstp); 1523 struct sockaddr *sa = svc_addr(rqstp);
1417 struct nfs4_client *conf, *unconf; 1524 struct nfs4_client *conf, *unconf;
1525 struct nfsd4_session *new;
1418 struct nfsd4_clid_slot *cs_slot = NULL; 1526 struct nfsd4_clid_slot *cs_slot = NULL;
1527 bool confirm_me = false;
1419 int status = 0; 1528 int status = 0;
1420 1529
1421 nfs4_lock_state(); 1530 nfs4_lock_state();
@@ -1438,7 +1547,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1438 cs_slot->sl_seqid, cr_ses->seqid); 1547 cs_slot->sl_seqid, cr_ses->seqid);
1439 goto out; 1548 goto out;
1440 } 1549 }
1441 cs_slot->sl_seqid++;
1442 } else if (unconf) { 1550 } else if (unconf) {
1443 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1551 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1444 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1552 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1451,25 +1559,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1451 if (status) { 1559 if (status) {
1452 /* an unconfirmed replay returns misordered */ 1560 /* an unconfirmed replay returns misordered */
1453 status = nfserr_seq_misordered; 1561 status = nfserr_seq_misordered;
1454 goto out_cache; 1562 goto out;
1455 } 1563 }
1456 1564
1457 cs_slot->sl_seqid++; /* from 0 to 1 */ 1565 confirm_me = true;
1458 move_to_confirmed(unconf);
1459
1460 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1461 unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
1462 svc_xprt_get(rqstp->rq_xprt);
1463 rpc_copy_addr(
1464 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1465 sa);
1466 unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
1467 unconf->cl_cb_conn.cb_minorversion =
1468 cstate->minorversion;
1469 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1470 unconf->cl_cb_seq_nr = 1;
1471 nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
1472 }
1473 conf = unconf; 1566 conf = unconf;
1474 } else { 1567 } else {
1475 status = nfserr_stale_clientid; 1568 status = nfserr_stale_clientid;
@@ -1477,22 +1570,32 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1477 } 1570 }
1478 1571
1479 /* 1572 /*
1573 * XXX: we should probably set this at creation time, and check
1574 * for consistent minorversion use throughout:
1575 */
1576 conf->cl_minorversion = 1;
1577 /*
1480 * We do not support RDMA or persistent sessions 1578 * We do not support RDMA or persistent sessions
1481 */ 1579 */
1482 cr_ses->flags &= ~SESSION4_PERSIST; 1580 cr_ses->flags &= ~SESSION4_PERSIST;
1483 cr_ses->flags &= ~SESSION4_RDMA; 1581 cr_ses->flags &= ~SESSION4_RDMA;
1484 1582
1485 status = alloc_init_session(rqstp, conf, cr_ses); 1583 status = nfserr_jukebox;
1486 if (status) 1584 new = alloc_init_session(rqstp, conf, cr_ses);
1585 if (!new)
1487 goto out; 1586 goto out;
1488 1587 status = nfs_ok;
1489 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, 1588 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1490 NFS4_MAX_SESSIONID_LEN); 1589 NFS4_MAX_SESSIONID_LEN);
1590 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
1591 sizeof(struct nfsd4_channel_attrs));
1592 cs_slot->sl_seqid++;
1491 cr_ses->seqid = cs_slot->sl_seqid; 1593 cr_ses->seqid = cs_slot->sl_seqid;
1492 1594
1493out_cache:
1494 /* cache solo and embedded create sessions under the state lock */ 1595 /* cache solo and embedded create sessions under the state lock */
1495 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1596 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1597 if (confirm_me)
1598 move_to_confirmed(conf);
1496out: 1599out:
1497 nfs4_unlock_state(); 1600 nfs4_unlock_state();
1498 dprintk("%s returns %d\n", __func__, ntohl(status)); 1601 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1507,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1507 return argp->opcnt == resp->opcnt; 1610 return argp->opcnt == resp->opcnt;
1508} 1611}
1509 1612
1613static __be32 nfsd4_map_bcts_dir(u32 *dir)
1614{
1615 switch (*dir) {
1616 case NFS4_CDFC4_FORE:
1617 case NFS4_CDFC4_BACK:
1618 return nfs_ok;
1619 case NFS4_CDFC4_FORE_OR_BOTH:
1620 case NFS4_CDFC4_BACK_OR_BOTH:
1621 *dir = NFS4_CDFC4_BOTH;
1622 return nfs_ok;
1623 };
1624 return nfserr_inval;
1625}
1626
1627__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1628 struct nfsd4_compound_state *cstate,
1629 struct nfsd4_bind_conn_to_session *bcts)
1630{
1631 __be32 status;
1632
1633 if (!nfsd4_last_compound_op(rqstp))
1634 return nfserr_not_only_op;
1635 spin_lock(&client_lock);
1636 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
1637 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
 1639	 * client_lock itself: */
1639 if (cstate->session) {
1640 nfsd4_get_session(cstate->session);
1641 atomic_inc(&cstate->session->se_client->cl_refcount);
1642 }
1643 spin_unlock(&client_lock);
1644 if (!cstate->session)
1645 return nfserr_badsession;
1646
1647 status = nfsd4_map_bcts_dir(&bcts->dir);
1648 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1649 return nfs_ok;
1650}
1651
1510static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1652static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1511{ 1653{
1512 if (!session) 1654 if (!session)
@@ -1545,9 +1687,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
1545 spin_unlock(&client_lock); 1687 spin_unlock(&client_lock);
1546 1688
1547 nfs4_lock_state(); 1689 nfs4_lock_state();
1548 /* wait for callbacks */ 1690 nfsd4_probe_callback_sync(ses->se_client);
1549 nfsd4_set_callback_client(ses->se_client, NULL);
1550 nfs4_unlock_state(); 1691 nfs4_unlock_state();
1692
1693 nfsd4_del_conns(ses);
1694
1551 nfsd4_put_session(ses); 1695 nfsd4_put_session(ses);
1552 status = nfs_ok; 1696 status = nfs_ok;
1553out: 1697out:
@@ -1555,6 +1699,40 @@ out:
1555 return status; 1699 return status;
1556} 1700}
1557 1701
1702static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
1703{
1704 struct nfsd4_conn *c;
1705
1706 list_for_each_entry(c, &s->se_conns, cn_persession) {
1707 if (c->cn_xprt == xpt) {
1708 return c;
1709 }
1710 }
1711 return NULL;
1712}
1713
1714static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
1715{
1716 struct nfs4_client *clp = ses->se_client;
1717 struct nfsd4_conn *c;
1718 int ret;
1719
1720 spin_lock(&clp->cl_lock);
1721 c = __nfsd4_find_conn(new->cn_xprt, ses);
1722 if (c) {
1723 spin_unlock(&clp->cl_lock);
1724 free_conn(new);
1725 return;
1726 }
1727 __nfsd4_hash_conn(new, ses);
1728 spin_unlock(&clp->cl_lock);
1729 ret = nfsd4_register_conn(new);
1730 if (ret)
1731 /* oops; xprt is already down: */
1732 nfsd4_conn_lost(&new->cn_xpt_user);
1733 return;
1734}
1735
1558__be32 1736__be32
1559nfsd4_sequence(struct svc_rqst *rqstp, 1737nfsd4_sequence(struct svc_rqst *rqstp,
1560 struct nfsd4_compound_state *cstate, 1738 struct nfsd4_compound_state *cstate,
@@ -1563,11 +1741,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1563 struct nfsd4_compoundres *resp = rqstp->rq_resp; 1741 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1564 struct nfsd4_session *session; 1742 struct nfsd4_session *session;
1565 struct nfsd4_slot *slot; 1743 struct nfsd4_slot *slot;
1744 struct nfsd4_conn *conn;
1566 int status; 1745 int status;
1567 1746
1568 if (resp->opcnt != 1) 1747 if (resp->opcnt != 1)
1569 return nfserr_sequence_pos; 1748 return nfserr_sequence_pos;
1570 1749
1750 /*
1751 * Will be either used or freed by nfsd4_sequence_check_conn
1752 * below.
1753 */
1754 conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
1755 if (!conn)
1756 return nfserr_jukebox;
1757
1571 spin_lock(&client_lock); 1758 spin_lock(&client_lock);
1572 status = nfserr_badsession; 1759 status = nfserr_badsession;
1573 session = find_in_sessionid_hashtbl(&seq->sessionid); 1760 session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1599,6 +1786,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1599 if (status) 1786 if (status)
1600 goto out; 1787 goto out;
1601 1788
1789 nfsd4_sequence_check_conn(conn, session);
1790 conn = NULL;
1791
1602 /* Success! bump slot seqid */ 1792 /* Success! bump slot seqid */
1603 slot->sl_inuse = true; 1793 slot->sl_inuse = true;
1604 slot->sl_seqid = seq->seqid; 1794 slot->sl_seqid = seq->seqid;
@@ -1610,9 +1800,14 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1610out: 1800out:
1611 /* Hold a session reference until done processing the compound. */ 1801 /* Hold a session reference until done processing the compound. */
1612 if (cstate->session) { 1802 if (cstate->session) {
1803 struct nfs4_client *clp = session->se_client;
1804
1613 nfsd4_get_session(cstate->session); 1805 nfsd4_get_session(cstate->session);
1614 atomic_inc(&session->se_client->cl_refcount); 1806 atomic_inc(&clp->cl_refcount);
1807 if (clp->cl_cb_state == NFSD4_CB_DOWN)
1808 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
1615 } 1809 }
1810 kfree(conn);
1616 spin_unlock(&client_lock); 1811 spin_unlock(&client_lock);
1617 dprintk("%s: return %d\n", __func__, ntohl(status)); 1812 dprintk("%s: return %d\n", __func__, ntohl(status));
1618 return status; 1813 return status;
@@ -1651,7 +1846,6 @@ __be32
1651nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1846nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1652 struct nfsd4_setclientid *setclid) 1847 struct nfsd4_setclientid *setclid)
1653{ 1848{
1654 struct sockaddr *sa = svc_addr(rqstp);
1655 struct xdr_netobj clname = { 1849 struct xdr_netobj clname = {
1656 .len = setclid->se_namelen, 1850 .len = setclid->se_namelen,
1657 .data = setclid->se_name, 1851 .data = setclid->se_name,
@@ -1677,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1677 strhashval = clientstr_hashval(dname); 1871 strhashval = clientstr_hashval(dname);
1678 1872
1679 nfs4_lock_state(); 1873 nfs4_lock_state();
1680 conf = find_confirmed_client_by_str(dname, strhashval, false); 1874 conf = find_confirmed_client_by_str(dname, strhashval);
1681 if (conf) { 1875 if (conf) {
1682 /* RFC 3530 14.2.33 CASE 0: */ 1876 /* RFC 3530 14.2.33 CASE 0: */
1683 status = nfserr_clid_inuse; 1877 status = nfserr_clid_inuse;
1878 if (clp_used_exchangeid(conf))
1879 goto out;
1684 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1880 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1685 char addr_str[INET6_ADDRSTRLEN]; 1881 char addr_str[INET6_ADDRSTRLEN];
1686 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, 1882 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1695,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1695 * has a description of SETCLIENTID request processing consisting 1891 * has a description of SETCLIENTID request processing consisting
1696 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1892 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1697 */ 1893 */
1698 unconf = find_unconfirmed_client_by_str(dname, strhashval, false); 1894 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1699 status = nfserr_resource; 1895 status = nfserr_resource;
1700 if (!conf) { 1896 if (!conf) {
1701 /* 1897 /*
@@ -1747,7 +1943,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1747 goto out; 1943 goto out;
1748 gen_clid(new); 1944 gen_clid(new);
1749 } 1945 }
1750 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1946 /*
1947 * XXX: we should probably set this at creation time, and check
1948 * for consistent minorversion use throughout:
1949 */
1950 new->cl_minorversion = 0;
1951 gen_callback(new, setclid, rqstp);
1751 add_to_unconfirmed(new, strhashval); 1952 add_to_unconfirmed(new, strhashval);
1752 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1953 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1753 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1954 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1806,8 +2007,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1806 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2007 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1807 status = nfserr_clid_inuse; 2008 status = nfserr_clid_inuse;
1808 else { 2009 else {
1809 atomic_set(&conf->cl_cb_set, 0); 2010 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1810 nfsd4_probe_callback(conf, &unconf->cl_cb_conn); 2011 nfsd4_probe_callback(conf);
1811 expire_client(unconf); 2012 expire_client(unconf);
1812 status = nfs_ok; 2013 status = nfs_ok;
1813 2014
@@ -1834,14 +2035,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1834 unsigned int hash = 2035 unsigned int hash =
1835 clientstr_hashval(unconf->cl_recdir); 2036 clientstr_hashval(unconf->cl_recdir);
1836 conf = find_confirmed_client_by_str(unconf->cl_recdir, 2037 conf = find_confirmed_client_by_str(unconf->cl_recdir,
1837 hash, false); 2038 hash);
1838 if (conf) { 2039 if (conf) {
1839 nfsd4_remove_clid_dir(conf); 2040 nfsd4_remove_clid_dir(conf);
1840 expire_client(conf); 2041 expire_client(conf);
1841 } 2042 }
1842 move_to_confirmed(unconf); 2043 move_to_confirmed(unconf);
1843 conf = unconf; 2044 conf = unconf;
1844 nfsd4_probe_callback(conf, &conf->cl_cb_conn); 2045 nfsd4_probe_callback(conf);
1845 status = nfs_ok; 2046 status = nfs_ok;
1846 } 2047 }
1847 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 2048 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -2132,7 +2333,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2132 * Spawn a thread to perform a recall on the delegation represented 2333 * Spawn a thread to perform a recall on the delegation represented
2133 * by the lease (file_lock) 2334 * by the lease (file_lock)
2134 * 2335 *
2135 * Called from break_lease() with lock_kernel() held. 2336 * Called from break_lease() with lock_flocks() held.
2136 * Note: we assume break_lease will only call this *once* for any given 2337 * Note: we assume break_lease will only call this *once* for any given
2137 * lease. 2338 * lease.
2138 */ 2339 */
@@ -2156,7 +2357,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2156 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2157 spin_unlock(&recall_lock); 2358 spin_unlock(&recall_lock);
2158 2359
2159 /* only place dl_time is set. protected by lock_kernel*/ 2360 /* only place dl_time is set. protected by lock_flocks*/
2160 dp->dl_time = get_seconds(); 2361 dp->dl_time = get_seconds();
2161 2362
2162 /* 2363 /*
@@ -2170,57 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2170 nfsd4_cb_recall(dp); 2371 nfsd4_cb_recall(dp);
2171} 2372}
2172 2373
2173/*
2174 * The file_lock is being reaped.
2175 *
2176 * Called by locks_free_lock() with lock_kernel() held.
2177 */
2178static
2179void nfsd_release_deleg_cb(struct file_lock *fl)
2180{
2181 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2182
2183 dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
2184
2185 if (!(fl->fl_flags & FL_LEASE) || !dp)
2186 return;
2187 dp->dl_flock = NULL;
2188}
2189
2190/*
2191 * Set the delegation file_lock back pointer.
2192 *
2193 * Called from setlease() with lock_kernel() held.
2194 */
2195static
2196void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
2197{
2198 struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
2199
2200 dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
2201 if (!dp)
2202 return;
2203 dp->dl_flock = new;
2204}
2205
2206/*
2207 * Called from setlease() with lock_kernel() held
2208 */
2209static
2210int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
2211{
2212 struct nfs4_delegation *onlistd =
2213 (struct nfs4_delegation *)onlist->fl_owner;
2214 struct nfs4_delegation *tryd =
2215 (struct nfs4_delegation *)try->fl_owner;
2216
2217 if (onlist->fl_lmops != try->fl_lmops)
2218 return 0;
2219
2220 return onlistd->dl_client == tryd->dl_client;
2221}
2222
2223
2224static 2374static
2225int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 2375int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2226{ 2376{
@@ -2232,9 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2232 2382
2233static const struct lock_manager_operations nfsd_lease_mng_ops = { 2383static const struct lock_manager_operations nfsd_lease_mng_ops = {
2234 .fl_break = nfsd_break_deleg_cb, 2384 .fl_break = nfsd_break_deleg_cb,
2235 .fl_release_private = nfsd_release_deleg_cb,
2236 .fl_copy_lock = nfsd_copy_lock_deleg_cb,
2237 .fl_mylease = nfsd_same_client_deleg_cb,
2238 .fl_change = nfsd_change_deleg_cb, 2385 .fl_change = nfsd_change_deleg_cb,
2239}; 2386};
2240 2387
@@ -2401,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
2401 if (!fp->fi_fds[oflag]) { 2548 if (!fp->fi_fds[oflag]) {
2402 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2549 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2403 &fp->fi_fds[oflag]); 2550 &fp->fi_fds[oflag]);
2404 if (status == nfserr_dropit)
2405 status = nfserr_jukebox;
2406 if (status) 2551 if (status)
2407 return status; 2552 return status;
2408 } 2553 }
@@ -2483,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
2483 open->op_stateowner->so_client->cl_firststate = 1; 2628 open->op_stateowner->so_client->cl_firststate = 1;
2484} 2629}
2485 2630
2631/* Should we give out recallable state?: */
2632static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2633{
2634 if (clp->cl_cb_state == NFSD4_CB_UP)
2635 return true;
2636 /*
2637 * In the sessions case, since we don't have to establish a
2638 * separate connection for callbacks, we assume it's OK
2639 * until we hear otherwise:
2640 */
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642}
2643
2486/* 2644/*
2487 * Attempt to hand out a delegation. 2645 * Attempt to hand out a delegation.
2488 */ 2646 */
@@ -2491,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2491{ 2649{
2492 struct nfs4_delegation *dp; 2650 struct nfs4_delegation *dp;
2493 struct nfs4_stateowner *sop = stp->st_stateowner; 2651 struct nfs4_stateowner *sop = stp->st_stateowner;
2494 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2652 int cb_up;
2495 struct file_lock fl, *flp = &fl; 2653 struct file_lock *fl;
2496 int status, flag = 0; 2654 int status, flag = 0;
2497 2655
2656 cb_up = nfsd4_cb_channel_good(sop->so_client);
2498 flag = NFS4_OPEN_DELEGATE_NONE; 2657 flag = NFS4_OPEN_DELEGATE_NONE;
2499 open->op_recall = 0; 2658 open->op_recall = 0;
2500 switch (open->op_claim_type) { 2659 switch (open->op_claim_type) {
@@ -2526,21 +2685,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2526 flag = NFS4_OPEN_DELEGATE_NONE; 2685 flag = NFS4_OPEN_DELEGATE_NONE;
2527 goto out; 2686 goto out;
2528 } 2687 }
2529 locks_init_lock(&fl); 2688 status = -ENOMEM;
2530 fl.fl_lmops = &nfsd_lease_mng_ops; 2689 fl = locks_alloc_lock();
2531 fl.fl_flags = FL_LEASE; 2690 if (!fl)
2532 fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 2691 goto out;
2533 fl.fl_end = OFFSET_MAX; 2692 locks_init_lock(fl);
2534 fl.fl_owner = (fl_owner_t)dp; 2693 fl->fl_lmops = &nfsd_lease_mng_ops;
2535 fl.fl_file = find_readable_file(stp->st_file); 2694 fl->fl_flags = FL_LEASE;
2536 BUG_ON(!fl.fl_file); 2695 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2537 fl.fl_pid = current->tgid; 2696 fl->fl_end = OFFSET_MAX;
2697 fl->fl_owner = (fl_owner_t)dp;
2698 fl->fl_file = find_readable_file(stp->st_file);
2699 BUG_ON(!fl->fl_file);
2700 fl->fl_pid = current->tgid;
2701 dp->dl_flock = fl;
2538 2702
2539 /* vfs_setlease checks to see if delegation should be handed out. 2703 /* vfs_setlease checks to see if delegation should be handed out.
2540 * the lock_manager callbacks fl_mylease and fl_change are used 2704 * the lock_manager callback fl_change is used
2541 */ 2705 */
2542 if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { 2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2543 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2708 dp->dl_flock = NULL;
2709 locks_free_lock(fl);
2544 unhash_delegation(dp); 2710 unhash_delegation(dp);
2545 flag = NFS4_OPEN_DELEGATE_NONE; 2711 flag = NFS4_OPEN_DELEGATE_NONE;
2546 goto out; 2712 goto out;
@@ -2674,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2674 renew_client(clp); 2840 renew_client(clp);
2675 status = nfserr_cb_path_down; 2841 status = nfserr_cb_path_down;
2676 if (!list_empty(&clp->cl_delegations) 2842 if (!list_empty(&clp->cl_delegations)
2677 && !atomic_read(&clp->cl_cb_set)) 2843 && clp->cl_cb_state != NFSD4_CB_UP)
2678 goto out; 2844 goto out;
2679 status = nfs_ok; 2845 status = nfs_ok;
2680out: 2846out:
@@ -2944,7 +3110,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2944 if (STALE_STATEID(stateid)) 3110 if (STALE_STATEID(stateid))
2945 goto out; 3111 goto out;
2946 3112
2947 status = nfserr_bad_stateid; 3113 /*
3114 * We assume that any stateid that has the current boot time,
3115 * but that we can't find, is expired:
3116 */
3117 status = nfserr_expired;
2948 if (is_delegation_stateid(stateid)) { 3118 if (is_delegation_stateid(stateid)) {
2949 dp = find_delegation_stateid(ino, stateid); 3119 dp = find_delegation_stateid(ino, stateid);
2950 if (!dp) 3120 if (!dp)
@@ -2957,13 +3127,15 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2957 if (status) 3127 if (status)
2958 goto out; 3128 goto out;
2959 renew_client(dp->dl_client); 3129 renew_client(dp->dl_client);
2960 if (filpp) 3130 if (filpp) {
2961 *filpp = find_readable_file(dp->dl_file); 3131 *filpp = find_readable_file(dp->dl_file);
2962 BUG_ON(!*filpp); 3132 BUG_ON(!*filpp);
3133 }
2963 } else { /* open or lock stateid */ 3134 } else { /* open or lock stateid */
2964 stp = find_stateid(stateid, flags); 3135 stp = find_stateid(stateid, flags);
2965 if (!stp) 3136 if (!stp)
2966 goto out; 3137 goto out;
3138 status = nfserr_bad_stateid;
2967 if (nfs4_check_fh(current_fh, stp)) 3139 if (nfs4_check_fh(current_fh, stp))
2968 goto out; 3140 goto out;
2969 if (!stp->st_stateowner->so_confirmed) 3141 if (!stp->st_stateowner->so_confirmed)
@@ -3038,8 +3210,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3038 * a replayed close: 3210 * a replayed close:
3039 */ 3211 */
3040 sop = search_close_lru(stateid->si_stateownerid, flags); 3212 sop = search_close_lru(stateid->si_stateownerid, flags);
3213 /* It's not stale; let's assume it's expired: */
3041 if (sop == NULL) 3214 if (sop == NULL)
3042 return nfserr_bad_stateid; 3215 return nfserr_expired;
3043 *sopp = sop; 3216 *sopp = sop;
3044 goto check_replay; 3217 goto check_replay;
3045 } 3218 }
@@ -3304,6 +3477,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3304 status = nfserr_bad_stateid; 3477 status = nfserr_bad_stateid;
3305 if (!is_delegation_stateid(stateid)) 3478 if (!is_delegation_stateid(stateid))
3306 goto out; 3479 goto out;
3480 status = nfserr_expired;
3307 dp = find_delegation_stateid(inode, stateid); 3481 dp = find_delegation_stateid(inode, stateid);
3308 if (!dp) 3482 if (!dp)
3309 goto out; 3483 goto out;
@@ -3895,7 +4069,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3895 struct inode *inode = filp->fi_inode; 4069 struct inode *inode = filp->fi_inode;
3896 int status = 0; 4070 int status = 0;
3897 4071
3898 lock_kernel(); 4072 lock_flocks();
3899 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4073 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
3900 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4074 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
3901 status = 1; 4075 status = 1;
@@ -3903,7 +4077,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3903 } 4077 }
3904 } 4078 }
3905out: 4079out:
3906 unlock_kernel(); 4080 unlock_flocks();
3907 return status; 4081 return status;
3908} 4082}
3909 4083
@@ -3980,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3980 unsigned int strhashval = clientstr_hashval(name); 4154 unsigned int strhashval = clientstr_hashval(name);
3981 struct nfs4_client *clp; 4155 struct nfs4_client *clp;
3982 4156
3983 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); 4157 clp = find_confirmed_client_by_str(name, strhashval);
3984 return clp ? 1 : 0; 4158 return clp ? 1 : 0;
3985} 4159}
3986 4160
@@ -4209,7 +4383,7 @@ __nfs4_state_shutdown(void)
4209void 4383void
4210nfs4_state_shutdown(void) 4384nfs4_state_shutdown(void)
4211{ 4385{
4212 cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); 4386 cancel_delayed_work_sync(&laundromat_work);
4213 destroy_workqueue(laundry_wq); 4387 destroy_workqueue(laundry_wq);
4214 locks_end_grace(&nfsd4_manager); 4388 locks_end_grace(&nfsd4_manager);
4215 nfs4_lock_state(); 4389 nfs4_lock_state();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
44#include <linux/namei.h> 44#include <linux/namei.h>
45#include <linux/statfs.h> 45#include <linux/statfs.h>
46#include <linux/utsname.h> 46#include <linux/utsname.h>
47#include <linux/nfsd_idmap.h>
48#include <linux/nfs4_acl.h>
49#include <linux/sunrpc/svcauth_gss.h> 47#include <linux/sunrpc/svcauth_gss.h>
50 48
49#include "idmap.h"
50#include "acl.h"
51#include "xdr4.h" 51#include "xdr4.h"
52#include "vfs.h" 52#include "vfs.h"
53 53
54
54#define NFSDDBG_FACILITY NFSDDBG_XDR 55#define NFSDDBG_FACILITY NFSDDBG_XDR
55 56
56/* 57/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
288 len += XDR_QUADLEN(dummy32) << 2; 289 len += XDR_QUADLEN(dummy32) << 2;
289 READMEM(buf, dummy32); 290 READMEM(buf, dummy32);
290 ace->whotype = nfs4_acl_get_whotype(buf, dummy32); 291 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
291 host_err = 0; 292 status = nfs_ok;
292 if (ace->whotype != NFS4_ACL_WHO_NAMED) 293 if (ace->whotype != NFS4_ACL_WHO_NAMED)
293 ace->who = 0; 294 ace->who = 0;
294 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 295 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
295 host_err = nfsd_map_name_to_gid(argp->rqstp, 296 status = nfsd_map_name_to_gid(argp->rqstp,
296 buf, dummy32, &ace->who); 297 buf, dummy32, &ace->who);
297 else 298 else
298 host_err = nfsd_map_name_to_uid(argp->rqstp, 299 status = nfsd_map_name_to_uid(argp->rqstp,
299 buf, dummy32, &ace->who); 300 buf, dummy32, &ace->who);
300 if (host_err) 301 if (status)
301 goto out_nfserr; 302 return status;
302 } 303 }
303 } else 304 } else
304 *acl = NULL; 305 *acl = NULL;
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
420 DECODE_TAIL; 421 DECODE_TAIL;
421} 422}
422 423
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{
426 DECODE_HEAD;
427 u32 dummy;
428
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we
433 * should be using ctsa_use_conn_in_rdma_mode: */
434 READ32(dummy);
435
436 DECODE_TAIL;
437}
438
423static __be32 439static __be32
424nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 440nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
425{ 441{
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
847} 863}
848 864
849static __be32 865static __be32
866nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
867 struct nfsd4_secinfo_no_name *sin)
868{
869 DECODE_HEAD;
870
871 READ_BUF(4);
872 READ32(sin->sin_style);
873 DECODE_TAIL;
874}
875
876static __be32
850nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 877nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
851{ 878{
852 __be32 status; 879 __be32 status;
@@ -1005,7 +1032,7 @@ static __be32
1005nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, 1032nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1006 struct nfsd4_exchange_id *exid) 1033 struct nfsd4_exchange_id *exid)
1007{ 1034{
1008 int dummy; 1035 int dummy, tmp;
1009 DECODE_HEAD; 1036 DECODE_HEAD;
1010 1037
1011 READ_BUF(NFS4_VERIFIER_SIZE); 1038 READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1053 1080
1054 /* ssp_hash_algs<> */ 1081 /* ssp_hash_algs<> */
1055 READ_BUF(4); 1082 READ_BUF(4);
1056 READ32(dummy); 1083 READ32(tmp);
1057 READ_BUF(dummy); 1084 while (tmp--) {
1058 p += XDR_QUADLEN(dummy); 1085 READ_BUF(4);
1086 READ32(dummy);
1087 READ_BUF(dummy);
1088 p += XDR_QUADLEN(dummy);
1089 }
1059 1090
1060 /* ssp_encr_algs<> */ 1091 /* ssp_encr_algs<> */
1061 READ_BUF(4); 1092 READ_BUF(4);
1062 READ32(dummy); 1093 READ32(tmp);
1063 READ_BUF(dummy); 1094 while (tmp--) {
1064 p += XDR_QUADLEN(dummy); 1095 READ_BUF(4);
1096 READ32(dummy);
1097 READ_BUF(dummy);
1098 p += XDR_QUADLEN(dummy);
1099 }
1065 1100
1066 /* ssp_window and ssp_num_gss_handles */ 1101 /* ssp_window and ssp_num_gss_handles */
1067 READ_BUF(8); 1102 READ_BUF(8);
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1339 1374
1340 /* new operations for NFSv4.1 */ 1375 /* new operations for NFSv4.1 */
1341 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1376 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1342 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, 1377 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1343 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1378 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1344 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1379 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1345 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1380 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1350 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1385 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1351 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1386 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1387 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1353 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, 1388 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1354 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1389 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1355 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1390 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1391 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -1805,19 +1840,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1805 goto out_nfserr; 1840 goto out_nfserr;
1806 } 1841 }
1807 } 1842 }
1808 if ((buflen -= 16) < 0)
1809 goto out_resource;
1810 1843
1811 if (unlikely(bmval2)) { 1844 if (bmval2) {
1845 if ((buflen -= 16) < 0)
1846 goto out_resource;
1812 WRITE32(3); 1847 WRITE32(3);
1813 WRITE32(bmval0); 1848 WRITE32(bmval0);
1814 WRITE32(bmval1); 1849 WRITE32(bmval1);
1815 WRITE32(bmval2); 1850 WRITE32(bmval2);
1816 } else if (likely(bmval1)) { 1851 } else if (bmval1) {
1852 if ((buflen -= 12) < 0)
1853 goto out_resource;
1817 WRITE32(2); 1854 WRITE32(2);
1818 WRITE32(bmval0); 1855 WRITE32(bmval0);
1819 WRITE32(bmval1); 1856 WRITE32(bmval1);
1820 } else { 1857 } else {
1858 if ((buflen -= 8) < 0)
1859 goto out_resource;
1821 WRITE32(1); 1860 WRITE32(1);
1822 WRITE32(bmval0); 1861 WRITE32(bmval0);
1823 } 1862 }
@@ -1828,15 +1867,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1828 u32 word1 = nfsd_suppattrs1(minorversion); 1867 u32 word1 = nfsd_suppattrs1(minorversion);
1829 u32 word2 = nfsd_suppattrs2(minorversion); 1868 u32 word2 = nfsd_suppattrs2(minorversion);
1830 1869
1831 if ((buflen -= 12) < 0)
1832 goto out_resource;
1833 if (!aclsupport) 1870 if (!aclsupport)
1834 word0 &= ~FATTR4_WORD0_ACL; 1871 word0 &= ~FATTR4_WORD0_ACL;
1835 if (!word2) { 1872 if (!word2) {
1873 if ((buflen -= 12) < 0)
1874 goto out_resource;
1836 WRITE32(2); 1875 WRITE32(2);
1837 WRITE32(word0); 1876 WRITE32(word0);
1838 WRITE32(word1); 1877 WRITE32(word1);
1839 } else { 1878 } else {
1879 if ((buflen -= 16) < 0)
1880 goto out_resource;
1840 WRITE32(3); 1881 WRITE32(3);
1841 WRITE32(word0); 1882 WRITE32(word0);
1842 WRITE32(word1); 1883 WRITE32(word1);
@@ -2303,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2303 case nfserr_resource: 2344 case nfserr_resource:
2304 nfserr = nfserr_toosmall; 2345 nfserr = nfserr_toosmall;
2305 goto fail; 2346 goto fail;
2306 case nfserr_dropit:
2307 goto fail;
2308 case nfserr_noent: 2347 case nfserr_noent:
2309 goto skip_entry; 2348 goto skip_entry;
2310 default: 2349 default:
@@ -2359,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2359 return nfserr; 2398 return nfserr;
2360} 2399}
2361 2400
2401static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
2402{
2403 __be32 *p;
2404
2405 if (!nfserr) {
2406 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2407 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2408 WRITE32(bcts->dir);
2409 /* XXX: ? */
2410 WRITE32(0);
2411 ADJUST_ARGS();
2412 }
2413 return nfserr;
2414}
2415
2362static __be32 2416static __be32
2363nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) 2417nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
2364{ 2418{
@@ -2820,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2820} 2874}
2821 2875
2822static __be32 2876static __be32
2823nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 2877nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
2824 struct nfsd4_secinfo *secinfo) 2878 __be32 nfserr, struct svc_export *exp)
2825{ 2879{
2826 int i = 0; 2880 int i = 0;
2827 struct svc_export *exp = secinfo->si_exp;
2828 u32 nflavs; 2881 u32 nflavs;
2829 struct exp_flavor_info *flavs; 2882 struct exp_flavor_info *flavs;
2830 struct exp_flavor_info def_flavs[2]; 2883 struct exp_flavor_info def_flavs[2];
@@ -2886,6 +2939,20 @@ out:
2886 return nfserr; 2939 return nfserr;
2887} 2940}
2888 2941
2942static __be32
2943nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2944 struct nfsd4_secinfo *secinfo)
2945{
2946 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
2947}
2948
2949static __be32
2950nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
2951 struct nfsd4_secinfo_no_name *secinfo)
2952{
2953 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
2954}
2955
2889/* 2956/*
2890 * The SETATTR encode routine is special -- it always encodes a bitmap, 2957 * The SETATTR encode routine is special -- it always encodes a bitmap,
2891 * regardless of the error status. 2958 * regardless of the error status.
@@ -3070,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3070 WRITE32(seq->seqid); 3137 WRITE32(seq->seqid);
3071 WRITE32(seq->slotid); 3138 WRITE32(seq->slotid);
3072 WRITE32(seq->maxslots); 3139 WRITE32(seq->maxslots);
3073 /* 3140 /* For now: target_maxslots = maxslots */
3074 * FIXME: for now:
3075 * target_maxslots = maxslots
3076 * status_flags = 0
3077 */
3078 WRITE32(seq->maxslots); 3141 WRITE32(seq->maxslots);
3079 WRITE32(0); 3142 WRITE32(seq->status_flags);
3080 3143
3081 ADJUST_ARGS(); 3144 ADJUST_ARGS();
3082 resp->cstate.datap = p; /* DRC cache data pointer */ 3145 resp->cstate.datap = p; /* DRC cache data pointer */
@@ -3137,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3137 3200
3138 /* NFSv4.1 operations */ 3201 /* NFSv4.1 operations */
3139 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, 3202 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3140 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 3203 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3141 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3204 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3142 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3205 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3143 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3206 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3148,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3148 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 3211 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3149 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 3212 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3213 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, 3214 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3152 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 3215 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3153 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 3216 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3154 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 3217 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/nfsd_idmap.h>
12#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
13#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
16 15
16#include "idmap.h"
17#include "nfsd.h" 17#include "nfsd.h"
18#include "cache.h" 18#include "cache.h"
19 19
@@ -22,6 +22,7 @@
22 */ 22 */
23enum { 23enum {
24 NFSD_Root = 1, 24 NFSD_Root = 1,
25#ifdef CONFIG_NFSD_DEPRECATED
25 NFSD_Svc, 26 NFSD_Svc,
26 NFSD_Add, 27 NFSD_Add,
27 NFSD_Del, 28 NFSD_Del,
@@ -29,6 +30,7 @@ enum {
29 NFSD_Unexport, 30 NFSD_Unexport,
30 NFSD_Getfd, 31 NFSD_Getfd,
31 NFSD_Getfs, 32 NFSD_Getfs,
33#endif
32 NFSD_List, 34 NFSD_List,
33 NFSD_Export_features, 35 NFSD_Export_features,
34 NFSD_Fh, 36 NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
54/* 56/*
55 * write() for these nodes. 57 * write() for these nodes.
56 */ 58 */
59#ifdef CONFIG_NFSD_DEPRECATED
57static ssize_t write_svc(struct file *file, char *buf, size_t size); 60static ssize_t write_svc(struct file *file, char *buf, size_t size);
58static ssize_t write_add(struct file *file, char *buf, size_t size); 61static ssize_t write_add(struct file *file, char *buf, size_t size);
59static ssize_t write_del(struct file *file, char *buf, size_t size); 62static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
61static ssize_t write_unexport(struct file *file, char *buf, size_t size); 64static ssize_t write_unexport(struct file *file, char *buf, size_t size);
62static ssize_t write_getfd(struct file *file, char *buf, size_t size); 65static ssize_t write_getfd(struct file *file, char *buf, size_t size);
63static ssize_t write_getfs(struct file *file, char *buf, size_t size); 66static ssize_t write_getfs(struct file *file, char *buf, size_t size);
67#endif
64static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 68static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
65static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); 69static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
66static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); 70static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
76#endif 80#endif
77 81
78static ssize_t (*write_op[])(struct file *, char *, size_t) = { 82static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83#ifdef CONFIG_NFSD_DEPRECATED
79 [NFSD_Svc] = write_svc, 84 [NFSD_Svc] = write_svc,
80 [NFSD_Add] = write_add, 85 [NFSD_Add] = write_add,
81 [NFSD_Del] = write_del, 86 [NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83 [NFSD_Unexport] = write_unexport, 88 [NFSD_Unexport] = write_unexport,
84 [NFSD_Getfd] = write_getfd, 89 [NFSD_Getfd] = write_getfd,
85 [NFSD_Getfs] = write_getfs, 90 [NFSD_Getfs] = write_getfs,
91#endif
86 [NFSD_Fh] = write_filehandle, 92 [NFSD_Fh] = write_filehandle,
87 [NFSD_FO_UnlockIP] = write_unlock_ip, 93 [NFSD_FO_UnlockIP] = write_unlock_ip,
88 [NFSD_FO_UnlockFS] = write_unlock_fs, 94 [NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,16 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
121 127
122static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
123{ 129{
130#ifdef CONFIG_NFSD_DEPRECATED
131 static int warned;
132 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
133 printk(KERN_INFO
134 "Warning: \"%s\" uses deprecated NFSD interface: %s."
135 " This will be removed in 2.6.40\n",
136 current->comm, file->f_dentry->d_name.name);
137 warned = 1;
138 }
139#endif
124 if (! file->private_data) { 140 if (! file->private_data) {
125 /* An attempt to read a transaction file without writing 141 /* An attempt to read a transaction file without writing
126 * causes a 0-byte write so that the file can return 142 * causes a 0-byte write so that the file can return
@@ -137,6 +153,7 @@ static const struct file_operations transaction_ops = {
137 .write = nfsctl_transaction_write, 153 .write = nfsctl_transaction_write,
138 .read = nfsctl_transaction_read, 154 .read = nfsctl_transaction_read,
139 .release = simple_transaction_release, 155 .release = simple_transaction_release,
156 .llseek = default_llseek,
140}; 157};
141 158
142static int exports_open(struct inode *inode, struct file *file) 159static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +203,7 @@ static const struct file_operations pool_stats_operations = {
186 * payload - write methods 203 * payload - write methods
187 */ 204 */
188 205
206#ifdef CONFIG_NFSD_DEPRECATED
189/** 207/**
190 * write_svc - Start kernel's NFSD server 208 * write_svc - Start kernel's NFSD server
191 * 209 *
@@ -401,7 +419,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
401 419
402 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 420 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
403 421
404 clp = auth_unix_lookup(&in6); 422 clp = auth_unix_lookup(&init_net, &in6);
405 if (!clp) 423 if (!clp)
406 err = -EPERM; 424 err = -EPERM;
407 else { 425 else {
@@ -464,7 +482,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
464 482
465 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 483 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
466 484
467 clp = auth_unix_lookup(&in6); 485 clp = auth_unix_lookup(&init_net, &in6);
468 if (!clp) 486 if (!clp)
469 err = -EPERM; 487 err = -EPERM;
470 else { 488 else {
@@ -481,6 +499,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
481 out: 499 out:
482 return err; 500 return err;
483} 501}
502#endif /* CONFIG_NFSD_DEPRECATED */
484 503
485/** 504/**
486 * write_unlock_ip - Release all locks used by a client 505 * write_unlock_ip - Release all locks used by a client
@@ -999,12 +1018,12 @@ static ssize_t __write_ports_addxprt(char *buf)
999 if (err != 0) 1018 if (err != 0)
1000 return err; 1019 return err;
1001 1020
1002 err = svc_create_xprt(nfsd_serv, transport, 1021 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1003 PF_INET, port, SVC_SOCK_ANONYMOUS); 1022 PF_INET, port, SVC_SOCK_ANONYMOUS);
1004 if (err < 0) 1023 if (err < 0)
1005 goto out_err; 1024 goto out_err;
1006 1025
1007 err = svc_create_xprt(nfsd_serv, transport, 1026 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1008 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1027 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1009 if (err < 0 && err != -EAFNOSUPPORT) 1028 if (err < 0 && err != -EAFNOSUPPORT)
1010 goto out_close; 1029 goto out_close;
@@ -1355,6 +1374,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1355static int nfsd_fill_super(struct super_block * sb, void * data, int silent) 1374static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1356{ 1375{
1357 static struct tree_descr nfsd_files[] = { 1376 static struct tree_descr nfsd_files[] = {
1377#ifdef CONFIG_NFSD_DEPRECATED
1358 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, 1378 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
1359 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, 1379 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
1360 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, 1380 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1362,6 +1382,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1362 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, 1382 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
1363 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1383 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1364 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1384 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1385#endif
1365 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1386 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1366 [NFSD_Export_features] = {"export_features", 1387 [NFSD_Export_features] = {"export_features",
1367 &export_features_operations, S_IRUGO}, 1388 &export_features_operations, S_IRUGO},
@@ -1386,16 +1407,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1386 return simple_fill_super(sb, 0x6e667364, nfsd_files); 1407 return simple_fill_super(sb, 0x6e667364, nfsd_files);
1387} 1408}
1388 1409
1389static int nfsd_get_sb(struct file_system_type *fs_type, 1410static struct dentry *nfsd_mount(struct file_system_type *fs_type,
1390 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1411 int flags, const char *dev_name, void *data)
1391{ 1412{
1392 return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); 1413 return mount_single(fs_type, flags, data, nfsd_fill_super);
1393} 1414}
1394 1415
1395static struct file_system_type nfsd_fs_type = { 1416static struct file_system_type nfsd_fs_type = {
1396 .owner = THIS_MODULE, 1417 .owner = THIS_MODULE,
1397 .name = "nfsd", 1418 .name = "nfsd",
1398 .get_sb = nfsd_get_sb, 1419 .mount = nfsd_mount,
1399 .kill_sb = kill_litter_super, 1420 .kill_sb = kill_litter_super,
1400}; 1421};
1401 1422
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) 160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
161#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
161#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) 162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
162#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) 163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
163#define nfserr_grace cpu_to_be32(NFSERR_GRACE) 164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
@@ -249,7 +250,7 @@ extern time_t nfsd4_grace;
249#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 250#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
250#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 251#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
251 252
252#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 253#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
253 254
254/* 255/*
255 * The following attributes are currently not supported by the NFSv4 server: 256 * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
735 { nfserr_stale, -ESTALE }, 735 { nfserr_stale, -ESTALE },
736 { nfserr_jukebox, -ETIMEDOUT }, 736 { nfserr_jukebox, -ETIMEDOUT },
737 { nfserr_jukebox, -ERESTARTSYS }, 737 { nfserr_jukebox, -ERESTARTSYS },
738 { nfserr_dropit, -EAGAIN }, 738 { nfserr_jukebox, -EAGAIN },
739 { nfserr_dropit, -ENOMEM }, 739 { nfserr_jukebox, -EWOULDBLOCK },
740 { nfserr_badname, -ESRCH }, 740 { nfserr_jukebox, -ENOMEM },
741 { nfserr_io, -ETXTBSY }, 741 { nfserr_io, -ETXTBSY },
742 { nfserr_notsupp, -EOPNOTSUPP }, 742 { nfserr_notsupp, -EOPNOTSUPP },
743 { nfserr_toosmall, -ETOOSMALL }, 743 { nfserr_toosmall, -ETOOSMALL },
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
16#include <linux/lockd/bind.h> 16#include <linux/lockd/bind.h>
17#include <linux/nfsacl.h> 17#include <linux/nfsacl.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <net/net_namespace.h>
19#include "nfsd.h" 20#include "nfsd.h"
20#include "cache.h" 21#include "cache.h"
21#include "vfs.h" 22#include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
186 if (!list_empty(&nfsd_serv->sv_permsocks)) 187 if (!list_empty(&nfsd_serv->sv_permsocks))
187 return 0; 188 return 0;
188 189
189 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, 190 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
190 SVC_SOCK_DEFAULTS); 191 SVC_SOCK_DEFAULTS);
191 if (error < 0) 192 if (error < 0)
192 return error; 193 return error;
193 194
194 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, 195 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
195 SVC_SOCK_DEFAULTS); 196 SVC_SOCK_DEFAULTS);
196 if (error < 0) 197 if (error < 0)
197 return error; 198 return error;
@@ -607,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
607 /* Now call the procedure handler, and encode NFS status. */ 608 /* Now call the procedure handler, and encode NFS status. */
608 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
609 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 610 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
610 if (nfserr == nfserr_dropit) { 611 if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
611 dprintk("nfsd: Dropping request; may be revisited later\n"); 612 dprintk("nfsd: Dropping request; may be revisited later\n");
612 nfsd_cache_update(rqstp, RC_NOCACHE, NULL); 613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
613 return 0; 614 return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
35#ifndef _NFSD4_STATE_H 35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H 36#define _NFSD4_STATE_H
37 37
38#include <linux/sunrpc/svc_xprt.h>
38#include <linux/nfsd/nfsfh.h> 39#include <linux/nfsd/nfsfh.h>
39#include "nfsfh.h" 40#include "nfsfh.h"
40 41
@@ -64,20 +65,15 @@ typedef struct {
64 (s)->si_fileid, \ 65 (s)->si_fileid, \
65 (s)->si_generation 66 (s)->si_generation
66 67
67struct nfsd4_cb_sequence {
68 /* args/res */
69 u32 cbs_minorversion;
70 struct nfs4_client *cbs_clp;
71};
72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback { 68struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args; 69 void *cb_op;
70 struct nfs4_client *cb_clp;
71 struct list_head cb_per_client;
72 u32 cb_minorversion;
73 struct rpc_message cb_msg;
74 const struct rpc_call_ops *cb_ops;
80 struct work_struct cb_work; 75 struct work_struct cb_work;
76 bool cb_done;
81}; 77};
82 78
83struct nfs4_delegation { 79struct nfs4_delegation {
@@ -87,11 +83,11 @@ struct nfs4_delegation {
87 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
88 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
89 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
90 struct file_lock *dl_flock; 87 struct file_lock *dl_flock;
91 u32 dl_type; 88 u32 dl_type;
92 time_t dl_time; 89 time_t dl_time;
93/* For recall: */ 90/* For recall: */
94 u32 dl_ident;
95 stateid_t dl_stateid; 91 stateid_t dl_stateid;
96 struct knfsd_fh dl_fh; 92 struct knfsd_fh dl_fh;
97 int dl_retries; 93 int dl_retries;
@@ -102,9 +98,10 @@ struct nfs4_delegation {
102struct nfs4_cb_conn { 98struct nfs4_cb_conn {
103 /* SETCLIENTID info */ 99 /* SETCLIENTID info */
104 struct sockaddr_storage cb_addr; 100 struct sockaddr_storage cb_addr;
101 struct sockaddr_storage cb_saddr;
105 size_t cb_addrlen; 102 size_t cb_addrlen;
106 u32 cb_prog; 103 u32 cb_prog; /* used only in 4.0 case;
107 u32 cb_minorversion; 104 per-session otherwise */
108 u32 cb_ident; /* minorversion 0 only */ 105 u32 cb_ident; /* minorversion 0 only */
109 struct svc_xprt *cb_xprt; /* minorversion 1 only */ 106 struct svc_xprt *cb_xprt; /* minorversion 1 only */
110}; 107};
@@ -153,6 +150,11 @@ struct nfsd4_create_session {
153 u32 gid; 150 u32 gid;
154}; 151};
155 152
153struct nfsd4_bind_conn_to_session {
154 struct nfs4_sessionid sessionid;
155 u32 dir;
156};
157
156/* The single slot clientid cache structure */ 158/* The single slot clientid cache structure */
157struct nfsd4_clid_slot { 159struct nfsd4_clid_slot {
158 u32 sl_seqid; 160 u32 sl_seqid;
@@ -160,6 +162,15 @@ struct nfsd4_clid_slot {
160 struct nfsd4_create_session sl_cr_ses; 162 struct nfsd4_create_session sl_cr_ses;
161}; 163};
162 164
165struct nfsd4_conn {
166 struct list_head cn_persession;
167 struct svc_xprt *cn_xprt;
168 struct svc_xpt_user cn_xpt_user;
169 struct nfsd4_session *cn_session;
170/* CDFC4_FORE, CDFC4_BACK: */
171 unsigned char cn_flags;
172};
173
163struct nfsd4_session { 174struct nfsd4_session {
164 struct kref se_ref; 175 struct kref se_ref;
165 struct list_head se_hash; /* hash by sessionid */ 176 struct list_head se_hash; /* hash by sessionid */
@@ -169,6 +180,9 @@ struct nfsd4_session {
169 struct nfs4_sessionid se_sessionid; 180 struct nfs4_sessionid se_sessionid;
170 struct nfsd4_channel_attrs se_fchannel; 181 struct nfsd4_channel_attrs se_fchannel;
171 struct nfsd4_channel_attrs se_bchannel; 182 struct nfsd4_channel_attrs se_bchannel;
183 struct list_head se_conns;
184 u32 se_cb_prog;
185 u32 se_cb_seq_nr;
172 struct nfsd4_slot *se_slots[]; /* forward channel slots */ 186 struct nfsd4_slot *se_slots[]; /* forward channel slots */
173}; 187};
174 188
@@ -221,24 +235,36 @@ struct nfs4_client {
221 clientid_t cl_clientid; /* generated by server */ 235 clientid_t cl_clientid; /* generated by server */
222 nfs4_verifier cl_confirm; /* generated by server */ 236 nfs4_verifier cl_confirm; /* generated by server */
223 u32 cl_firststate; /* recovery dir creation */ 237 u32 cl_firststate; /* recovery dir creation */
238 u32 cl_minorversion;
224 239
225 /* for v4.0 and v4.1 callbacks: */ 240 /* for v4.0 and v4.1 callbacks: */
226 struct nfs4_cb_conn cl_cb_conn; 241 struct nfs4_cb_conn cl_cb_conn;
242#define NFSD4_CLIENT_CB_UPDATE 1
243#define NFSD4_CLIENT_KILL 2
244 unsigned long cl_cb_flags;
227 struct rpc_clnt *cl_cb_client; 245 struct rpc_clnt *cl_cb_client;
228 atomic_t cl_cb_set; 246 u32 cl_cb_ident;
247#define NFSD4_CB_UP 0
248#define NFSD4_CB_UNKNOWN 1
249#define NFSD4_CB_DOWN 2
250 int cl_cb_state;
251 struct nfsd4_callback cl_cb_null;
252 struct nfsd4_session *cl_cb_session;
253 struct list_head cl_callbacks; /* list of in-progress callbacks */
254
255 /* for all client information that callback code might need: */
256 spinlock_t cl_lock;
229 257
230 /* for nfs41 */ 258 /* for nfs41 */
231 struct list_head cl_sessions; 259 struct list_head cl_sessions;
232 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 260 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
233 u32 cl_exchange_flags; 261 u32 cl_exchange_flags;
234 struct nfs4_sessionid cl_sessionid;
235 /* number of rpc's in progress over an associated session: */ 262 /* number of rpc's in progress over an associated session: */
236 atomic_t cl_refcount; 263 atomic_t cl_refcount;
237 264
238 /* for nfs41 callbacks */ 265 /* for nfs41 callbacks */
239 /* We currently support a single back channel with a single slot */ 266 /* We currently support a single back channel with a single slot */
240 unsigned long cl_cb_slot_busy; 267 unsigned long cl_cb_slot_busy;
241 u32 cl_cb_seq_nr;
242 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 268 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
243 /* wait here for slots */ 269 /* wait here for slots */
244}; 270};
@@ -440,12 +466,14 @@ extern int nfs4_in_grace(void);
440extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 466extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
441extern void nfs4_free_stateowner(struct kref *kref); 467extern void nfs4_free_stateowner(struct kref *kref);
442extern int set_callback_cred(void); 468extern int set_callback_cred(void);
443extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
471extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
444extern void nfsd4_do_callback_rpc(struct work_struct *); 472extern void nfsd4_do_callback_rpc(struct work_struct *);
445extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 473extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
446extern int nfsd4_create_callback_queue(void); 474extern int nfsd4_create_callback_queue(void);
447extern void nfsd4_destroy_callback_queue(void); 475extern void nfsd4_destroy_callback_queue(void);
448extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); 476extern void nfsd4_shutdown_callback(struct nfs4_client *);
449extern void nfs4_put_delegation(struct nfs4_delegation *dp); 477extern void nfs4_put_delegation(struct nfs4_delegation *dp);
450extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 478extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
451extern void nfsd4_init_recdir(char *recdir_name); 479extern void nfsd4_init_recdir(char *recdir_name);
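
A note on the new callback state: the cl_cb_flags bits and the NFSD4_CB_* tri-state above replace the old atomic cl_cb_set. A minimal hedged sketch of how a caller might combine them; nfsd4_cb_needs_probe() is an invented name, only the field and constant names come from the hunk above:

static bool nfsd4_cb_needs_probe(struct nfs4_client *clp)
{
	/* cl_cb_flags holds bit numbers, so test_bit() applies */
	return clp->cl_cb_state != NFSD4_CB_UP ||
	       test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
}
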
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..641117f2188d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * File operations used by nfsd. Some of these have been ripped from 2 * File operations used by nfsd. Some of these have been ripped from
4 * other parts of the kernel because they weren't exported, others 3 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
35#endif /* CONFIG_NFSD_V3 */ 34#endif /* CONFIG_NFSD_V3 */
36 35
37#ifdef CONFIG_NFSD_V4 36#ifdef CONFIG_NFSD_V4
38#include <linux/nfs4_acl.h> 37#include "acl.h"
39#include <linux/nfsd_idmap.h> 38#include "idmap.h"
40#endif /* CONFIG_NFSD_V4 */ 39#endif /* CONFIG_NFSD_V4 */
41 40
42#include "nfsd.h" 41#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
88 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
89 int err = 0; 88 int err = 0;
90 89
91 while (d_mountpoint(path.dentry) && follow_down(&path)) 90 err = follow_down(&path, false);
92 ; 91 if (err < 0)
92 goto out;
93 93
94 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
95 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
273 return err; 273 return err;
274} 274}
275 275
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
276/* 283/*
277 * Commit metadata changes to stable storage. 284 * Commit metadata changes to stable storage.
278 */ 285 */
@@ -281,23 +288,13 @@ commit_metadata(struct svc_fh *fhp)
281{ 288{
282 struct inode *inode = fhp->fh_dentry->d_inode; 289 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op; 290 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285 291
286 if (!EX_ISSYNC(fhp->fh_export)) 292 if (!EX_ISSYNC(fhp->fh_export))
287 return 0; 293 return 0;
288 294
289 if (export_ops->commit_metadata) { 295 if (export_ops->commit_metadata)
290 error = export_ops->commit_metadata(inode); 296 return export_ops->commit_metadata(inode);
291 } else { 297 return sync_inode_metadata(inode, 1);
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
301} 298}
302 299
303/* 300/*
@@ -385,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
385 goto out; 382 goto out;
386 } 383 }
387 384
388 /*
389 * If we are changing the size of the file, then
390 * we need to break all leases.
391 */
392 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
393 if (host_err == -EWOULDBLOCK)
394 host_err = -ETIMEDOUT;
395 if (host_err) /* ENOMEM or EWOULDBLOCK */
396 goto out_nfserr;
397
398 host_err = get_write_access(inode); 385 host_err = get_write_access(inode);
399 if (host_err) 386 if (host_err)
400 goto out_nfserr; 387 goto out_nfserr;
@@ -435,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
435 422
436 err = nfserr_notsync; 423 err = nfserr_notsync;
437 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
425 host_err = nfsd_break_lease(inode);
426 if (host_err)
427 goto out_nfserr;
438 fh_lock(fhp); 428 fh_lock(fhp);
429
439 host_err = notify_change(dentry, iap); 430 host_err = notify_change(dentry, iap);
440 err = nfserrno(host_err); 431 err = nfserrno(host_err);
441 fh_unlock(fhp); 432 fh_unlock(fhp);
@@ -762,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
762 */ 753 */
763 if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) 754 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
764 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 755 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
765 if (host_err == -EWOULDBLOCK)
766 host_err = -ETIMEDOUT;
767 if (host_err) /* NOMEM or WOULDBLOCK */ 756 if (host_err) /* NOMEM or WOULDBLOCK */
768 goto out_nfserr; 757 goto out_nfserr;
769 758
@@ -855,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
855 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 844 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
856 struct page *page = buf->page; 845 struct page *page = buf->page;
857 size_t size; 846 size_t size;
858 int ret;
859
860 ret = buf->ops->confirm(pipe, buf);
861 if (unlikely(ret))
862 return ret;
863 847
864 size = sd->len; 848 size = sd->len;
865 849
@@ -889,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
889 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 873 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
890} 874}
891 875
892static inline int svc_msnfs(struct svc_fh *ffhp)
893{
894#ifdef MSNFS
895 return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
896#else
897 return 0;
898#endif
899}
900
901static __be32 876static __be32
902nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
903 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -910,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
910 err = nfserr_perm; 885 err = nfserr_perm;
911 inode = file->f_path.dentry->d_inode; 886 inode = file->f_path.dentry->d_inode;
912 887
913 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
914 goto out;
915
916 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 888 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
917 struct splice_desc sd = { 889 struct splice_desc sd = {
918 .len = 0, 890 .len = 0,
@@ -937,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
937 fsnotify_access(file); 909 fsnotify_access(file);
938 } else 910 } else
939 err = nfserrno(host_err); 911 err = nfserrno(host_err);
940out:
941 return err; 912 return err;
942} 913}
943 914
@@ -1002,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1002 int stable = *stablep; 973 int stable = *stablep;
1003 int use_wgather; 974 int use_wgather;
1004 975
1005#ifdef MSNFS
1006 err = nfserr_perm;
1007
1008 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1009 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
1010 goto out;
1011#endif
1012
1013 dentry = file->f_path.dentry; 976 dentry = file->f_path.dentry;
1014 inode = dentry->d_inode; 977 inode = dentry->d_inode;
1015 exp = fhp->fh_export; 978 exp = fhp->fh_export;
@@ -1060,7 +1023,6 @@ out_nfserr:
1060 err = 0; 1023 err = 0;
1061 else 1024 else
1062 err = nfserrno(host_err); 1025 err = nfserrno(host_err);
1063out:
1064 return err; 1026 return err;
1065} 1027}
1066 1028
@@ -1680,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1680 err = nfserrno(host_err); 1642 err = nfserrno(host_err);
1681 goto out_dput; 1643 goto out_dput;
1682 } 1644 }
1645 err = nfserr_noent;
1646 if (!dold->d_inode)
1647 goto out_drop_write;
1648 host_err = nfsd_break_lease(dold->d_inode);
1649 if (host_err)
1650 goto out_drop_write;
1683 host_err = vfs_link(dold, dirp, dnew); 1651 host_err = vfs_link(dold, dirp, dnew);
1684 if (!host_err) { 1652 if (!host_err) {
1685 err = nfserrno(commit_metadata(ffhp)); 1653 err = nfserrno(commit_metadata(ffhp));
@@ -1691,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1691 else 1659 else
1692 err = nfserrno(host_err); 1660 err = nfserrno(host_err);
1693 } 1661 }
1662out_drop_write:
1694 mnt_drop_write(tfhp->fh_export->ex_path.mnt); 1663 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1695out_dput: 1664out_dput:
1696 dput(dnew); 1665 dput(dnew);
@@ -1765,13 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1765 if (ndentry == trap) 1734 if (ndentry == trap)
1766 goto out_dput_new; 1735 goto out_dput_new;
1767 1736
1768 if (svc_msnfs(ffhp) &&
1769 ((atomic_read(&odentry->d_count) > 1)
1770 || (atomic_read(&ndentry->d_count) > 1))) {
1771 host_err = -EPERM;
1772 goto out_dput_new;
1773 }
1774
1775 host_err = -EXDEV; 1737 host_err = -EXDEV;
1776 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1738 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1777 goto out_dput_new; 1739 goto out_dput_new;
@@ -1779,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1779 if (host_err) 1741 if (host_err)
1780 goto out_dput_new; 1742 goto out_dput_new;
1781 1743
1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err)
1746 goto out_drop_write;
1782 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1783 if (!host_err) { 1748 if (!host_err) {
1784 host_err = commit_metadata(tfhp); 1749 host_err = commit_metadata(tfhp);
1785 if (!host_err) 1750 if (!host_err)
1786 host_err = commit_metadata(ffhp); 1751 host_err = commit_metadata(ffhp);
1787 } 1752 }
1788 1753out_drop_write:
1789 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1754 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1790
1791 out_dput_new: 1755 out_dput_new:
1792 dput(ndentry); 1756 dput(ndentry);
1793 out_dput_old: 1757 out_dput_old:
@@ -1850,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1850 if (host_err) 1814 if (host_err)
1851 goto out_nfserr; 1815 goto out_nfserr;
1852 1816
1853 if (type != S_IFDIR) { /* It's UNLINK */ 1817 host_err = nfsd_break_lease(rdentry->d_inode);
1854#ifdef MSNFS 1818 if (host_err)
1855 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1819 goto out_put;
1856 (atomic_read(&rdentry->d_count) > 1)) { 1820 if (type != S_IFDIR)
1857 host_err = -EPERM;
1858 } else
1859#endif
1860 host_err = vfs_unlink(dirp, rdentry); 1821 host_err = vfs_unlink(dirp, rdentry);
1861 } else { /* It's RMDIR */ 1822 else
1862 host_err = vfs_rmdir(dirp, rdentry); 1823 host_err = vfs_rmdir(dirp, rdentry);
1863 } 1824out_put:
1864
1865 dput(rdentry); 1825 dput(rdentry);
1866 1826
1867 if (!host_err) 1827 if (!host_err)
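
The recurring shape in this file's changes: break any lease on the victim inode via the new nfsd_break_lease() helper, then perform the vfs_* operation, letting -EWOULDBLOCK propagate instead of being rewritten to -ETIMEDOUT. A hedged sketch of that pattern; nfsd_unlink_file() is an invented wrapper for illustration:

static int nfsd_unlink_file(struct inode *dir, struct dentry *rdentry)
{
	int host_err;

	host_err = nfsd_break_lease(rdentry->d_inode);
	if (host_err)
		return host_err;	/* may be -EWOULDBLOCK; no -ETIMEDOUT rewrite */
	return vfs_unlink(dir, rdentry);
}
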
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae6..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
311 struct svc_export *si_exp; /* response */ 311 struct svc_export *si_exp; /* response */
312}; 312};
313 313
314struct nfsd4_secinfo_no_name {
315 u32 sin_style; /* request */
316 struct svc_export *sin_exp; /* response */
317};
318
314struct nfsd4_setattr { 319struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */ 320 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */ 321 u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
373 u32 cachethis; /* request */ 378 u32 cachethis; /* request */
374#if 0 379#if 0
375 u32 target_maxslots; /* response */ 380 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */ 381#endif /* not yet */
382 u32 status_flags; /* response */
378}; 383};
379 384
380struct nfsd4_destroy_session { 385struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
422 427
423 /* NFSv4.1 */ 428 /* NFSv4.1 */
424 struct nfsd4_exchange_id exchange_id; 429 struct nfsd4_exchange_id exchange_id;
430 struct nfsd4_bind_conn_to_session bind_conn_to_session;
425 struct nfsd4_create_session create_session; 431 struct nfsd4_create_session create_session;
426 struct nfsd4_destroy_session destroy_session; 432 struct nfsd4_destroy_session destroy_session;
427 struct nfsd4_sequence sequence; 433 struct nfsd4_sequence sequence;
@@ -484,18 +490,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
484static inline void 490static inline void
485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) 491set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
486{ 492{
487 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); 493 BUG_ON(!fhp->fh_pre_saved);
488 cinfo->atomic = 1; 494 cinfo->atomic = fhp->fh_post_saved;
489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); 495 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
490 if (cinfo->change_supported) { 496
491 cinfo->before_change = fhp->fh_pre_change; 497 cinfo->before_change = fhp->fh_pre_change;
492 cinfo->after_change = fhp->fh_post_change; 498 cinfo->after_change = fhp->fh_post_change;
493 } else { 499 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
494 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; 500 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
495 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; 501 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
496 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; 502 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
497 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; 503
498 }
499} 504}
500 505
501int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); 506int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
@@ -519,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
519 struct nfsd4_sequence *seq); 524 struct nfsd4_sequence *seq);
520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 525extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 526 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
527extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
522extern __be32 nfsd4_create_session(struct svc_rqst *, 528extern __be32 nfsd4_create_session(struct svc_rqst *,
523 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
524 struct nfsd4_create_session *); 530 struct nfsd4_create_session *);
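
The set_change_info() rewrite earlier in this file relaxes the old BUG_ON(): missing post-op attributes now simply make the change info non-atomic. A hedged illustration of the new semantics (the assignments are contrived for the example):

static void example_change_info(struct nfsd4_change_info *cinfo,
				struct svc_fh *fhp)
{
	fhp->fh_pre_saved = 1;		/* pre-op attrs were captured */
	fhp->fh_post_saved = 0;		/* post-op attrs were not */
	set_change_info(cinfo, fhp);	/* old code: BUG_ON; now: cinfo->atomic == 0 */
}
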
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc5..85c98737a146 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ 2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \ 3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ 4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o 5 ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3dbdc1d356bf..3ee67c67cc52 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
35 35
36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) 36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
37{ 37{
38 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 38 return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
39}
40
41static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
42 const char *fname, int err)
43{
44 struct inode *inode = bmap->b_inode;
45
46 if (err == -EINVAL) {
47 nilfs_error(inode->i_sb, fname,
48 "broken bmap (inode number=%lu)\n", inode->i_ino);
49 err = -EIO;
50 }
51 return err;
39} 52}
40 53
41/** 54/**
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
66 79
67 down_read(&bmap->b_sem); 80 down_read(&bmap->b_sem);
68 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 81 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
69 if (ret < 0) 82 if (ret < 0) {
83 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
70 goto out; 84 goto out;
85 }
71 if (NILFS_BMAP_USE_VBN(bmap)) { 86 if (NILFS_BMAP_USE_VBN(bmap)) {
72 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, 87 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
73 &blocknr); 88 &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
88 down_read(&bmap->b_sem); 103 down_read(&bmap->b_sem);
89 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); 104 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
90 up_read(&bmap->b_sem); 105 up_read(&bmap->b_sem);
91 return ret; 106
107 return nilfs_bmap_convert_error(bmap, __func__, ret);
92} 108}
93 109
94static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 110static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
144 down_write(&bmap->b_sem); 160 down_write(&bmap->b_sem);
145 ret = nilfs_bmap_do_insert(bmap, key, rec); 161 ret = nilfs_bmap_do_insert(bmap, key, rec);
146 up_write(&bmap->b_sem); 162 up_write(&bmap->b_sem);
147 return ret; 163
164 return nilfs_bmap_convert_error(bmap, __func__, ret);
148} 165}
149 166
150static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) 167static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
180 197
181 down_read(&bmap->b_sem); 198 down_read(&bmap->b_sem);
182 ret = bmap->b_ops->bop_last_key(bmap, &lastkey); 199 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
183 if (!ret)
184 *key = lastkey;
185 up_read(&bmap->b_sem); 200 up_read(&bmap->b_sem);
201
202 if (ret < 0)
203 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
204 else
205 *key = lastkey;
186 return ret; 206 return ret;
187} 207}
188 208
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
210 down_write(&bmap->b_sem); 230 down_write(&bmap->b_sem);
211 ret = nilfs_bmap_do_delete(bmap, key); 231 ret = nilfs_bmap_do_delete(bmap, key);
212 up_write(&bmap->b_sem); 232 up_write(&bmap->b_sem);
213 return ret; 233
234 return nilfs_bmap_convert_error(bmap, __func__, ret);
214} 235}
215 236
216static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) 237static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
261 down_write(&bmap->b_sem); 282 down_write(&bmap->b_sem);
262 ret = nilfs_bmap_do_truncate(bmap, key); 283 ret = nilfs_bmap_do_truncate(bmap, key);
263 up_write(&bmap->b_sem); 284 up_write(&bmap->b_sem);
264 return ret; 285
286 return nilfs_bmap_convert_error(bmap, __func__, ret);
265} 287}
266 288
267/** 289/**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
300 down_write(&bmap->b_sem); 322 down_write(&bmap->b_sem);
301 ret = bmap->b_ops->bop_propagate(bmap, bh); 323 ret = bmap->b_ops->bop_propagate(bmap, bh);
302 up_write(&bmap->b_sem); 324 up_write(&bmap->b_sem);
303 return ret; 325
326 return nilfs_bmap_convert_error(bmap, __func__, ret);
304} 327}
305 328
306/** 329/**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
344 down_write(&bmap->b_sem); 367 down_write(&bmap->b_sem);
345 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); 368 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
346 up_write(&bmap->b_sem); 369 up_write(&bmap->b_sem);
347 return ret; 370
371 return nilfs_bmap_convert_error(bmap, __func__, ret);
348} 372}
349 373
350/** 374/**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
373 down_write(&bmap->b_sem); 397 down_write(&bmap->b_sem);
374 ret = bmap->b_ops->bop_mark(bmap, key, level); 398 ret = bmap->b_ops->bop_mark(bmap, key, level);
375 up_write(&bmap->b_sem); 399 up_write(&bmap->b_sem);
376 return ret; 400
401 return nilfs_bmap_convert_error(bmap, __func__, ret);
377} 402}
378 403
379/** 404/**
@@ -533,18 +558,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 nilfs_btree_init_gc(bmap); 558 nilfs_btree_init_gc(bmap);
534} 559}
535 560
536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 561void nilfs_bmap_save(const struct nilfs_bmap *bmap,
562 struct nilfs_bmap_store *store)
537{ 563{
538 memcpy(gcbmap, bmap, sizeof(*bmap)); 564 memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
539 init_rwsem(&gcbmap->b_sem); 565 store->last_allocated_key = bmap->b_last_allocated_key;
540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 566 store->last_allocated_ptr = bmap->b_last_allocated_ptr;
541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; 567 store->state = bmap->b_state;
542} 568}
543 569
544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 570void nilfs_bmap_restore(struct nilfs_bmap *bmap,
571 const struct nilfs_bmap_store *store)
545{ 572{
546 memcpy(bmap, gcbmap, sizeof(*bmap)); 573 memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
547 init_rwsem(&bmap->b_sem); 574 bmap->b_last_allocated_key = store->last_allocated_key;
548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 575 bmap->b_last_allocated_ptr = store->last_allocated_ptr;
549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; 576 bmap->b_state = store->state;
550} 577}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a20569b19929..bde1c0aa2e15 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -135,6 +135,12 @@ struct nilfs_bmap {
135/* state */ 135/* state */
136#define NILFS_BMAP_DIRTY 0x00000001 136#define NILFS_BMAP_DIRTY 0x00000001
137 137
138struct nilfs_bmap_store {
139 __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
140 __u64 last_allocated_key;
141 __u64 last_allocated_ptr;
142 int state;
143};
138 144
139int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); 145int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
140int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); 146int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -153,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
153int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); 159int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
154 160
155void nilfs_bmap_init_gc(struct nilfs_bmap *); 161void nilfs_bmap_init_gc(struct nilfs_bmap *);
156void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
157void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
158 162
163void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
164void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
159 165
160static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, 166static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
161 __u64 *ptr) 167 __u64 *ptr)
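
nilfs_bmap_save()/nilfs_bmap_restore() replace the old gcdat copy helpers with an explicit snapshot of the raw bmap data and allocation cursors. A minimal hedged sketch of the save/rollback pairing they enable; nilfs_bmap_try_op() is an invented name, roughly mirroring what the shadow-map code does around GC:

static int nilfs_bmap_try_op(struct nilfs_bmap *bmap,
			     int (*op)(struct nilfs_bmap *))
{
	struct nilfs_bmap_store store;
	int err;

	nilfs_bmap_save(bmap, &store);	/* snapshot b_u.u_data, cursors, state */
	err = op(bmap);
	if (err)
		nilfs_bmap_restore(bmap, &store);	/* roll back on failure */
	return err;
}
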
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f78ab1044d1d..388e9e8f5286 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
37 37
38void nilfs_btnode_cache_init_once(struct address_space *btnc) 38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{ 39{
40 memset(btnc, 0, sizeof(*btnc)); 40 nilfs_mapping_init_once(btnc);
41 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
42 spin_lock_init(&btnc->tree_lock);
43 INIT_LIST_HEAD(&btnc->private_list);
44 spin_lock_init(&btnc->private_lock);
45
46 spin_lock_init(&btnc->i_mmap_lock);
47 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
48 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
49} 41}
50 42
51static const struct address_space_operations def_btnode_aops = { 43static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
55void nilfs_btnode_cache_init(struct address_space *btnc, 47void nilfs_btnode_cache_init(struct address_space *btnc,
56 struct backing_dev_info *bdi) 48 struct backing_dev_info *bdi)
57{ 49{
58 btnc->host = NULL; /* can safely set to host inode ? */ 50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
59 btnc->flags = 0;
60 mapping_set_gfp_mask(btnc, GFP_NOFS);
61 btnc->assoc_mapping = NULL;
62 btnc->backing_dev_info = bdi;
63 btnc->a_ops = &def_btnode_aops;
64} 51}
65 52
66void nilfs_btnode_cache_clear(struct address_space *btnc) 53void nilfs_btnode_cache_clear(struct address_space *btnc)
@@ -117,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
117 if (pblocknr == 0) { 104 if (pblocknr == 0) {
118 pblocknr = blocknr; 105 pblocknr = blocknr;
119 if (inode->i_ino != NILFS_DAT_INO) { 106 if (inode->i_ino != NILFS_DAT_INO) {
120 struct inode *dat = 107 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
121 nilfs_dat_inode(NILFS_I_NILFS(inode));
122 108
123 /* blocknr is a virtual block number */ 109 /* blocknr is a virtual block number */
124 err = nilfs_dat_translate(dat, blocknr, &pblocknr); 110 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db63..5ff15a8a1024 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
863 */ 863 */
864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) 864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
865{ 865{
866 struct the_nilfs *nilfs;
867 int ret; 866 int ret;
868 867
869 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
870
871 switch (mode) { 868 switch (mode) {
872 case NILFS_CHECKPOINT: 869 case NILFS_CHECKPOINT:
873 /* 870 if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
874 * Check for protecting existing snapshot mounts: 871 /*
875 * ns_mount_mutex is used to make this operation atomic and 872 * Current implementation does not have to protect
876 * exclusive with a new mount job. Though it doesn't cover 873 * plain read-only mounts since they are exclusive
877 * umount, it's enough for the purpose. 874 * with a read/write mount and are protected from the
878 */ 875 * cleaner.
879 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 876 */
880 /* Current implementation does not have to protect
881 plain read-only mounts since they are exclusive
882 with a read/write mount and are protected from the
883 cleaner. */
884 ret = -EBUSY; 877 ret = -EBUSY;
885 } else 878 else
886 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
887 return ret; 880 return ret;
888 case NILFS_SNAPSHOT: 881 case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
933} 926}
934 927
935/** 928/**
936 * nilfs_cpfile_read - read cpfile inode 929 * nilfs_cpfile_read - read or get cpfile inode
937 * @cpfile: cpfile inode 930 * @sb: super block instance
938 * @raw_inode: on-disk cpfile inode
939 */
940int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
941{
942 return nilfs_read_inode_common(cpfile, raw_inode);
943}
944
945/**
946 * nilfs_cpfile_new - create cpfile
947 * @nilfs: nilfs object
948 * @cpsize: size of a checkpoint entry 931 * @cpsize: size of a checkpoint entry
932 * @raw_inode: on-disk cpfile inode
933 * @inodep: buffer to store the inode
949 */ 934 */
950struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize) 935int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
936 struct nilfs_inode *raw_inode, struct inode **inodep)
951{ 937{
952 struct inode *cpfile; 938 struct inode *cpfile;
939 int err;
940
941 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
942 if (unlikely(!cpfile))
943 return -ENOMEM;
944 if (!(cpfile->i_state & I_NEW))
945 goto out;
946
947 err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
948 if (err)
949 goto failed;
953 950
954 cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0); 951 nilfs_mdt_set_entry_size(cpfile, cpsize,
955 if (cpfile) 952 sizeof(struct nilfs_cpfile_header));
956 nilfs_mdt_set_entry_size(cpfile, cpsize, 953
957 sizeof(struct nilfs_cpfile_header)); 954 err = nilfs_read_inode_common(cpfile, raw_inode);
958 return cpfile; 955 if (err)
956 goto failed;
957
958 unlock_new_inode(cpfile);
959 out:
960 *inodep = cpfile;
961 return 0;
962 failed:
963 iget_failed(cpfile);
964 return err;
959} 965}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab43..a242b9a314f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, 40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
41 size_t); 41 size_t);
42 42
43int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode); 43int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
44struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize); 44 struct nilfs_inode *raw_inode, struct inode **inodep);
45 45
46#endif /* _NILFS_CPFILE_H */ 46#endif /* _NILFS_CPFILE_H */
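
nilfs_cpfile_read() is the first of three constructors (cpfile, dat, ifile) converted from "new" to the same "read or get" shape: look the inode up in the icache, initialize it only if it is I_NEW, publish it with unlock_new_inode(), and tear it down with iget_failed() on error. A hedged distillation of that shared pattern; nilfs_mdt_read_or_get() and its setup callback are invented for illustration:

static int nilfs_mdt_read_or_get(struct super_block *sb, ino_t ino,
				 struct nilfs_inode *raw_inode,
				 int (*setup)(struct inode *),
				 struct inode **inodep)
{
	struct inode *inode;
	int err;

	inode = nilfs_iget_locked(sb, NULL, ino);	/* ifile passes a root instead */
	if (unlikely(!inode))
		return -ENOMEM;
	if (!(inode->i_state & I_NEW))
		goto out;			/* cached and already initialized */

	err = setup(inode);			/* per-file nilfs_mdt_init() etc. */
	if (!err)
		err = nilfs_read_inode_common(inode, raw_inode);
	if (err)
		goto failed;
	unlock_new_inode(inode);
 out:
	*inodep = inode;
	return 0;
 failed:
	iget_failed(inode);			/* unhashes and wakes waiters */
	return err;
}
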
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 013146755683..59e5fe742f7b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
36struct nilfs_dat_info { 36struct nilfs_dat_info {
37 struct nilfs_mdt_info mi; 37 struct nilfs_mdt_info mi;
38 struct nilfs_palloc_cache palloc_cache; 38 struct nilfs_palloc_cache palloc_cache;
39 struct nilfs_shadow_map shadow;
39}; 40};
40 41
41static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) 42static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
102 nilfs_palloc_abort_alloc_entry(dat, req); 103 nilfs_palloc_abort_alloc_entry(dat, req);
103} 104}
104 105
105void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) 106static void nilfs_dat_commit_free(struct inode *dat,
107 struct nilfs_palloc_req *req)
106{ 108{
107 struct nilfs_dat_entry *entry; 109 struct nilfs_dat_entry *entry;
108 void *kaddr; 110 void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
327 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); 329 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
328 if (ret < 0) 330 if (ret < 0)
329 return ret; 331 return ret;
332
333 /*
334 * The given disk block number (blocknr) is not yet written to
335 * the device at this point.
336 *
337 * To prevent nilfs_dat_translate() from returning the
338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */
341 if (!buffer_nilfs_redirected(entry_bh)) {
342 ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
343 if (ret) {
344 brelse(entry_bh);
345 return ret;
346 }
347 }
348
330 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 349 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
331 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 350 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
332 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { 351 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
371 */ 390 */
372int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) 391int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
373{ 392{
374 struct buffer_head *entry_bh; 393 struct buffer_head *entry_bh, *bh;
375 struct nilfs_dat_entry *entry; 394 struct nilfs_dat_entry *entry;
376 sector_t blocknr; 395 sector_t blocknr;
377 void *kaddr; 396 void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
381 if (ret < 0) 400 if (ret < 0)
382 return ret; 401 return ret;
383 402
403 if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
404 bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
405 if (bh) {
406 WARN_ON(!buffer_uptodate(bh));
407 brelse(entry_bh);
408 entry_bh = bh;
409 }
410 }
411
384 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 412 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
385 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 413 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
386 blocknr = le64_to_cpu(entry->de_blocknr); 414 blocknr = le64_to_cpu(entry->de_blocknr);
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
436} 464}
437 465
438/** 466/**
439 * nilfs_dat_read - read dat inode 467 * nilfs_dat_read - read or get dat inode
440 * @dat: dat inode 468 * @sb: super block instance
441 * @raw_inode: on-disk dat inode
442 */
443int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
444{
445 return nilfs_read_inode_common(dat, raw_inode);
446}
447
448/**
449 * nilfs_dat_new - create dat file
450 * @nilfs: nilfs object
451 * @entry_size: size of a dat entry 469 * @entry_size: size of a dat entry
470 * @raw_inode: on-disk dat inode
471 * @inodep: buffer to store the inode
452 */ 472 */
453struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size) 473int nilfs_dat_read(struct super_block *sb, size_t entry_size,
474 struct nilfs_inode *raw_inode, struct inode **inodep)
454{ 475{
455 static struct lock_class_key dat_lock_key; 476 static struct lock_class_key dat_lock_key;
456 struct inode *dat; 477 struct inode *dat;
457 struct nilfs_dat_info *di; 478 struct nilfs_dat_info *di;
458 int err; 479 int err;
459 480
460 dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)); 481 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
461 if (dat) { 482 if (unlikely(!dat))
462 err = nilfs_palloc_init_blockgroup(dat, entry_size); 483 return -ENOMEM;
463 if (unlikely(err)) { 484 if (!(dat->i_state & I_NEW))
464 nilfs_mdt_destroy(dat); 485 goto out;
465 return NULL;
466 }
467 486
468 di = NILFS_DAT_I(dat); 487 err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
469 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); 488 if (err)
470 nilfs_palloc_setup_cache(dat, &di->palloc_cache); 489 goto failed;
471 } 490
472 return dat; 491 err = nilfs_palloc_init_blockgroup(dat, entry_size);
492 if (err)
493 goto failed;
494
495 di = NILFS_DAT_I(dat);
496 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
497 nilfs_palloc_setup_cache(dat, &di->palloc_cache);
498 nilfs_mdt_setup_shadow_map(dat, &di->shadow);
499
500 err = nilfs_read_inode_common(dat, raw_inode);
501 if (err)
502 goto failed;
503
504 unlock_new_inode(dat);
505 out:
506 *inodep = dat;
507 return 0;
508 failed:
509 iget_failed(dat);
510 return err;
473} 511}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0efe..cbd8e9732503 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
53int nilfs_dat_move(struct inode *, __u64, sector_t); 53int nilfs_dat_move(struct inode *, __u64, sector_t);
54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); 54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
55 55
56int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); 56int nilfs_dat_read(struct super_block *sb, size_t entry_size,
57struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); 57 struct nilfs_inode *raw_inode, struct inode **inodep);
58 58
59#endif /* _NILFS_DAT_H */ 59#endif /* _NILFS_DAT_H */
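
The frozen-buffer logic added to nilfs_dat_move()/nilfs_dat_translate() exists so that non-GC readers keep resolving a virtual block to its committed disk address even after GC has rewritten the DAT entry in memory. A hedged sketch of the ordering that buys (illustrative only):

static void dat_move_visibility_example(struct inode *dat, __u64 vblocknr,
					sector_t new_blocknr)
{
	sector_t blocknr;

	if (nilfs_dat_move(dat, vblocknr, new_blocknr))	/* freezes + redirects */
		return;
	if (!nilfs_dat_translate(dat, vblocknr, &blocknr))
		WARN_ON(blocknr == new_blocknr);	/* outside GC: still the
							   old, committed number */
}
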
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f6..9d45773b79e6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
91 unsigned from, unsigned to) 91 unsigned from, unsigned to)
92{ 92{
93 struct inode *dir = mapping->host; 93 struct inode *dir = mapping->host;
94 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
95 loff_t pos = page_offset(page) + from; 94 loff_t pos = page_offset(page) + from;
96 unsigned len = to - from; 95 unsigned len = to - from;
97 unsigned nr_dirty, copied; 96 unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
103 i_size_write(dir, pos + copied); 102 i_size_write(dir, pos + copied);
104 if (IS_DIRSYNC(dir)) 103 if (IS_DIRSYNC(dir))
105 nilfs_set_transaction_flag(NILFS_TI_SYNC); 104 nilfs_set_transaction_flag(NILFS_TI_SYNC);
106 err = nilfs_set_file_dirty(sbi, dir, nr_dirty); 105 err = nilfs_set_file_dirty(dir, nr_dirty);
107 WARN_ON(err); /* do not happen */ 106 WARN_ON(err); /* do not happen */
108 unlock_page(page); 107 unlock_page(page);
109} 108}
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 000000000000..a71cc412b651
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
1#ifndef NILFS_EXPORT_H
2#define NILFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations nilfs_export_ops;
7
8struct nilfs_fid {
9 u64 cno;
10 u64 ino;
11 u32 gen;
12
13 u32 parent_gen;
14 u64 parent_ino;
15} __attribute__ ((packed));
16
17#endif
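
struct nilfs_fid carries the checkpoint number alongside the usual inode number and generation, so snapshot inodes stay resolvable through NFS file handles. A hedged sketch of filling one; nilfs_fid_fill() is invented, and exportfs file-handle lengths are counted in 32-bit words:

static int nilfs_fid_fill(struct nilfs_fid *fid, const struct inode *inode,
			  __u64 cno)
{
	fid->cno = cno;			/* checkpoint the dentry belongs to */
	fid->ino = inode->i_ino;
	fid->gen = inode->i_generation;
	/* length of the non-parent part, in 32-bit words */
	return offsetof(struct nilfs_fid, parent_gen) / 4;
}
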
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6fc..2f560c9fb808 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
158 .fiemap = nilfs_fiemap,
158}; 159};
159 160
160/* end of file */ 161/* end of file */
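
With ->fiemap wired up, nilfs2 regular files now answer the standard FS_IOC_FIEMAP ioctl. A hedged userspace sketch (generic kernel ABI, not part of this patch; passing fm_extent_count == 0 asks only for the extent count):

#include <linux/fiemap.h>
#include <linux/fs.h>
#include <string.h>
#include <sys/ioctl.h>

static int count_extents(int fd)
{
	struct fiemap fm;

	memset(&fm, 0, sizeof(fm));
	fm.fm_length = ~0ULL;		/* map the whole file */
	fm.fm_extent_count = 0;		/* count extents, return none */

	if (ioctl(fd, FS_IOC_FIEMAP, &fm) < 0)
		return -1;
	return fm.fm_mapped_extents;
}
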
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index 84a45d1d5464..000000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_palloc_clear_cache(dat);
65 nilfs_palloc_clear_cache(gcdat);
66 nilfs_clear_dirty_pages(mapping);
67 nilfs_copy_back_pages(mapping, gmapping);
68 /* note: mdt dirty flags should be cleared by segctor. */
69
70 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
71 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
72
73 up_write(&NILFS_MDT(dat)->mi_sem);
74}
75
76void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
77{
78 struct inode *gcdat = nilfs->ns_gc_dat;
79 struct nilfs_inode_info *gii = NILFS_I(gcdat);
80
81 gcdat->i_state = I_FREEING | I_CLEAR;
82 gii->i_flags = 0;
83
84 nilfs_palloc_clear_cache(gcdat);
85 truncate_inode_pages(gcdat->i_mapping, 0);
86 truncate_inode_pages(&gii->i_btnode_cache, 0);
87}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bed3a783129b..caf9a6a3fb54 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
28 * gcinodes), and this file provides lookup function of the dummy 28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes will be released each 31 * Buffers and pages held by the dummy inodes will be released each
39 * time after they are copied to a new log. Dirty blocks made on the 32 * time after they are copied to a new log. Dirty blocks made on the
40 * current generation and the blocks to be moved by GC never overlap 33 * current generation and the blocks to be moved by GC never overlap
@@ -175,125 +168,37 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
175 } 168 }
176 nilfs_btnode_mark_dirty(bh); 169 nilfs_btnode_mark_dirty(bh);
177 } else { 170 } else {
178 nilfs_mdt_mark_buffer_dirty(bh); 171 nilfs_mark_buffer_dirty(bh);
179 } 172 }
180 return 0; 173 return 0;
181} 174}
182 175
183/* 176int nilfs_init_gcinode(struct inode *inode)
184 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
185 * @nilfs - the_nilfs
186 *
187 * Return Value: On success, 0.
188 * On error, a negative error code is returned.
189 */
190int nilfs_init_gccache(struct the_nilfs *nilfs)
191{ 177{
192 int loop; 178 struct nilfs_inode_info *ii = NILFS_I(inode);
193
194 BUG_ON(nilfs->ns_gc_inodes_h);
195
196 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
197
198 nilfs->ns_gc_inodes_h =
199 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
200 GFP_NOFS);
201 if (nilfs->ns_gc_inodes_h == NULL)
202 return -ENOMEM;
203
204 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
205 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
206 return 0;
207}
208
209/*
210 * nilfs_destroy_gccache() - free gc_inode hash table
211 * @nilfs - the nilfs
212 */
213void nilfs_destroy_gccache(struct the_nilfs *nilfs)
214{
215 if (nilfs->ns_gc_inodes_h) {
216 nilfs_remove_all_gcinode(nilfs);
217 kfree(nilfs->ns_gc_inodes_h);
218 nilfs->ns_gc_inodes_h = NULL;
219 }
220}
221
222static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
223 __u64 cno)
224{
225 struct inode *inode;
226 struct nilfs_inode_info *ii;
227
228 inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
229 if (!inode)
230 return NULL;
231 179
232 inode->i_op = NULL; 180 inode->i_mode = S_IFREG;
233 inode->i_fop = NULL; 181 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
234 inode->i_mapping->a_ops = &def_gcinode_aops; 182 inode->i_mapping->a_ops = &def_gcinode_aops;
183 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
235 184
236 ii = NILFS_I(inode);
237 ii->i_cno = cno;
238 ii->i_flags = 0; 185 ii->i_flags = 0;
239 ii->i_state = 1 << NILFS_I_GCINODE;
240 ii->i_bh = NULL;
241 nilfs_bmap_init_gc(ii->i_bmap); 186 nilfs_bmap_init_gc(ii->i_bmap);
242 187
243 return inode; 188 return 0;
244}
245
246static unsigned long ihash(ino_t ino, __u64 cno)
247{
248 return hash_long((unsigned long)((ino << 2) + cno),
249 NILFS_GCINODE_HASH_BITS);
250}
251
252/*
253 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
254 */
255struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
256{
257 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
258 struct hlist_node *node;
259 struct inode *inode;
260
261 hlist_for_each_entry(inode, node, head, i_hash) {
262 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
263 return inode;
264 }
265
266 inode = alloc_gcinode(nilfs, ino, cno);
267 if (likely(inode)) {
268 hlist_add_head(&inode->i_hash, head);
269 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
270 }
271 return inode;
272}
273
274/*
275 * nilfs_clear_gcinode() - clear and free a gc inode
276 */
277void nilfs_clear_gcinode(struct inode *inode)
278{
279 nilfs_mdt_destroy(inode);
280} 189}
281 190
282/* 191/**
283 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs 192 * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
284 */ 193 */
285void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) 194void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
286{ 195{
287 struct hlist_head *head = nilfs->ns_gc_inodes_h; 196 struct list_head *head = &nilfs->ns_gc_inodes;
288 struct hlist_node *node, *n; 197 struct nilfs_inode_info *ii;
289 struct inode *inode;
290 int loop;
291 198
292 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { 199 while (!list_empty(head)) {
293 hlist_for_each_entry_safe(inode, node, n, head, i_hash) { 200 ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
294 hlist_del_init(&inode->i_hash); 201 list_del_init(&ii->i_dirty);
295 list_del_init(&NILFS_I(inode)->i_dirty); 202 iput(&ii->vfs_inode);
296 nilfs_clear_gcinode(inode); /* might sleep */
297 }
298 } 203 }
299} 204}
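
With the private hash table gone, GC dummy inodes are found or created through the regular inode cache, keyed by (ino, cno, for_gc); the lookup helpers appear in the inode.c hunks below. A hedged sketch of the likely constructor shape — nilfs_iget_for_gc() is the name this suggests, following the same I_NEW/iget_failed() convention as the other converted constructors:

static struct inode *nilfs_iget_for_gc(struct super_block *sb,
				       unsigned long ino, __u64 cno)
{
	struct nilfs_iget_args args = {
		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
	};
	struct inode *inode;
	int err;

	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
	if (unlikely(!inode))
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* already set up by a prior GC pass */

	err = nilfs_init_gcinode(inode);	/* from the hunk above */
	if (unlikely(err)) {
		iget_failed(inode);
		return ERR_PTR(err);
	}
	unlock_new_inode(inode);
	return inode;
}
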
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8f..bfc73d3a30ed 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,37 +149,53 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
149 } 149 }
150 150
151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); 151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
152 if (unlikely(err)) { 152 if (unlikely(err))
153 if (err == -EINVAL) 153 nilfs_warning(sb, __func__, "unable to read inode: %lu",
154 nilfs_error(sb, __func__, "ifile is broken"); 154 (unsigned long) ino);
155 else
156 nilfs_warning(sb, __func__,
157 "unable to read inode: %lu",
158 (unsigned long) ino);
159 }
160 return err; 155 return err;
161} 156}
162 157
163/** 158/**
164 * nilfs_ifile_new - create inode file 159 * nilfs_ifile_read - read or get ifile inode
165 * @sbi: nilfs_sb_info struct 160 * @sb: super block instance
161 * @root: root object
166 * @inode_size: size of an inode 162 * @inode_size: size of an inode
163 * @raw_inode: on-disk ifile inode
164 * @inodep: buffer to store the inode
167 */ 165 */
168struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) 166int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
167 size_t inode_size, struct nilfs_inode *raw_inode,
168 struct inode **inodep)
169{ 169{
170 struct inode *ifile; 170 struct inode *ifile;
171 int err; 171 int err;
172 172
173 ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, 173 ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
174 sizeof(struct nilfs_ifile_info)); 174 if (unlikely(!ifile))
175 if (ifile) { 175 return -ENOMEM;
176 err = nilfs_palloc_init_blockgroup(ifile, inode_size); 176 if (!(ifile->i_state & I_NEW))
177 if (unlikely(err)) { 177 goto out;
178 nilfs_mdt_destroy(ifile); 178
179 return NULL; 179 err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
180 } 180 sizeof(struct nilfs_ifile_info));
181 nilfs_palloc_setup_cache(ifile, 181 if (err)
182 &NILFS_IFILE_I(ifile)->palloc_cache); 182 goto failed;
183 } 183
184 return ifile; 184 err = nilfs_palloc_init_blockgroup(ifile, inode_size);
185 if (err)
186 goto failed;
187
188 nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
189
190 err = nilfs_read_inode_common(ifile, raw_inode);
191 if (err)
192 goto failed;
193
194 unlock_new_inode(ifile);
195 out:
196 *inodep = ifile;
197 return 0;
198 failed:
199 iget_failed(ifile);
200 return err;
185} 201}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f2..59b6f2b51df6 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
49int nilfs_ifile_delete_inode(struct inode *, ino_t); 49int nilfs_ifile_delete_inode(struct inode *, ino_t);
50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); 50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
51 51
52struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); 52int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
53 size_t inode_size, struct nilfs_inode *raw_inode,
54 struct inode **inodep);
53 55
54#endif /* _NILFS_IFILE_H */ 56#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..2fd440d8d6b8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,12 @@
34#include "cpfile.h" 34#include "cpfile.h"
35#include "ifile.h" 35#include "ifile.h"
36 36
37struct nilfs_iget_args {
38 u64 ino;
39 __u64 cno;
40 struct nilfs_root *root;
41 int for_gc;
42};
37 43
38/** 44/**
39 * nilfs_get_block() - get a file block on the filesystem (callback function) 45 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -52,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
52 struct nilfs_inode_info *ii = NILFS_I(inode); 58 struct nilfs_inode_info *ii = NILFS_I(inode);
53 __u64 blknum = 0; 59 __u64 blknum = 0;
54 int err = 0, ret; 60 int err = 0, ret;
55 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); 61 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
56 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
57 63
58 down_read(&NILFS_MDT(dat)->mi_sem); 64 down_read(&NILFS_MDT(dat)->mi_sem);
@@ -90,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
90 inode->i_ino, 96 inode->i_ino,
91 (unsigned long long)blkoff); 97 (unsigned long long)blkoff);
92 err = 0; 98 err = 0;
93 } else if (err == -EINVAL) {
94 nilfs_error(inode->i_sb, __func__,
95 "broken bmap (inode=%lu)\n",
96 inode->i_ino);
97 err = -EIO;
98 } 99 }
99 nilfs_transaction_abort(inode->i_sb); 100 nilfs_transaction_abort(inode->i_sb);
100 goto out; 101 goto out;
@@ -103,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
103 nilfs_transaction_commit(inode->i_sb); /* never fails */ 104 nilfs_transaction_commit(inode->i_sb); /* never fails */
104 /* Error handling should be detailed */ 105 /* Error handling should be detailed */
105 set_buffer_new(bh_result); 106 set_buffer_new(bh_result);
107 set_buffer_delay(bh_result);
106 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 108 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
107 to proper value */ 109 to proper value */
108 } else if (ret == -ENOENT) { 110 } else if (ret == -ENOENT) {
@@ -179,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
179 181
180 if (ret) { 182 if (ret) {
181 struct inode *inode = page->mapping->host; 183 struct inode *inode = page->mapping->host;
182 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
183 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 184 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
184 185
185 nilfs_set_file_dirty(sbi, inode, nr_dirty); 186 nilfs_set_file_dirty(inode, nr_dirty);
186 } 187 }
187 return ret; 188 return ret;
188} 189}
@@ -223,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
223 start + copied); 224 start + copied);
224 copied = generic_write_end(file, mapping, pos, len, copied, page, 225 copied = generic_write_end(file, mapping, pos, len, copied, page,
225 fsdata); 226 fsdata);
226 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 227 nilfs_set_file_dirty(inode, nr_dirty);
227 err = nilfs_transaction_commit(inode->i_sb); 228 err = nilfs_transaction_commit(inode->i_sb);
228 return err ? : copied; 229 return err ? : copied;
229} 230}
@@ -279,6 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
279 struct nilfs_sb_info *sbi = NILFS_SB(sb); 280 struct nilfs_sb_info *sbi = NILFS_SB(sb);
280 struct inode *inode; 281 struct inode *inode;
281 struct nilfs_inode_info *ii; 282 struct nilfs_inode_info *ii;
283 struct nilfs_root *root;
282 int err = -ENOMEM; 284 int err = -ENOMEM;
283 ino_t ino; 285 ino_t ino;
284 286
@@ -289,15 +291,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
289 mapping_set_gfp_mask(inode->i_mapping, 291 mapping_set_gfp_mask(inode->i_mapping,
290 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 292 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
291 293
294 root = NILFS_I(dir)->i_root;
292 ii = NILFS_I(inode); 295 ii = NILFS_I(inode);
293 ii->i_state = 1 << NILFS_I_NEW; 296 ii->i_state = 1 << NILFS_I_NEW;
297 ii->i_root = root;
294 298
295 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); 299 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
296 if (unlikely(err)) 300 if (unlikely(err))
297 goto failed_ifile_create_inode; 301 goto failed_ifile_create_inode;
298 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 302 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
299 303
300 atomic_inc(&sbi->s_inodes_count); 304 atomic_inc(&root->inodes_count);
301 inode_init_owner(inode, dir, mode); 305 inode_init_owner(inode, dir, mode);
302 inode->i_ino = ino; 306 inode->i_ino = ino;
303 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 307 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -320,7 +324,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
320 /* ii->i_file_acl = 0; */ 324 /* ii->i_file_acl = 0; */
321 /* ii->i_dir_acl = 0; */ 325 /* ii->i_dir_acl = 0; */
322 ii->i_dir_start_lookup = 0; 326 ii->i_dir_start_lookup = 0;
323 ii->i_cno = 0;
324 nilfs_set_inode_flags(inode); 327 nilfs_set_inode_flags(inode);
325 spin_lock(&sbi->s_next_gen_lock); 328 spin_lock(&sbi->s_next_gen_lock);
326 inode->i_generation = sbi->s_next_generation++; 329 inode->i_generation = sbi->s_next_generation++;
@@ -350,16 +353,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
350 return ERR_PTR(err); 353 return ERR_PTR(err);
351} 354}
352 355
353void nilfs_free_inode(struct inode *inode)
354{
355 struct super_block *sb = inode->i_sb;
356 struct nilfs_sb_info *sbi = NILFS_SB(sb);
357
358 /* XXX: check error code? Is there any thing I can do? */
359 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
360 atomic_dec(&sbi->s_inodes_count);
361}
362
363void nilfs_set_inode_flags(struct inode *inode) 356void nilfs_set_inode_flags(struct inode *inode)
364{ 357{
365 unsigned int flags = NILFS_I(inode)->i_flags; 358 unsigned int flags = NILFS_I(inode)->i_flags;
@@ -410,7 +403,6 @@ int nilfs_read_inode_common(struct inode *inode,
410 0 : le32_to_cpu(raw_inode->i_dir_acl); 403 0 : le32_to_cpu(raw_inode->i_dir_acl);
411#endif 404#endif
412 ii->i_dir_start_lookup = 0; 405 ii->i_dir_start_lookup = 0;
413 ii->i_cno = 0;
414 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
415 407
416 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -424,21 +416,21 @@ int nilfs_read_inode_common(struct inode *inode,
424 return 0; 416 return 0;
425} 417}
426 418
427static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, 419static int __nilfs_read_inode(struct super_block *sb,
420 struct nilfs_root *root, unsigned long ino,
428 struct inode *inode) 421 struct inode *inode)
429{ 422{
430 struct nilfs_sb_info *sbi = NILFS_SB(sb); 423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
431 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
432 struct buffer_head *bh; 424 struct buffer_head *bh;
433 struct nilfs_inode *raw_inode; 425 struct nilfs_inode *raw_inode;
434 int err; 426 int err;
435 427
436 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 428 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
437 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); 429 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
438 if (unlikely(err)) 430 if (unlikely(err))
439 goto bad_inode; 431 goto bad_inode;
440 432
441 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 433 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
442 434
443 err = nilfs_read_inode_common(inode, raw_inode); 435 err = nilfs_read_inode_common(inode, raw_inode);
444 if (err) 436 if (err)
@@ -461,33 +453,110 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
461 inode, inode->i_mode, 453 inode, inode->i_mode,
462 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 454 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
463 } 455 }
464 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 456 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
465 brelse(bh); 457 brelse(bh);
466 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 458 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
467 nilfs_set_inode_flags(inode); 459 nilfs_set_inode_flags(inode);
468 return 0; 460 return 0;
469 461
470 failed_unmap: 462 failed_unmap:
471 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 463 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
472 brelse(bh); 464 brelse(bh);
473 465
474 bad_inode: 466 bad_inode:
475 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 467 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
476 return err; 468 return err;
477} 469}
478 470
479struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) 471static int nilfs_iget_test(struct inode *inode, void *opaque)
472{
473 struct nilfs_iget_args *args = opaque;
474 struct nilfs_inode_info *ii;
475
476 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
477 return 0;
478
479 ii = NILFS_I(inode);
480 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
481 return !args->for_gc;
482
483 return args->for_gc && args->cno == ii->i_cno;
484}
485
486static int nilfs_iget_set(struct inode *inode, void *opaque)
487{
488 struct nilfs_iget_args *args = opaque;
489
490 inode->i_ino = args->ino;
491 if (args->for_gc) {
492 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
493 NILFS_I(inode)->i_cno = args->cno;
494 NILFS_I(inode)->i_root = NULL;
495 } else {
496 if (args->root && args->ino == NILFS_ROOT_INO)
497 nilfs_get_root(args->root);
498 NILFS_I(inode)->i_root = args->root;
499 }
500 return 0;
501}
502
503struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
504 unsigned long ino)
505{
506 struct nilfs_iget_args args = {
507 .ino = ino, .root = root, .cno = 0, .for_gc = 0
508 };
509
510 return ilookup5(sb, ino, nilfs_iget_test, &args);
511}
512
513struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
514 unsigned long ino)
515{
516 struct nilfs_iget_args args = {
517 .ino = ino, .root = root, .cno = 0, .for_gc = 0
518 };
519
520 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
521}
522
523struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
524 unsigned long ino)
525{
526 struct inode *inode;
527 int err;
528
529 inode = nilfs_iget_locked(sb, root, ino);
530 if (unlikely(!inode))
531 return ERR_PTR(-ENOMEM);
532 if (!(inode->i_state & I_NEW))
533 return inode;
534
535 err = __nilfs_read_inode(sb, root, ino, inode);
536 if (unlikely(err)) {
537 iget_failed(inode);
538 return ERR_PTR(err);
539 }
540 unlock_new_inode(inode);
541 return inode;
542}
543
544struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
545 __u64 cno)
480{ 546{
547 struct nilfs_iget_args args = {
548 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
549 };
481 struct inode *inode; 550 struct inode *inode;
482 int err; 551 int err;
483 552
484 inode = iget_locked(sb, ino); 553 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
485 if (unlikely(!inode)) 554 if (unlikely(!inode))
486 return ERR_PTR(-ENOMEM); 555 return ERR_PTR(-ENOMEM);
487 if (!(inode->i_state & I_NEW)) 556 if (!(inode->i_state & I_NEW))
488 return inode; 557 return inode;
489 558
490 err = __nilfs_read_inode(sb, ino, inode); 559 err = nilfs_init_gcinode(inode);
491 if (unlikely(err)) { 560 if (unlikely(err)) {
492 iget_failed(inode); 561 iget_failed(inode);
493 return ERR_PTR(err); 562 return ERR_PTR(err);
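Note: the hunks above replace iget_locked(), which keys the inode cache on the inode number alone, with iget5_locked() plus the nilfs_iget_test()/nilfs_iget_set() callbacks, so the same ino can be cached once per checkpoint root (and separately per checkpoint number for GC inodes). Below is a minimal userspace sketch of the two-callback lookup idea; every name in it (demo_*) is hypothetical and it is not part of the patch.

/* Illustrative only -- a userspace model, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct demo_inode {
	unsigned long ino;
	void *root;              /* which checkpoint tree the inode belongs to */
	struct demo_inode *next; /* cache-chain link */
};

struct demo_key {
	unsigned long ino;
	void *root;
};

/* test(): does a cached inode match the full composite key? */
static int demo_test(struct demo_inode *inode, void *opaque)
{
	struct demo_key *key = opaque;

	return inode->ino == key->ino && inode->root == key->root;
}

/* set(): initialize the key fields of a freshly allocated inode. */
static void demo_set(struct demo_inode *inode, void *opaque)
{
	struct demo_key *key = opaque;

	inode->ino = key->ino;
	inode->root = key->root;
}

static struct demo_inode *cache;

static struct demo_inode *demo_iget5(struct demo_key *key)
{
	struct demo_inode *inode;

	for (inode = cache; inode; inode = inode->next)
		if (demo_test(inode, key))
			return inode;           /* hit on (root, ino) */

	inode = calloc(1, sizeof(*inode));      /* miss: insert a new one */
	if (!inode)
		return NULL;
	demo_set(inode, key);
	inode->next = cache;
	cache = inode;
	return inode;
}

int main(void)
{
	int root_a, root_b;  /* stand-ins for two struct nilfs_root objects */
	struct demo_key k1 = { .ino = 12, .root = &root_a };
	struct demo_key k2 = { .ino = 12, .root = &root_b };

	/* Same inode number under different roots: two distinct inodes. */
	printf("distinct: %d\n", demo_iget5(&k1) != demo_iget5(&k2));
	return 0;
}

The test callback decides whether a cached object matches the full composite key; the set callback initializes the key fields of a newly inserted object before it becomes visible to other lookups.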
@@ -528,21 +597,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
528{ 597{
529 ino_t ino = inode->i_ino; 598 ino_t ino = inode->i_ino;
530 struct nilfs_inode_info *ii = NILFS_I(inode); 599 struct nilfs_inode_info *ii = NILFS_I(inode);
531 struct super_block *sb = inode->i_sb; 600 struct inode *ifile = ii->i_root->ifile;
532 struct nilfs_sb_info *sbi = NILFS_SB(sb);
533 struct nilfs_inode *raw_inode; 601 struct nilfs_inode *raw_inode;
534 602
535 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); 603 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
536 604
537 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 605 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
538 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); 606 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
539 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 607 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
540 608
541 nilfs_write_inode_common(inode, raw_inode, 0); 609 nilfs_write_inode_common(inode, raw_inode, 0);
 542 /* XXX: calling with has_bmap = 0 is a workaround to avoid 610 /* XXX: calling with has_bmap = 0 is a workaround to avoid
 543 a bmap deadlock. This delays the update of i_bmap until 611 a bmap deadlock. This delays the update of i_bmap until
 544 just before writing */ 612 just before writing */
545 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); 613 nilfs_ifile_unmap_inode(ifile, ino, ibh);
546} 614}
547 615
548#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 616#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
@@ -555,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
555 623
556 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 624 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
557 return; 625 return;
558 repeat: 626repeat:
559 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 627 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
560 if (ret == -ENOENT) 628 if (ret == -ENOENT)
561 return; 629 return;
@@ -572,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
572 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 640 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
573 goto repeat; 641 goto repeat;
574 642
575 failed: 643failed:
576 if (ret == -EINVAL) 644 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 nilfs_error(ii->vfs_inode.i_sb, __func__, 645 "failed to truncate bmap (ino=%lu, err=%d)",
578 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 646 ii->vfs_inode.i_ino, ret);
579 else
580 nilfs_warning(ii->vfs_inode.i_sb, __func__,
581 "failed to truncate bmap (ino=%lu, err=%d)",
582 ii->vfs_inode.i_ino, ret);
583} 647}
584 648
585void nilfs_truncate(struct inode *inode) 649void nilfs_truncate(struct inode *inode)
@@ -608,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
608 nilfs_set_transaction_flag(NILFS_TI_SYNC); 672 nilfs_set_transaction_flag(NILFS_TI_SYNC);
609 673
610 nilfs_mark_inode_dirty(inode); 674 nilfs_mark_inode_dirty(inode);
611 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 675 nilfs_set_file_dirty(inode, 0);
612 nilfs_transaction_commit(sb); 676 nilfs_transaction_commit(sb);
 613 /* May construct a logical segment and may fail in sync mode, 677 /* May construct a logical segment and may fail in sync mode,
 614 but truncate has no return value. */ 678 but truncate has no return value. */
@@ -617,6 +681,7 @@ void nilfs_truncate(struct inode *inode)
617static void nilfs_clear_inode(struct inode *inode) 681static void nilfs_clear_inode(struct inode *inode)
618{ 682{
619 struct nilfs_inode_info *ii = NILFS_I(inode); 683 struct nilfs_inode_info *ii = NILFS_I(inode);
684 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
620 685
621 /* 686 /*
622 * Free resources allocated in nilfs_read_inode(), here. 687 * Free resources allocated in nilfs_read_inode(), here.
@@ -625,10 +690,16 @@ static void nilfs_clear_inode(struct inode *inode)
625 brelse(ii->i_bh); 690 brelse(ii->i_bh);
626 ii->i_bh = NULL; 691 ii->i_bh = NULL;
627 692
693 if (mdi && mdi->mi_palloc_cache)
694 nilfs_palloc_destroy_cache(inode);
695
628 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 696 if (test_bit(NILFS_I_BMAP, &ii->i_state))
629 nilfs_bmap_clear(ii->i_bmap); 697 nilfs_bmap_clear(ii->i_bmap);
630 698
631 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 699 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
700
701 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
702 nilfs_put_root(ii->i_root);
632} 703}
633 704
634void nilfs_evict_inode(struct inode *inode) 705void nilfs_evict_inode(struct inode *inode)
@@ -637,7 +708,7 @@ void nilfs_evict_inode(struct inode *inode)
637 struct super_block *sb = inode->i_sb; 708 struct super_block *sb = inode->i_sb;
638 struct nilfs_inode_info *ii = NILFS_I(inode); 709 struct nilfs_inode_info *ii = NILFS_I(inode);
639 710
640 if (inode->i_nlink || unlikely(is_bad_inode(inode))) { 711 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
641 if (inode->i_data.nrpages) 712 if (inode->i_data.nrpages)
642 truncate_inode_pages(&inode->i_data, 0); 713 truncate_inode_pages(&inode->i_data, 0);
643 end_writeback(inode); 714 end_writeback(inode);
@@ -649,12 +720,16 @@ void nilfs_evict_inode(struct inode *inode)
649 if (inode->i_data.nrpages) 720 if (inode->i_data.nrpages)
650 truncate_inode_pages(&inode->i_data, 0); 721 truncate_inode_pages(&inode->i_data, 0);
651 722
723 /* TODO: some of the following operations may fail. */
652 nilfs_truncate_bmap(ii, 0); 724 nilfs_truncate_bmap(ii, 0);
653 nilfs_mark_inode_dirty(inode); 725 nilfs_mark_inode_dirty(inode);
654 end_writeback(inode); 726 end_writeback(inode);
727
728 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
729 atomic_dec(&ii->i_root->inodes_count);
730
655 nilfs_clear_inode(inode); 731 nilfs_clear_inode(inode);
656 nilfs_free_inode(inode); 732
657 /* nilfs_free_inode() marks inode buffer dirty */
658 if (IS_SYNC(inode)) 733 if (IS_SYNC(inode))
659 nilfs_set_transaction_flag(NILFS_TI_SYNC); 734 nilfs_set_transaction_flag(NILFS_TI_SYNC);
660 nilfs_transaction_commit(sb); 735 nilfs_transaction_commit(sb);
@@ -700,17 +775,32 @@ out_err:
700 return err; 775 return err;
701} 776}
702 777
703int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 778int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
704 struct buffer_head **pbh) 779{
780 struct nilfs_root *root;
781
782 if (flags & IPERM_FLAG_RCU)
783 return -ECHILD;
784
785 root = NILFS_I(inode)->i_root;
786 if ((mask & MAY_WRITE) && root &&
787 root->cno != NILFS_CPTREE_CURRENT_CNO)
788 return -EROFS; /* snapshot is not writable */
789
790 return generic_permission(inode, mask, flags, NULL);
791}
792
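Note: nilfs_permission() above layers one filesystem-specific check, rejecting writes when the inode belongs to a snapshot root, in front of generic_permission(). A hedged userspace probe of the expected behavior, assuming a NILFS snapshot is mounted at the hypothetical path /mnt/nilfs-snapshot:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption; point it at any file in a snapshot mount. */
	int fd = open("/mnt/nilfs-snapshot/file", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		printf("open: %s\n", strerror(errno)); /* expect EROFS */
	else
		close(fd);
	return 0;
}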
793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
705{ 794{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
706 struct nilfs_inode_info *ii = NILFS_I(inode); 796 struct nilfs_inode_info *ii = NILFS_I(inode);
707 int err; 797 int err;
708 798
709 spin_lock(&sbi->s_inode_lock); 799 spin_lock(&sbi->s_inode_lock);
710 if (ii->i_bh == NULL) { 800 if (ii->i_bh == NULL) {
711 spin_unlock(&sbi->s_inode_lock); 801 spin_unlock(&sbi->s_inode_lock);
712 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 802 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
713 pbh); 803 inode->i_ino, pbh);
714 if (unlikely(err)) 804 if (unlikely(err))
715 return err; 805 return err;
716 spin_lock(&sbi->s_inode_lock); 806 spin_lock(&sbi->s_inode_lock);
@@ -743,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
743 return ret; 833 return ret;
744} 834}
745 835
746int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
747 unsigned nr_dirty)
748{ 837{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
749 struct nilfs_inode_info *ii = NILFS_I(inode); 839 struct nilfs_inode_info *ii = NILFS_I(inode);
750 840
751 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -778,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
778 868
779int nilfs_mark_inode_dirty(struct inode *inode) 869int nilfs_mark_inode_dirty(struct inode *inode)
780{ 870{
781 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
782 struct buffer_head *ibh; 871 struct buffer_head *ibh;
783 int err; 872 int err;
784 873
785 err = nilfs_load_inode_block(sbi, inode, &ibh); 874 err = nilfs_load_inode_block(inode, &ibh);
786 if (unlikely(err)) { 875 if (unlikely(err)) {
787 nilfs_warning(inode->i_sb, __func__, 876 nilfs_warning(inode->i_sb, __func__,
788 "failed to reget inode block.\n"); 877 "failed to reget inode block.\n");
@@ -790,7 +879,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
790 } 879 }
791 nilfs_update_inode(inode, ibh); 880 nilfs_update_inode(inode, ibh);
792 nilfs_mdt_mark_buffer_dirty(ibh); 881 nilfs_mdt_mark_buffer_dirty(ibh);
793 nilfs_mdt_mark_dirty(sbi->s_ifile); 882 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
794 brelse(ibh); 883 brelse(ibh);
795 return 0; 884 return 0;
796} 885}
@@ -808,6 +897,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
808void nilfs_dirty_inode(struct inode *inode) 897void nilfs_dirty_inode(struct inode *inode)
809{ 898{
810 struct nilfs_transaction_info ti; 899 struct nilfs_transaction_info ti;
900 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
811 901
812 if (is_bad_inode(inode)) { 902 if (is_bad_inode(inode)) {
813 nilfs_warning(inode->i_sb, __func__, 903 nilfs_warning(inode->i_sb, __func__,
@@ -815,7 +905,142 @@ void nilfs_dirty_inode(struct inode *inode)
815 dump_stack(); 905 dump_stack();
816 return; 906 return;
817 } 907 }
908 if (mdi) {
909 nilfs_mdt_mark_dirty(inode);
910 return;
911 }
818 nilfs_transaction_begin(inode->i_sb, &ti, 0); 912 nilfs_transaction_begin(inode->i_sb, &ti, 0);
819 nilfs_mark_inode_dirty(inode); 913 nilfs_mark_inode_dirty(inode);
820 nilfs_transaction_commit(inode->i_sb); /* never fails */ 914 nilfs_transaction_commit(inode->i_sb); /* never fails */
821} 915}
916
917int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
918 __u64 start, __u64 len)
919{
920 struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
921 __u64 logical = 0, phys = 0, size = 0;
922 __u32 flags = 0;
923 loff_t isize;
924 sector_t blkoff, end_blkoff;
925 sector_t delalloc_blkoff;
926 unsigned long delalloc_blklen;
927 unsigned int blkbits = inode->i_blkbits;
928 int ret, n;
929
930 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
931 if (ret)
932 return ret;
933
934 mutex_lock(&inode->i_mutex);
935
936 isize = i_size_read(inode);
937
938 blkoff = start >> blkbits;
939 end_blkoff = (start + len - 1) >> blkbits;
940
941 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
942 &delalloc_blkoff);
943
944 do {
945 __u64 blkphy;
946 unsigned int maxblocks;
947
948 if (delalloc_blklen && blkoff == delalloc_blkoff) {
949 if (size) {
950 /* End of the current extent */
951 ret = fiemap_fill_next_extent(
952 fieinfo, logical, phys, size, flags);
953 if (ret)
954 break;
955 }
956 if (blkoff > end_blkoff)
957 break;
958
959 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
960 logical = blkoff << blkbits;
961 phys = 0;
962 size = delalloc_blklen << blkbits;
963
964 blkoff = delalloc_blkoff + delalloc_blklen;
965 delalloc_blklen = nilfs_find_uncommitted_extent(
966 inode, blkoff, &delalloc_blkoff);
967 continue;
968 }
969
970 /*
971 * Limit the number of blocks that we look up so as
972 * not to get into the next delayed allocation extent.
973 */
974 maxblocks = INT_MAX;
975 if (delalloc_blklen)
976 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
977 maxblocks);
978 blkphy = 0;
979
980 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
981 n = nilfs_bmap_lookup_contig(
982 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
983 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
984
985 if (n < 0) {
986 int past_eof;
987
988 if (unlikely(n != -ENOENT))
989 break; /* error */
990
991 /* HOLE */
992 blkoff++;
993 past_eof = ((blkoff << blkbits) >= isize);
994
995 if (size) {
996 /* End of the current extent */
997
998 if (past_eof)
999 flags |= FIEMAP_EXTENT_LAST;
1000
1001 ret = fiemap_fill_next_extent(
1002 fieinfo, logical, phys, size, flags);
1003 if (ret)
1004 break;
1005 size = 0;
1006 }
1007 if (blkoff > end_blkoff || past_eof)
1008 break;
1009 } else {
1010 if (size) {
1011 if (phys && blkphy << blkbits == phys + size) {
1012 /* The current extent goes on */
1013 size += n << blkbits;
1014 } else {
1015 /* Terminate the current extent */
1016 ret = fiemap_fill_next_extent(
1017 fieinfo, logical, phys, size,
1018 flags);
1019 if (ret || blkoff > end_blkoff)
1020 break;
1021
1022 /* Start another extent */
1023 flags = FIEMAP_EXTENT_MERGED;
1024 logical = blkoff << blkbits;
1025 phys = blkphy << blkbits;
1026 size = n << blkbits;
1027 }
1028 } else {
1029 /* Start a new extent */
1030 flags = FIEMAP_EXTENT_MERGED;
1031 logical = blkoff << blkbits;
1032 phys = blkphy << blkbits;
1033 size = n << blkbits;
1034 }
1035 blkoff += n;
1036 }
1037 cond_resched();
1038 } while (true);
1039
1040 /* If ret is 1 then we just hit the end of the extent array */
1041 if (ret == 1)
1042 ret = 0;
1043
1044 mutex_unlock(&inode->i_mutex);
1045 return ret;
1046}
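Note: nilfs_fiemap() above walks the file through three states: delayed-allocation extents reported by nilfs_find_uncommitted_extent(), mapped runs from nilfs_bmap_lookup_contig(), and holes (-ENOENT), merging adjacent blocks into extents and marking the final one with FIEMAP_EXTENT_LAST. A small sketch of a userspace consumer for the FS_IOC_FIEMAP ioctl follows; error handling is abbreviated and the 32-extent buffer size is an arbitrary choice.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;           /* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC; /* the only flag nilfs_fiemap() accepts */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu phys %llu len %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}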
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b0..496738963fdb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h> 25#include <linux/slab.h>
27#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
118 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
119 goto out; 118 goto out;
120 119
121 mutex_lock(&nilfs->ns_mount_mutex); 120 down_read(&inode->i_sb->s_umount);
122 121
123 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
124 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
128 else 127 else
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 129
131 mutex_unlock(&nilfs->ns_mount_mutex); 130 up_read(&inode->i_sb->s_umount);
132out: 131out:
133 mnt_drop_write(filp->f_path.mnt); 132 mnt_drop_write(filp->f_path.mnt);
134 return ret; 133 return ret;
@@ -234,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
234 int ret; 233 int ret;
235 234
236 down_read(&nilfs->ns_segctor_sem); 235 down_read(&nilfs->ns_segctor_sem);
237 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); 236 ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
238 up_read(&nilfs->ns_segctor_sem); 237 up_read(&nilfs->ns_segctor_sem);
239 return ret; 238 return ret;
240} 239}
@@ -243,8 +242,7 @@ static ssize_t
243nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
244 void *buf, size_t size, size_t nmembs) 243 void *buf, size_t size, size_t nmembs)
245{ 244{
246 struct inode *dat = nilfs_dat_inode(nilfs); 245 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
247 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
248 struct nilfs_bdesc *bdescs = buf; 246 struct nilfs_bdesc *bdescs = buf;
249 int ret, i; 247 int ret, i;
250 248
@@ -334,10 +332,11 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
334 return 0; 332 return 0;
335} 333}
336 334
337static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, 335static int nilfs_ioctl_move_blocks(struct super_block *sb,
338 struct nilfs_argv *argv, void *buf) 336 struct nilfs_argv *argv, void *buf)
339{ 337{
340 size_t nmembs = argv->v_nmembs; 338 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
341 struct inode *inode; 340 struct inode *inode;
342 struct nilfs_vdesc *vdesc; 341 struct nilfs_vdesc *vdesc;
343 struct buffer_head *bh, *n; 342 struct buffer_head *bh, *n;
@@ -349,19 +348,34 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
349 for (i = 0, vdesc = buf; i < nmembs; ) { 348 for (i = 0, vdesc = buf; i < nmembs; ) {
350 ino = vdesc->vd_ino; 349 ino = vdesc->vd_ino;
351 cno = vdesc->vd_cno; 350 cno = vdesc->vd_cno;
352 inode = nilfs_gc_iget(nilfs, ino, cno); 351 inode = nilfs_iget_for_gc(sb, ino, cno);
353 if (unlikely(inode == NULL)) { 352 if (IS_ERR(inode)) {
354 ret = -ENOMEM; 353 ret = PTR_ERR(inode);
355 goto failed; 354 goto failed;
356 } 355 }
356 if (list_empty(&NILFS_I(inode)->i_dirty)) {
357 /*
 358 * Add the inode to the GC inode list. Garbage collection
 359 * is serialized, so no two processes manipulate the
 360 * list simultaneously.
361 */
362 igrab(inode);
363 list_add(&NILFS_I(inode)->i_dirty,
364 &nilfs->ns_gc_inodes);
365 }
366
357 do { 367 do {
358 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 368 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
359 &buffers); 369 &buffers);
360 if (unlikely(ret < 0)) 370 if (unlikely(ret < 0)) {
371 iput(inode);
361 goto failed; 372 goto failed;
373 }
362 vdesc++; 374 vdesc++;
363 } while (++i < nmembs && 375 } while (++i < nmembs &&
364 vdesc->vd_ino == ino && vdesc->vd_cno == cno); 376 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
377
 378 iput(inode); /* the inode remains on the GC inode list */
365 } 379 }
366 380
367 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 381 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
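Note: the reference counting in the hunk above is easy to misread. nilfs_iget_for_gc() returns one lookup reference, igrab() takes a second reference owned by ns_gc_inodes when the inode is first linked onto that list, and the per-iteration iput() drops only the lookup reference, so the list keeps the inode alive until nilfs_remove_all_gcinodes() runs. A toy userspace model of that ownership pattern (all names made up):

#include <assert.h>
#include <stdio.h>

struct obj {
	int refcount;
	int on_list;
};

static void get(struct obj *o) { o->refcount++; }

static void put(struct obj *o)
{
	assert(o->refcount > 0);
	o->refcount--;
}

int main(void)
{
	struct obj inode = { .refcount = 1 };   /* the lookup reference */

	if (!inode.on_list) {    /* list_empty(&ii->i_dirty) in the hunk */
		get(&inode);     /* igrab(): a reference owned by the list */
		inode.on_list = 1;
	}
	put(&inode);             /* the per-iteration iput() */

	/* The list reference keeps the inode pinned for the GC pass. */
	printf("refs held after the loop: %d\n", inode.refcount); /* 1 */

	put(&inode);             /* nilfs_remove_all_gcinodes() equivalent */
	return 0;
}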
@@ -406,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
406 size_t nmembs = argv->v_nmembs; 420 size_t nmembs = argv->v_nmembs;
407 int ret; 421 int ret;
408 422
409 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); 423 ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
410 424
411 return (ret < 0) ? ret : nmembs; 425 return (ret < 0) ? ret : nmembs;
412} 426}
@@ -415,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
415 struct nilfs_argv *argv, void *buf) 429 struct nilfs_argv *argv, void *buf)
416{ 430{
417 size_t nmembs = argv->v_nmembs; 431 size_t nmembs = argv->v_nmembs;
418 struct inode *dat = nilfs_dat_inode(nilfs); 432 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
419 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
420 struct nilfs_bdesc *bdescs = buf; 433 struct nilfs_bdesc *bdescs = buf;
421 int ret, i; 434 int ret, i;
422 435
@@ -435,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
435 /* skip dead block */ 448 /* skip dead block */
436 continue; 449 continue;
437 if (bdescs[i].bd_level == 0) { 450 if (bdescs[i].bd_level == 0) {
438 ret = nilfs_mdt_mark_block_dirty(dat, 451 ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
439 bdescs[i].bd_offset); 452 bdescs[i].bd_offset);
440 if (ret < 0) { 453 if (ret < 0) {
441 WARN_ON(ret == -ENOENT); 454 WARN_ON(ret == -ENOENT);
@@ -567,7 +580,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
567 } 580 }
568 581
569 /* 582 /*
570 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), 583 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
 571 * which will operate on an inode list without blocking. 584 * which will operate on an inode list without blocking.
 572 * To protect the list from concurrent operations, 585 * To protect the list from concurrent operations,
 573 * nilfs_ioctl_move_blocks should be an atomic operation. 586 * nilfs_ioctl_move_blocks should be an atomic operation.
@@ -577,15 +590,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
577 goto out_free; 590 goto out_free;
578 } 591 }
579 592
580 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); 593 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
594
595 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
581 if (ret < 0) 596 if (ret < 0)
582 printk(KERN_ERR "NILFS: GC failed during preparation: " 597 printk(KERN_ERR "NILFS: GC failed during preparation: "
583 "cannot read source blocks: err=%d\n", ret); 598 "cannot read source blocks: err=%d\n", ret);
584 else 599 else
585 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 600 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
586 601
587 if (ret < 0) 602 nilfs_remove_all_gcinodes(nilfs);
588 nilfs_remove_all_gcinode(nilfs);
589 clear_nilfs_gc_running(nilfs); 603 clear_nilfs_gc_running(nilfs);
590 604
591out_free: 605out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d01aff4957d9..6a0e2a189f60 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -36,7 +36,6 @@
36 36
37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) 37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
38 38
39#define INIT_UNUSED_INODE_FIELDS
40 39
41static int 40static int
42nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, 41nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -78,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
78 struct buffer_head *, 77 struct buffer_head *,
79 void *)) 78 void *))
80{ 79{
81 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
82 struct super_block *sb = inode->i_sb; 80 struct super_block *sb = inode->i_sb;
83 struct nilfs_transaction_info ti; 81 struct nilfs_transaction_info ti;
84 struct buffer_head *bh; 82 struct buffer_head *bh;
85 int err; 83 int err;
86 84
87 if (!sb) {
88 /*
89 * Make sure this function is not called from any
90 * read-only context.
91 */
92 if (!nilfs->ns_writer) {
93 WARN_ON(1);
94 err = -EROFS;
95 goto out;
96 }
97 sb = nilfs->ns_writer->s_super;
98 }
99
100 nilfs_transaction_begin(sb, &ti, 0); 85 nilfs_transaction_begin(sb, &ti, 0);
101 86
102 err = -ENOMEM; 87 err = -ENOMEM;
@@ -112,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
112 if (buffer_uptodate(bh)) 97 if (buffer_uptodate(bh))
113 goto failed_bh; 98 goto failed_bh;
114 99
115 bh->b_bdev = nilfs->ns_bdev; 100 bh->b_bdev = sb->s_bdev;
116 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 101 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
117 if (likely(!err)) { 102 if (likely(!err)) {
118 get_bh(bh); 103 get_bh(bh);
@@ -129,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
129 err = nilfs_transaction_commit(sb); 114 err = nilfs_transaction_commit(sb);
130 else 115 else
131 nilfs_transaction_abort(sb); 116 nilfs_transaction_abort(sb);
132 out: 117
133 return err; 118 return err;
134} 119}
135 120
@@ -167,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
167 unlock_buffer(bh); 152 unlock_buffer(bh);
168 goto failed_bh; 153 goto failed_bh;
169 } 154 }
170 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; 155 map_bh(bh, inode->i_sb, (sector_t)blknum);
171 bh->b_blocknr = (sector_t)blknum;
172 set_buffer_mapped(bh);
173 156
174 bh->b_end_io = end_buffer_read_sync; 157 bh->b_end_io = end_buffer_read_sync;
175 get_bh(bh); 158 get_bh(bh);
@@ -254,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
254 * 237 *
255 * %-ENOENT - the specified block does not exist (hole block) 238 * %-ENOENT - the specified block does not exist (hole block)
256 * 239 *
257 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
258 *
259 * %-EROFS - Read only filesystem (for create mode) 240 * %-EROFS - Read only filesystem (for create mode)
260 */ 241 */
261int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 242int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -290,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
290 * %-ENOMEM - Insufficient memory available. 271 * %-ENOMEM - Insufficient memory available.
291 * 272 *
292 * %-EIO - I/O error 273 * %-EIO - I/O error
293 *
294 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
295 */ 274 */
296int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 275int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
297{ 276{
@@ -367,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
367 * %-EIO - I/O error 346 * %-EIO - I/O error
368 * 347 *
369 * %-ENOENT - the specified block does not exist (hole block) 348 * %-ENOENT - the specified block does not exist (hole block)
370 *
371 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
372 */ 349 */
373int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 350int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
374{ 351{
@@ -398,35 +375,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
398static int 375static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) 376nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{ 377{
401 struct inode *inode = container_of(page->mapping, 378 struct inode *inode;
402 struct inode, i_data); 379 struct super_block *sb;
403 struct super_block *sb = inode->i_sb;
404 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 380 int err = 0;
407 381
408 redirty_page_for_writepage(wbc, page); 382 redirty_page_for_writepage(wbc, page);
409 unlock_page(page); 383 unlock_page(page);
410 384
411 if (page->mapping->assoc_mapping) 385 inode = page->mapping->host;
412 return 0; /* Do not request flush for shadow page cache */ 386 if (!inode)
413 if (!sb) { 387 return 0;
414 down_read(&nilfs->ns_writer_sem); 388
415 writer = nilfs->ns_writer; 389 sb = inode->i_sb;
416 if (!writer) {
417 up_read(&nilfs->ns_writer_sem);
418 return -EROFS;
419 }
420 sb = writer->s_super;
421 }
422 390
423 if (wbc->sync_mode == WB_SYNC_ALL) 391 if (wbc->sync_mode == WB_SYNC_ALL)
424 err = nilfs_construct_segment(sb); 392 err = nilfs_construct_segment(sb);
425 else if (wbc->for_reclaim) 393 else if (wbc->for_reclaim)
426 nilfs_flush_segment(sb, inode->i_ino); 394 nilfs_flush_segment(sb, inode->i_ino);
427 395
428 if (writer)
429 up_read(&nilfs->ns_writer_sem);
430 return err; 396 return err;
431} 397}
432 398
@@ -439,105 +405,27 @@ static const struct address_space_operations def_mdt_aops = {
439static const struct inode_operations def_mdt_iops; 405static const struct inode_operations def_mdt_iops;
440static const struct file_operations def_mdt_fops; 406static const struct file_operations def_mdt_fops;
441 407
442/*
443 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
444 * ifile, or gcinodes. This allows the B-tree code and segment constructor
445 * to treat them like regular files, and this helps to simplify the
446 * implementation.
 447 * On the other hand, some of the pseudo inodes have an irregularity:
 448 * they don't have a valid inode->i_sb pointer because their lifetimes are
 449 * longer than those of the super block structs; they may persist across
 450 * several consecutive mounts/umounts. This needs further discussion.
451 */
452/**
453 * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
454 * @nilfs: nilfs object
455 * @sb: super block instance the metadata file belongs to
456 * @ino: inode number
457 * @gfp_mask: gfp mask for data pages
458 * @objsz: size of the private object attached to inode->i_private
459 */
460struct inode *
461nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
462 ino_t ino, gfp_t gfp_mask, size_t objsz)
463{
464 struct inode *inode = nilfs_alloc_inode_common(nilfs);
465 408
466 if (!inode) 409int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
467 return NULL; 410{
468 else { 411 struct nilfs_mdt_info *mi;
469 struct address_space * const mapping = &inode->i_data;
470 struct nilfs_mdt_info *mi;
471
472 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
473 if (!mi) {
474 nilfs_destroy_inode(inode);
475 return NULL;
476 }
477 mi->mi_nilfs = nilfs;
478 init_rwsem(&mi->mi_sem);
479
480 inode->i_sb = sb; /* sb may be NULL for some meta data files */
481 inode->i_blkbits = nilfs->ns_blocksize_bits;
482 inode->i_flags = 0;
483 atomic_set(&inode->i_count, 1);
484 inode->i_nlink = 1;
485 inode->i_ino = ino;
486 inode->i_mode = S_IFREG;
487 inode->i_private = mi;
488
489#ifdef INIT_UNUSED_INODE_FIELDS
490 atomic_set(&inode->i_writecount, 0);
491 inode->i_size = 0;
492 inode->i_blocks = 0;
493 inode->i_bytes = 0;
494 inode->i_generation = 0;
495#ifdef CONFIG_QUOTA
496 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
497#endif
498 inode->i_pipe = NULL;
499 inode->i_bdev = NULL;
500 inode->i_cdev = NULL;
501 inode->i_rdev = 0;
502#ifdef CONFIG_SECURITY
503 inode->i_security = NULL;
504#endif
505 inode->dirtied_when = 0;
506
507 INIT_LIST_HEAD(&inode->i_list);
508 INIT_LIST_HEAD(&inode->i_sb_list);
509 inode->i_state = 0;
510#endif
511
512 spin_lock_init(&inode->i_lock);
513 mutex_init(&inode->i_mutex);
514 init_rwsem(&inode->i_alloc_sem);
515
516 mapping->host = NULL; /* instead of inode */
517 mapping->flags = 0;
518 mapping_set_gfp_mask(mapping, gfp_mask);
519 mapping->assoc_mapping = NULL;
520 mapping->backing_dev_info = nilfs->ns_bdi;
521
522 inode->i_mapping = mapping;
523 }
524 412
525 return inode; 413 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
526} 414 if (!mi)
415 return -ENOMEM;
527 416
528struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 417 init_rwsem(&mi->mi_sem);
529 ino_t ino, size_t objsz) 418 inode->i_private = mi;
530{
531 struct inode *inode;
532 419
533 inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz); 420 inode->i_mode = S_IFREG;
534 if (!inode) 421 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
535 return NULL; 422 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
536 423
537 inode->i_op = &def_mdt_iops; 424 inode->i_op = &def_mdt_iops;
538 inode->i_fop = &def_mdt_fops; 425 inode->i_fop = &def_mdt_fops;
539 inode->i_mapping->a_ops = &def_mdt_aops; 426 inode->i_mapping->a_ops = &def_mdt_aops;
540 return inode; 427
428 return 0;
541} 429}
542 430
543void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, 431void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
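Note: nilfs_mdt_init() above allocates the private info with kzalloc(max(sizeof(*mi), objsz), GFP_NOFS), a small idiom that lets one helper serve both plain metadata files and callers that embed struct nilfs_mdt_info at the head of a larger object. An illustrative userspace sketch of the same idiom (the struct names are invented):

#include <stdio.h>
#include <stdlib.h>

struct base_info {                    /* plays struct nilfs_mdt_info */
	unsigned int entry_size;
};

struct derived_info {                 /* a larger per-file object */
	struct base_info base;        /* must sit at offset 0 */
	unsigned long extra[16];
};

#define demo_max(a, b) ((a) > (b) ? (a) : (b))

static struct base_info *init_info(size_t objsz)
{
	/* One allocation serves callers that need only the base struct
	 * (objsz == 0) and callers that embed it in a larger object. */
	return calloc(1, demo_max(sizeof(struct base_info), objsz));
}

int main(void)
{
	struct base_info *mi = init_info(sizeof(struct derived_info));
	struct derived_info *di = (struct derived_info *)mi;

	if (!mi)
		return 1;
	di->extra[0] = 42;
	printf("%lu\n", di->extra[0]);
	free(mi);
	return 0;
}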
@@ -550,34 +438,157 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
550 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
551} 439}
552 440
553void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) 441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file
448 * @shadow: shadow mapping
449 */
450int nilfs_mdt_setup_shadow_map(struct inode *inode,
451 struct nilfs_shadow_map *shadow)
554{ 452{
555 shadow->i_mapping->assoc_mapping = orig->i_mapping; 453 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
556 NILFS_I(shadow)->i_btnode_cache.assoc_mapping = 454 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
557 &NILFS_I(orig)->i_btnode_cache; 455
456 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
461 mi->mi_shadow = shadow;
462 return 0;
558} 463}
559 464
560static void nilfs_mdt_clear(struct inode *inode) 465/**
466 * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
467 * @inode: inode of the metadata file
468 */
469int nilfs_mdt_save_to_shadow_map(struct inode *inode)
561{ 470{
471 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
562 struct nilfs_inode_info *ii = NILFS_I(inode); 472 struct nilfs_inode_info *ii = NILFS_I(inode);
473 struct nilfs_shadow_map *shadow = mi->mi_shadow;
474 int ret;
475
476 ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
477 if (ret)
478 goto out;
479
480 ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
481 &ii->i_btnode_cache);
482 if (ret)
483 goto out;
484
485 nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
486 out:
487 return ret;
488}
489
490int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
491{
492 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
493 struct buffer_head *bh_frozen;
494 struct page *page;
495 int blkbits = inode->i_blkbits;
496
497 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
498 if (!page)
499 return -ENOMEM;
500
501 if (!page_has_buffers(page))
502 create_empty_buffers(page, 1 << blkbits, 0);
503
504 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
505
506 if (!buffer_uptodate(bh_frozen))
507 nilfs_copy_buffer(bh_frozen, bh);
508 if (list_empty(&bh_frozen->b_assoc_buffers)) {
509 list_add_tail(&bh_frozen->b_assoc_buffers,
510 &shadow->frozen_buffers);
511 set_buffer_nilfs_redirected(bh);
512 } else {
513 brelse(bh_frozen); /* already frozen */
514 }
515
516 unlock_page(page);
517 page_cache_release(page);
518 return 0;
519}
520
521struct buffer_head *
522nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
523{
524 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
525 struct buffer_head *bh_frozen = NULL;
526 struct page *page;
527 int n;
563 528
564 invalidate_mapping_pages(inode->i_mapping, 0, -1); 529 page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
565 truncate_inode_pages(inode->i_mapping, 0); 530 if (page) {
531 if (page_has_buffers(page)) {
532 n = bh_offset(bh) >> inode->i_blkbits;
533 bh_frozen = nilfs_page_get_nth_block(page, n);
534 }
535 unlock_page(page);
536 page_cache_release(page);
537 }
538 return bh_frozen;
539}
540
541static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
542{
543 struct list_head *head = &shadow->frozen_buffers;
544 struct buffer_head *bh;
566 545
567 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 546 while (!list_empty(head)) {
568 nilfs_bmap_clear(ii->i_bmap); 547 bh = list_first_entry(head, struct buffer_head,
569 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 548 b_assoc_buffers);
549 list_del_init(&bh->b_assoc_buffers);
550 brelse(bh); /* drop ref-count to make it releasable */
551 }
570} 552}
571 553
572void nilfs_mdt_destroy(struct inode *inode) 554/**
555 * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
556 * @inode: inode of the metadata file
557 */
558void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
573{ 559{
574 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 560 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
561 struct nilfs_inode_info *ii = NILFS_I(inode);
562 struct nilfs_shadow_map *shadow = mi->mi_shadow;
575 563
576 if (mdi->mi_palloc_cache) 564 down_write(&mi->mi_sem);
577 nilfs_palloc_destroy_cache(inode); 565
578 nilfs_mdt_clear(inode); 566 if (mi->mi_palloc_cache)
567 nilfs_palloc_clear_cache(inode);
568
569 nilfs_clear_dirty_pages(inode->i_mapping);
570 nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
571
572 nilfs_clear_dirty_pages(&ii->i_btnode_cache);
573 nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
574
575 nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
576
577 up_write(&mi->mi_sem);
578}
579
580/**
581 * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
582 * @inode: inode of the metadata file
583 */
584void nilfs_mdt_clear_shadow_map(struct inode *inode)
585{
586 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
587 struct nilfs_shadow_map *shadow = mi->mi_shadow;
579 588
580 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 589 down_write(&mi->mi_sem);
581 kfree(mdi); 590 nilfs_release_frozen_buffers(shadow);
582 nilfs_destroy_inode(inode); 591 truncate_inode_pages(&shadow->frozen_data, 0);
592 truncate_inode_pages(&shadow->frozen_btnodes, 0);
593 up_write(&mi->mi_sem);
583} 594}
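Note: the shadow-map functions introduced above implement a save/restore protocol for metadata caches: nilfs_mdt_save_to_shadow_map() copies dirty pages and the bmap state into the frozen_* caches before a checkpoint operation, and nilfs_mdt_restore_from_shadow_map() copies them back if the operation must be rolled back. Reduced to its essence, the pattern looks like the following userspace sketch (data layout and names are purely illustrative):

#include <stdio.h>
#include <string.h>

#define NBLOCKS 4
#define BLKSZ   8

static char live[NBLOCKS][BLKSZ];     /* plays the live page cache */
static char frozen[NBLOCKS][BLKSZ];   /* plays shadow->frozen_data */

static void save_to_shadow(void)      /* nilfs_mdt_save_to_shadow_map() */
{
	memcpy(frozen, live, sizeof(live));
}

static void restore_from_shadow(void) /* ..._restore_from_shadow_map() */
{
	memcpy(live, frozen, sizeof(frozen));
}

int main(void)
{
	strcpy(live[0], "dat-v1");
	save_to_shadow();                /* checkpoint the dirty state */

	strcpy(live[0], "dat-v2");       /* speculative update... */
	restore_from_shadow();           /* ...rolled back on failure */

	printf("%s\n", live[0]);         /* prints "dat-v1" */
	return 0;
}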
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470fc..b13734bf3521 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
28#include "nilfs.h" 28#include "nilfs.h"
29#include "page.h" 29#include "page.h"
30 30
31struct nilfs_shadow_map {
32 struct nilfs_bmap_store bmap_store;
33 struct address_space frozen_data;
34 struct address_space frozen_btnodes;
35 struct list_head frozen_buffers;
36};
37
31/** 38/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files 39 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations 40 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking 41 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry 42 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry 43 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block 44 * @mi_entries_per_block: number of entries in a block
39 * @mi_palloc_cache: persistent object allocator cache 45 * @mi_palloc_cache: persistent object allocator cache
46 * @mi_shadow: shadow of bmap and page caches
40 * @mi_blocks_per_group: number of blocks in a group 47 * @mi_blocks_per_group: number of blocks in a group
41 * @mi_blocks_per_desc_block: number of blocks per descriptor block 48 * @mi_blocks_per_desc_block: number of blocks per descriptor block
42 */ 49 */
43struct nilfs_mdt_info { 50struct nilfs_mdt_info {
44 struct the_nilfs *mi_nilfs;
45 struct rw_semaphore mi_sem; 51 struct rw_semaphore mi_sem;
46 struct blockgroup_lock *mi_bgl; 52 struct blockgroup_lock *mi_bgl;
47 unsigned mi_entry_size; 53 unsigned mi_entry_size;
48 unsigned mi_first_entry_offset; 54 unsigned mi_first_entry_offset;
49 unsigned long mi_entries_per_block; 55 unsigned long mi_entries_per_block;
50 struct nilfs_palloc_cache *mi_palloc_cache; 56 struct nilfs_palloc_cache *mi_palloc_cache;
57 struct nilfs_shadow_map *mi_shadow;
51 unsigned long mi_blocks_per_group; 58 unsigned long mi_blocks_per_group;
52 unsigned long mi_blocks_per_desc_block; 59 unsigned long mi_blocks_per_desc_block;
53}; 60};
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
59 66
60static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
61{ 68{
62 struct super_block *sb = inode->i_sb; 69 return NILFS_SB(inode->i_sb)->s_nilfs;
63
64 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
65} 70}
66 71
67/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
76int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 81int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
77int nilfs_mdt_fetch_dirty(struct inode *); 82int nilfs_mdt_fetch_dirty(struct inode *);
78 83
79struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 84int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
80 size_t);
81struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
82 ino_t, gfp_t, size_t);
83void nilfs_mdt_destroy(struct inode *);
84void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); 85void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
85void nilfs_mdt_set_shadow(struct inode *, struct inode *);
86 86
87int nilfs_mdt_setup_shadow_map(struct inode *inode,
88 struct nilfs_shadow_map *shadow);
89int nilfs_mdt_save_to_shadow_map(struct inode *inode);
90void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
91void nilfs_mdt_clear_shadow_map(struct inode *inode);
92int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
93struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
94 struct buffer_head *bh);
87 95
88#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) 96#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
89 97
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
100 108
101static inline __u64 nilfs_mdt_cno(struct inode *inode) 109static inline __u64 nilfs_mdt_cno(struct inode *inode)
102{ 110{
103 return NILFS_MDT(inode)->mi_nilfs->ns_cno; 111 return NILFS_I_NILFS(inode)->ns_cno;
104} 112}
105 113
106#define nilfs_mdt_bgl_lock(inode, bg) \ 114#define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b4..98034271cd02 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
40 40
41#include <linux/pagemap.h> 41#include <linux/pagemap.h>
42#include "nilfs.h" 42#include "nilfs.h"
43#include "export.h"
43 44
45#define NILFS_FID_SIZE_NON_CONNECTABLE \
46 (offsetof(struct nilfs_fid, parent_gen) / 4)
47#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4)
44 48
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) 49static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{ 50{
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
70 ino = nilfs_inode_by_name(dir, &dentry->d_name); 74 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 75 inode = NULL;
72 if (ino) { 76 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 77 inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
74 if (IS_ERR(inode)) 78 if (IS_ERR(inode))
75 return ERR_CAST(inode); 79 return ERR_CAST(inode);
76 } 80 }
77 return d_splice_alias(inode, dentry); 81 return d_splice_alias(inode, dentry);
78} 82}
79 83
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
87 if (!ino)
88 return ERR_PTR(-ENOENT);
89
90 inode = nilfs_iget(child->d_inode->i_sb, ino);
91 if (IS_ERR(inode))
92 return ERR_CAST(inode);
93 return d_obtain_alias(inode);
94}
95
96/* 84/*
97 * By the time this is called, we already have created 85 * By the time this is called, we already have created
98 * the directory cache entry for the new file, but it 86 * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
219 207
220 inode->i_ctime = CURRENT_TIME; 208 inode->i_ctime = CURRENT_TIME;
221 inode_inc_link_count(inode); 209 inode_inc_link_count(inode);
222 atomic_inc(&inode->i_count); 210 ihold(inode);
223 211
224 err = nilfs_add_nondir(dentry, inode); 212 err = nilfs_add_nondir(dentry, inode);
225 if (!err) 213 if (!err)
@@ -468,6 +456,115 @@ out:
468 return err; 456 return err;
469} 457}
470 458
459/*
460 * Export operations
461 */
462static struct dentry *nilfs_get_parent(struct dentry *child)
463{
464 unsigned long ino;
465 struct inode *inode;
466 struct qstr dotdot = {.name = "..", .len = 2};
467 struct nilfs_root *root;
468
469 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
470 if (!ino)
471 return ERR_PTR(-ENOENT);
472
473 root = NILFS_I(child->d_inode)->i_root;
474
475 inode = nilfs_iget(child->d_inode->i_sb, root, ino);
476 if (IS_ERR(inode))
477 return ERR_CAST(inode);
478
479 return d_obtain_alias(inode);
480}
481
482static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
483 u64 ino, u32 gen)
484{
485 struct nilfs_root *root;
486 struct inode *inode;
487
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE);
490
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
492 if (!root)
493 return ERR_PTR(-ESTALE);
494
495 inode = nilfs_iget(sb, root, ino);
496 nilfs_put_root(root);
497
498 if (IS_ERR(inode))
499 return ERR_CAST(inode);
500 if (gen && inode->i_generation != gen) {
501 iput(inode);
502 return ERR_PTR(-ESTALE);
503 }
504 return d_obtain_alias(inode);
505}
506
507static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
508 int fh_len, int fh_type)
509{
510 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
511
512 if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
513 fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
514 (fh_type != FILEID_NILFS_WITH_PARENT &&
515 fh_type != FILEID_NILFS_WITHOUT_PARENT))
516 return NULL;
517
518 return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
519}
520
521static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
522 int fh_len, int fh_type)
523{
524 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
525
526 if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
527 fh_type != FILEID_NILFS_WITH_PARENT)
528 return NULL;
529
530 return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
531}
532
533static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
534 int connectable)
535{
536 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
537 struct inode *inode = dentry->d_inode;
538 struct nilfs_root *root = NILFS_I(inode)->i_root;
539 int type;
540
541 if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
542 (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
543 return 255;
544
545 fid->cno = root->cno;
546 fid->ino = inode->i_ino;
547 fid->gen = inode->i_generation;
548
549 if (connectable && !S_ISDIR(inode->i_mode)) {
550 struct inode *parent;
551
552 spin_lock(&dentry->d_lock);
553 parent = dentry->d_parent->d_inode;
554 fid->parent_ino = parent->i_ino;
555 fid->parent_gen = parent->i_generation;
556 spin_unlock(&dentry->d_lock);
557
558 type = FILEID_NILFS_WITH_PARENT;
559 *lenp = NILFS_FID_SIZE_CONNECTABLE;
560 } else {
561 type = FILEID_NILFS_WITHOUT_PARENT;
562 *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
563 }
564
565 return type;
566}
567
471const struct inode_operations nilfs_dir_inode_operations = { 568const struct inode_operations nilfs_dir_inode_operations = {
472 .create = nilfs_create, 569 .create = nilfs_create,
473 .lookup = nilfs_lookup, 570 .lookup = nilfs_lookup,
@@ -480,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
480 .rename = nilfs_rename, 577 .rename = nilfs_rename,
481 .setattr = nilfs_setattr, 578 .setattr = nilfs_setattr,
482 .permission = nilfs_permission, 579 .permission = nilfs_permission,
580 .fiemap = nilfs_fiemap,
483}; 581};
484 582
485const struct inode_operations nilfs_special_inode_operations = { 583const struct inode_operations nilfs_special_inode_operations = {
@@ -491,4 +589,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
491 .readlink = generic_readlink, 589 .readlink = generic_readlink,
492 .follow_link = page_follow_link_light, 590 .follow_link = page_follow_link_light,
493 .put_link = page_put_link, 591 .put_link = page_put_link,
592 .permission = nilfs_permission,
593};
594
595const struct export_operations nilfs_export_ops = {
596 .encode_fh = nilfs_encode_fh,
597 .fh_to_dentry = nilfs_fh_to_dentry,
598 .fh_to_parent = nilfs_fh_to_parent,
599 .get_parent = nilfs_get_parent,
494}; 600};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index d3d54046e5f8..777e8fd04304 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -59,6 +59,7 @@ struct nilfs_inode_info {
59#endif 59#endif
60 struct buffer_head *i_bh; /* i_bh contains a new or dirty 60 struct buffer_head *i_bh; /* i_bh contains a new or dirty
61 disk inode */ 61 disk inode */
62 struct nilfs_root *i_root;
62 struct inode vfs_inode; 63 struct inode vfs_inode;
63}; 64};
64 65
@@ -100,7 +101,6 @@ enum {
100 NILFS_I_INODE_DIRTY, /* write_inode is requested */ 101 NILFS_I_INODE_DIRTY, /* write_inode is requested */
101 NILFS_I_BMAP, /* has bmap and btnode_cache */ 102 NILFS_I_BMAP, /* has bmap and btnode_cache */
102 NILFS_I_GCINODE, /* inode for GC, on memory only */ 103 NILFS_I_GCINODE, /* inode for GC, on memory only */
103 NILFS_I_GCDAT, /* shadow DAT, on memory only */
104}; 104};
105 105
106/* 106/*
@@ -190,22 +190,14 @@ static inline int nilfs_doing_construction(void)
190 return nilfs_test_transaction_flag(NILFS_TI_WRITER); 190 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
191} 191}
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{
195 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
196}
197
198/* 193/*
199 * function prototype 194 * function prototype
200 */ 195 */
201#ifdef CONFIG_NILFS_POSIX_ACL 196#ifdef CONFIG_NILFS_POSIX_ACL
202#error "NILFS: not yet supported POSIX ACL" 197#error "NILFS: not yet supported POSIX ACL"
203extern int nilfs_permission(struct inode *, int, struct nameidata *);
204extern int nilfs_acl_chmod(struct inode *); 198extern int nilfs_acl_chmod(struct inode *);
205extern int nilfs_init_acl(struct inode *, struct inode *); 199extern int nilfs_init_acl(struct inode *, struct inode *);
206#else 200#else
207#define nilfs_permission NULL
208
209static inline int nilfs_acl_chmod(struct inode *inode) 201static inline int nilfs_acl_chmod(struct inode *inode)
210{ 202{
211 return 0; 203 return 0;
@@ -247,24 +239,28 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
247extern void nilfs_set_inode_flags(struct inode *); 239extern void nilfs_set_inode_flags(struct inode *);
248extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); 240extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
249extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); 241extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
250extern struct inode *nilfs_iget(struct super_block *, unsigned long); 242struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
243 unsigned long ino);
244struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
245 unsigned long ino);
246struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
247 unsigned long ino);
248extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
249 unsigned long ino, __u64 cno);
251extern void nilfs_update_inode(struct inode *, struct buffer_head *); 250extern void nilfs_update_inode(struct inode *, struct buffer_head *);
252extern void nilfs_truncate(struct inode *); 251extern void nilfs_truncate(struct inode *);
253extern void nilfs_evict_inode(struct inode *); 252extern void nilfs_evict_inode(struct inode *);
254extern int nilfs_setattr(struct dentry *, struct iattr *); 253extern int nilfs_setattr(struct dentry *, struct iattr *);
255extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 254int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
256 struct buffer_head **); 255int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
257extern int nilfs_inode_dirty(struct inode *); 256extern int nilfs_inode_dirty(struct inode *);
258extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, 257int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
259 unsigned);
260extern int nilfs_mark_inode_dirty(struct inode *); 258extern int nilfs_mark_inode_dirty(struct inode *);
261extern void nilfs_dirty_inode(struct inode *); 259extern void nilfs_dirty_inode(struct inode *);
262 260int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
263/* namei.c */ 261 __u64 start, __u64 len);
264extern struct dentry *nilfs_get_parent(struct dentry *);
265 262
266/* super.c */ 263/* super.c */
267extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
268extern struct inode *nilfs_alloc_inode(struct super_block *); 264extern struct inode *nilfs_alloc_inode(struct super_block *);
269extern void nilfs_destroy_inode(struct inode *); 265extern void nilfs_destroy_inode(struct inode *);
270extern void nilfs_error(struct super_block *, const char *, const char *, ...) 266extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -283,8 +279,9 @@ extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
283 int flip); 279 int flip);
284extern int nilfs_commit_super(struct nilfs_sb_info *, int); 280extern int nilfs_commit_super(struct nilfs_sb_info *, int);
285extern int nilfs_cleanup_super(struct nilfs_sb_info *); 281extern int nilfs_cleanup_super(struct nilfs_sb_info *);
286extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); 282int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
287extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); 283 struct nilfs_root **root);
284int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
288 285
289/* gcinode.c */ 286/* gcinode.c */
290int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, 287int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -292,16 +289,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
292int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, 289int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
293 struct buffer_head **); 290 struct buffer_head **);
294int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); 291int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
295int nilfs_init_gccache(struct the_nilfs *); 292int nilfs_init_gcinode(struct inode *inode);
296void nilfs_destroy_gccache(struct the_nilfs *); 293void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
297void nilfs_clear_gcinode(struct inode *);
298struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
299void nilfs_remove_all_gcinode(struct the_nilfs *);
300
301/* gcdat.c */
302int nilfs_init_gcdat_inode(struct the_nilfs *);
303void nilfs_commit_gcdat_inode(struct the_nilfs *);
304void nilfs_clear_gcdat_inode(struct the_nilfs *);
305 294
306/* 295/*
307 * Inodes and files operations 296 * Inodes and files operations
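The nilfs.h hunks above switch inode lookup to a per-checkpoint model: nilfs_iget(), nilfs_iget_locked() and nilfs_ilookup() now take a struct nilfs_root, while nilfs_load_inode_block() and nilfs_set_file_dirty() lose their nilfs_sb_info argument. A minimal caller-side sketch of the new signature (the wrapper function itself is hypothetical):

    /* Hypothetical caller; only the prototype change is illustrated. */
    static struct inode *example_lookup(struct super_block *sb,
                                        struct nilfs_root *root,
                                        unsigned long ino)
    {
            /* old: nilfs_iget(sb, ino) resolved against the single ifile;
             * new: the checkpoint tree (root) selects which ifile to use. */
            return nilfs_iget(sb, root, ino);
    }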
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index aab11db2cb08..0c432416cfef 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -79,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
79{ 79{
80 int blkbits = inode->i_blkbits; 80 int blkbits = inode->i_blkbits;
81 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); 81 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
82 struct page *page, *opage; 82 struct page *page;
83 struct buffer_head *bh, *obh; 83 struct buffer_head *bh;
84 84
85 page = grab_cache_page(mapping, index); 85 page = grab_cache_page(mapping, index);
86 if (unlikely(!page)) 86 if (unlikely(!page))
@@ -92,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
92 page_cache_release(page); 92 page_cache_release(page);
93 return NULL; 93 return NULL;
94 } 94 }
95 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
96 /*
97 * Shadow page cache uses assoc_mapping to point its original
98 * page cache. The following code tries the original cache
99 * if the given cache is a shadow and it didn't hit.
100 */
101 opage = find_lock_page(mapping->assoc_mapping, index);
102 if (!opage)
103 return bh;
104
105 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
106 b_state);
107 if (buffer_uptodate(obh)) {
108 nilfs_copy_buffer(bh, obh);
109 if (buffer_dirty(obh)) {
110 nilfs_mark_buffer_dirty(bh);
111 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
112 nilfs_mdt_mark_dirty(inode);
113 }
114 }
115 brelse(obh);
116 unlock_page(opage);
117 page_cache_release(opage);
118 }
119 return bh; 95 return bh;
120} 96}
121 97
@@ -131,6 +107,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
131 lock_buffer(bh); 107 lock_buffer(bh);
132 clear_buffer_nilfs_volatile(bh); 108 clear_buffer_nilfs_volatile(bh);
133 clear_buffer_nilfs_checked(bh); 109 clear_buffer_nilfs_checked(bh);
110 clear_buffer_nilfs_redirected(bh);
134 clear_buffer_dirty(bh); 111 clear_buffer_dirty(bh);
135 if (nilfs_page_buffers_clean(page)) 112 if (nilfs_page_buffers_clean(page))
136 __nilfs_clear_page_dirty(page); 113 __nilfs_clear_page_dirty(page);
@@ -483,6 +460,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
483 clear_buffer_dirty(bh); 460 clear_buffer_dirty(bh);
484 clear_buffer_nilfs_volatile(bh); 461 clear_buffer_nilfs_volatile(bh);
485 clear_buffer_nilfs_checked(bh); 462 clear_buffer_nilfs_checked(bh);
463 clear_buffer_nilfs_redirected(bh);
486 clear_buffer_uptodate(bh); 464 clear_buffer_uptodate(bh);
487 clear_buffer_mapped(bh); 465 clear_buffer_mapped(bh);
488 unlock_buffer(bh); 466 unlock_buffer(bh);
@@ -514,6 +492,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
514 return nc; 492 return nc;
515} 493}
516 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi,
510 const struct address_space_operations *aops)
511{
512 mapping->host = NULL;
513 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops;
518}
519
517/* 520/*
518 * NILFS2 needs clear_page_dirty() in the following two cases: 521 * NILFS2 needs clear_page_dirty() in the following two cases:
519 * 522 *
@@ -543,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
543 } 546 }
544 return TestClearPageDirty(page); 547 return TestClearPageDirty(page);
545} 548}
549
550/**
551 * nilfs_find_uncommitted_extent - find extent of uncommitted data
552 * @inode: inode
553 * @start_blk: start block offset (in)
554 * @blkoff: start offset of the found extent (out)
555 *
556 * This function searches for an extent of buffers marked
557 * "delayed" that starts at a block offset equal to or larger
558 * than @start_blk. If such an extent is found, the start offset
559 * is stored in @blkoff and its length in blocks is returned.
560 * Otherwise, zero is returned.
561 */
562unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
563 sector_t start_blk,
564 sector_t *blkoff)
565{
566 unsigned int i;
567 pgoff_t index;
568 unsigned int nblocks_in_page;
569 unsigned long length = 0;
570 sector_t b;
571 struct pagevec pvec;
572 struct page *page;
573
574 if (inode->i_mapping->nrpages == 0)
575 return 0;
576
577 index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
578 nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
579
580 pagevec_init(&pvec, 0);
581
582repeat:
583 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
584 pvec.pages);
585 if (pvec.nr == 0)
586 return length;
587
588 if (length > 0 && pvec.pages[0]->index > index)
589 goto out;
590
591 b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
592 i = 0;
593 do {
594 page = pvec.pages[i];
595
596 lock_page(page);
597 if (page_has_buffers(page)) {
598 struct buffer_head *bh, *head;
599
600 bh = head = page_buffers(page);
601 do {
602 if (b < start_blk)
603 continue;
604 if (buffer_delay(bh)) {
605 if (length == 0)
606 *blkoff = b;
607 length++;
608 } else if (length > 0) {
609 goto out_locked;
610 }
611 } while (++b, bh = bh->b_this_page, bh != head);
612 } else {
613 if (length > 0)
614 goto out_locked;
615
616 b += nblocks_in_page;
617 }
618 unlock_page(page);
619
620 } while (++i < pagevec_count(&pvec));
621
622 index = page->index + 1;
623 pagevec_release(&pvec);
624 cond_resched();
625 goto repeat;
626
627out_locked:
628 unlock_page(page);
629out:
630 pagevec_release(&pvec);
631 return length;
632}
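nilfs_find_uncommitted_extent() above walks the page cache with find_get_pages_contig() and counts consecutive buffers carrying the delayed flag. As a sketch, a fiemap-style caller might loop over the delayed extents like this (report_extent() is a hypothetical stand-in; the real nilfs_fiemap() is only declared in this series):

    /* Hypothetical loop over uncommitted (delayed-allocated) extents. */
    sector_t blkoff;
    sector_t blk = start_blk;
    unsigned long n;

    while ((n = nilfs_find_uncommitted_extent(inode, blk, &blkoff)) != 0) {
            /* [blkoff, blkoff + n) is dirty in memory, not yet on disk */
            report_extent(blkoff, n);  /* e.g. feed fiemap_fill_next_extent() */
            blk = blkoff + n;          /* resume the scan past this extent */
    }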
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f53d8da41ed7..622df27cd891 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -35,12 +35,14 @@ enum {
35 BH_NILFS_Node, 35 BH_NILFS_Node,
36 BH_NILFS_Volatile, 36 BH_NILFS_Volatile,
37 BH_NILFS_Checked, 37 BH_NILFS_Checked,
38 BH_NILFS_Redirected,
38}; 39};
39 40
40BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ 41BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
41BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ 42BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
42BUFFER_FNS(NILFS_Volatile, nilfs_volatile) 43BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
43BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ 44BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
45BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
44 46
45 47
46void nilfs_mark_buffer_dirty(struct buffer_head *bh); 48void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -59,7 +61,14 @@ void nilfs_free_private_page(struct page *);
59int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
60void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
61void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk,
71 sector_t *blkoff);
63 72
64#define NILFS_PAGE_BUG(page, m, a...) \ 73#define NILFS_PAGE_BUG(page, m, a...) \
65 do { nilfs_page_bug(page); BUG(); } while (0) 74 do { nilfs_page_bug(page); BUG(); } while (0)
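The new BH_NILFS_Redirected bit is wired up through the kernel's BUFFER_FNS() macro from &lt;linux/buffer_head.h&gt;, which expands to set/clear/test helpers over bh-&gt;b_state. So the one-line addition above yields, schematically:

    /* Helpers generated by BUFFER_FNS(NILFS_Redirected, nilfs_redirected): */
    set_buffer_nilfs_redirected(bh);    /* set_bit(BH_NILFS_Redirected, &bh->b_state) */
    clear_buffer_nilfs_redirected(bh);  /* clear_bit(...), used in page.c/segment.c above */
    if (buffer_nilfs_redirected(bh))    /* test_bit(...) */
            ;                           /* buffer was redirected to a copy for writeback */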
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d0c35ef39f6a..3dfcd3b7d389 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -440,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
440 segnum[2] = ri->ri_segnum; 440 segnum[2] = ri->ri_segnum;
441 segnum[3] = ri->ri_nextnum; 441 segnum[3] = ri->ri_nextnum;
442 442
443 nilfs_attach_writer(nilfs, sbi);
444 /* 443 /*
445 * Releasing the next segment of the latest super root. 444 * Releasing the next segment of the latest super root.
446 * The next segment is invalidated by this recovery. 445 * The next segment is invalidated by this recovery.
@@ -480,7 +479,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
480 479
481 failed: 480 failed:
482 /* No need to recover sufile because it will be destroyed on error */ 481 /* No need to recover sufile because it will be destroyed on error */
483 nilfs_detach_writer(nilfs, sbi);
484 return err; 482 return err;
485} 483}
486 484
@@ -504,6 +502,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
504 502
505static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
506 struct nilfs_sb_info *sbi, 504 struct nilfs_sb_info *sbi,
505 struct nilfs_root *root,
507 struct list_head *head, 506 struct list_head *head,
508 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
509{ 508{
@@ -515,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
515 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
516 515
517 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
518 inode = nilfs_iget(sbi->s_super, rb->ino); 517 inode = nilfs_iget(sbi->s_super, root, rb->ino);
519 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
520 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
521 inode = NULL; 520 inode = NULL;
@@ -536,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
536 if (unlikely(err)) 535 if (unlikely(err))
537 goto failed_page; 536 goto failed_page;
538 537
539 err = nilfs_set_file_dirty(sbi, inode, 1); 538 err = nilfs_set_file_dirty(inode, 1);
540 if (unlikely(err)) 539 if (unlikely(err))
541 goto failed_page; 540 goto failed_page;
542 541
@@ -578,6 +577,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
578 */ 577 */
579static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
580 struct nilfs_sb_info *sbi, 579 struct nilfs_sb_info *sbi,
580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
583 struct buffer_head *bh_sum = NULL; 583 struct buffer_head *bh_sum = NULL;
@@ -597,7 +597,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
597 }; 597 };
598 int state = RF_INIT_ST; 598 int state = RF_INIT_ST;
599 599
600 nilfs_attach_writer(nilfs, sbi);
601 pseg_start = ri->ri_lsegs_start; 600 pseg_start = ri->ri_lsegs_start;
602 seg_seq = ri->ri_lsegs_start_seq; 601 seg_seq = ri->ri_lsegs_start_seq;
603 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); 602 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
@@ -649,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
649 goto failed; 648 goto failed;
650 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
651 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
652 nilfs, sbi, &dsync_blocks, 651 nilfs, sbi, root, &dsync_blocks,
653 &nsalvaged_blocks); 652 &nsalvaged_blocks);
654 if (unlikely(err)) 653 if (unlikely(err))
655 goto failed; 654 goto failed;
@@ -688,7 +687,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
688 out: 687 out:
689 brelse(bh_sum); 688 brelse(bh_sum);
690 dispose_recovery_list(&dsync_blocks); 689 dispose_recovery_list(&dsync_blocks);
691 nilfs_detach_writer(nilfs, sbi);
692 return err; 690 return err;
693 691
694 confused: 692 confused:
@@ -746,19 +744,20 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
746 struct nilfs_sb_info *sbi, 744 struct nilfs_sb_info *sbi,
747 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
748{ 746{
747 struct nilfs_root *root;
749 int err; 748 int err;
750 749
751 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
752 return 0; 751 return 0;
753 752
754 err = nilfs_attach_checkpoint(sbi, ri->ri_cno); 753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
755 if (unlikely(err)) { 754 if (unlikely(err)) {
756 printk(KERN_ERR 755 printk(KERN_ERR
757 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
758 return err; 757 return err;
759 } 758 }
760 759
761 err = nilfs_do_roll_forward(nilfs, sbi, ri); 760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
762 if (unlikely(err)) 761 if (unlikely(err))
763 goto failed; 762 goto failed;
764 763
@@ -770,7 +769,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
770 goto failed; 769 goto failed;
771 } 770 }
772 771
773 err = nilfs_attach_segment_constructor(sbi); 772 err = nilfs_attach_segment_constructor(sbi, root);
774 if (unlikely(err)) 773 if (unlikely(err))
775 goto failed; 774 goto failed;
776 775
@@ -788,7 +787,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
788 } 787 }
789 788
790 failed: 789 failed:
791 nilfs_detach_checkpoint(sbi); 790 nilfs_put_root(root);
792 return err; 791 return err;
793} 792}
794 793
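Recovery now pins the checkpoint tree explicitly: nilfs_attach_checkpoint() hands back a refcounted nilfs_root, and the old nilfs_detach_checkpoint()/nilfs_attach_writer() pairs disappear. The control flow of nilfs_salvage_orphan_logs() above reduces to roughly the following (error handling condensed into comments):

    /* Schematic of the new reference flow in orphan-log salvage. */
    struct nilfs_root *root;
    int err;

    err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root); /* takes a ref */
    if (!err) {
            err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
            /* ... segment preparation and constructor attach as above ... */
            nilfs_put_root(root);                                /* drops the ref */
    }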
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504a..7a17715f215f 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs; 30struct the_nilfs;
39struct nilfs_sc_info; 31struct nilfs_sc_info;
40 32
@@ -42,11 +34,6 @@ struct nilfs_sc_info;
42 * NILFS super-block data in memory 34 * NILFS super-block data in memory
43 */ 35 */
44struct nilfs_sb_info { 36struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */ 37 /* Mount options */
51 unsigned long s_mount_opt; 38 unsigned long s_mount_opt;
52 uid_t s_resuid; 39 uid_t s_resuid;
@@ -59,8 +46,6 @@ struct nilfs_sb_info {
59 /* Fundamental members */ 46 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */ 47 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs; 48 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63 atomic_t s_count; /* reference count */
64 49
65 /* Segment constructor */ 50 /* Segment constructor */
66 struct list_head s_dirty_files; /* dirty files list */ 51 struct list_head s_dirty_files; /* dirty files list */
@@ -68,9 +53,6 @@ struct nilfs_sb_info {
68 spinlock_t s_inode_lock; /* Lock for the nilfs inode. 53 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
69 It covers s_dirty_files list */ 54 It covers s_dirty_files list */
70 55
71 /* Metadata files */
72 struct inode *s_ifile; /* index file inode */
73
74 /* Inode allocator */ 56 /* Inode allocator */
75 spinlock_t s_next_gen_lock; 57 spinlock_t s_next_gen_lock;
76 u32 s_next_generation; 58 u32 s_next_generation;
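The fields deleted from nilfs_sb_info above (s_snapshot_cno, s_inodes_count, s_blocks_count, s_ifile, and the ns_supers linkage) all migrate to the per-checkpoint nilfs_root object used throughout this series. Its definition is not part of this diff; judging only from the accessors visible here, it looks at least like the following reconstruction (an assumption; see the_nilfs.h for the real layout):

    /* Reconstructed shape of struct nilfs_root (assumed fields only). */
    struct nilfs_root {
            __u64 cno;                /* checkpoint number of this tree */
            struct inode *ifile;      /* per-checkpoint index file */
            atomic_t count;           /* nilfs_get_root()/nilfs_put_root() refs */
            atomic_t inodes_count;    /* was sbi->s_inodes_count */
            atomic_t blocks_count;    /* was sbi->s_blocks_count */
    };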
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4588fb9e93df..0f83e93935b2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
371 struct bio *bio = wi->bio; 371 struct bio *bio = wi->bio;
372 int err; 372 int err;
373 373
374 if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) { 374 if (segbuf->sb_nbio > 0 &&
375 bdi_write_congested(segbuf->sb_super->s_bdi)) {
375 wait_for_completion(&segbuf->sb_bio_event); 376 wait_for_completion(&segbuf->sb_bio_event);
376 segbuf->sb_nbio--; 377 segbuf->sb_nbio--;
377 if (unlikely(atomic_read(&segbuf->sb_err))) { 378 if (unlikely(atomic_read(&segbuf->sb_err))) {
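The congestion check above now goes through the superblock's BDI (segbuf-&gt;sb_super-&gt;s_bdi) instead of a the_nilfs back-pointer; the throttling idea is unchanged: if the device is congested and at least one bio is already in flight, reap one completion before submitting the next. Schematically (fields as in the hunk; a sketch, not the full submit path):

    /* Schematic write throttle around bio submission. */
    if (segbuf->sb_nbio > 0 &&
        bdi_write_congested(segbuf->sb_super->s_bdi)) {
            wait_for_completion(&segbuf->sb_bio_event); /* one bio finished */
            segbuf->sb_nbio--;
    }
    submit_bio(mode, bio);   /* mode: WRITE or WRITE_SYNC */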
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9fd051a33c4f..55ebae5c7f39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
191 if (ret > 0) 191 if (ret > 0)
192 return 0; 192 return 0;
193 193
194 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195
194 sbi = NILFS_SB(sb); 196 sbi = NILFS_SB(sb);
195 nilfs = sbi->s_nilfs; 197 nilfs = sbi->s_nilfs;
196 down_read(&nilfs->ns_segctor_sem); 198 down_read(&nilfs->ns_segctor_sem);
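The vfs_check_frozen() call inserted above makes writers cooperate with filesystem freezing (the nilfs_freeze() handler added in super.c later in this series): a task entering a new transaction sleeps while the superblock is frozen. Assuming the 2.6.37-era freeze API:

    /* Freeze gate: sleeps until sb->s_frozen drops below SB_FREEZE_WRITE. */
    vfs_check_frozen(sb, SB_FREEZE_WRITE);
    /* only past this point may a new segment-construction transaction begin */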
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
366 368
367 if (nilfs_doing_gc()) 369 if (nilfs_doing_gc())
368 flags = NILFS_SS_GC; 370 flags = NILFS_SS_GC;
369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, 371 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
370 sci->sc_sbi->s_nilfs->ns_cno);
371 if (unlikely(err)) 372 if (unlikely(err))
372 return err; 373 return err;
373 374
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
440 struct nilfs_finfo *finfo; 441 struct nilfs_finfo *finfo;
441 struct nilfs_inode_info *ii; 442 struct nilfs_inode_info *ii;
442 struct nilfs_segment_buffer *segbuf; 443 struct nilfs_segment_buffer *segbuf;
444 __u64 cno;
443 445
444 if (sci->sc_blk_cnt == 0) 446 if (sci->sc_blk_cnt == 0)
445 return; 447 return;
446 448
447 ii = NILFS_I(inode); 449 ii = NILFS_I(inode);
450
451 if (test_bit(NILFS_I_GCINODE, &ii->i_state))
452 cno = ii->i_cno;
453 else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
454 cno = 0;
455 else
456 cno = sci->sc_cno;
457
448 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, 458 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
449 sizeof(*finfo)); 459 sizeof(*finfo));
450 finfo->fi_ino = cpu_to_le64(inode->i_ino); 460 finfo->fi_ino = cpu_to_le64(inode->i_ino);
451 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); 461 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
452 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); 462 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
453 finfo->fi_cno = cpu_to_le64(ii->i_cno); 463 finfo->fi_cno = cpu_to_le64(cno);
454 464
455 segbuf = sci->sc_curseg; 465 segbuf = sci->sc_curseg;
456 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + 466 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -494,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
494 return err; 504 return err;
495} 505}
496 506
497static int nilfs_handle_bmap_error(int err, const char *fname,
498 struct inode *inode, struct super_block *sb)
499{
500 if (err == -EINVAL) {
501 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
502 inode->i_ino);
503 err = -EIO;
504 }
505 return err;
506}
507
508/* 507/*
509 * Callback functions that enumerate, mark, and collect dirty blocks 508 * Callback functions that enumerate, mark, and collect dirty blocks
510 */ 509 */
@@ -514,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
514 int err; 513 int err;
515 514
516 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 515 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
517 if (unlikely(err < 0)) 516 if (err < 0)
518 return nilfs_handle_bmap_error(err, __func__, inode, 517 return err;
519 sci->sc_super);
520 518
521 err = nilfs_segctor_add_file_block(sci, bh, inode, 519 err = nilfs_segctor_add_file_block(sci, bh, inode,
522 sizeof(struct nilfs_binfo_v)); 520 sizeof(struct nilfs_binfo_v));
@@ -529,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
529 struct buffer_head *bh, 527 struct buffer_head *bh,
530 struct inode *inode) 528 struct inode *inode)
531{ 529{
532 int err; 530 return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
533
534 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
535 if (unlikely(err < 0))
536 return nilfs_handle_bmap_error(err, __func__, inode,
537 sci->sc_super);
538 return 0;
539} 531}
540 532
541static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, 533static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -578,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
578 int err; 570 int err;
579 571
580 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 572 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
581 if (unlikely(err < 0)) 573 if (err < 0)
582 return nilfs_handle_bmap_error(err, __func__, inode, 574 return err;
583 sci->sc_super);
584 575
585 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); 576 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
586 if (!err) 577 if (!err)
@@ -755,20 +746,19 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
755 } 746 }
756} 747}
757 748
758static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) 749static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
750 struct nilfs_root *root)
759{ 751{
760 struct the_nilfs *nilfs = sbi->s_nilfs;
761 int ret = 0; 752 int ret = 0;
762 753
763 if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) 754 if (nilfs_mdt_fetch_dirty(root->ifile))
764 ret++; 755 ret++;
765 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) 756 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
766 ret++; 757 ret++;
767 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) 758 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
768 ret++; 759 ret++;
769 if (ret || nilfs_doing_gc()) 760 if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
770 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) 761 ret++;
771 ret++;
772 return ret; 762 return ret;
773} 763}
774 764
@@ -785,7 +775,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
785 struct nilfs_sb_info *sbi = sci->sc_sbi; 775 struct nilfs_sb_info *sbi = sci->sc_sbi;
786 int ret = 0; 776 int ret = 0;
787 777
788 if (nilfs_test_metadata_dirty(sbi)) 778 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
789 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 779 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
790 780
791 spin_lock(&sbi->s_inode_lock); 781 spin_lock(&sbi->s_inode_lock);
@@ -801,10 +791,10 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
801 struct nilfs_sb_info *sbi = sci->sc_sbi; 791 struct nilfs_sb_info *sbi = sci->sc_sbi;
802 struct the_nilfs *nilfs = sbi->s_nilfs; 792 struct the_nilfs *nilfs = sbi->s_nilfs;
803 793
804 nilfs_mdt_clear_dirty(sbi->s_ifile); 794 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
805 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
806 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 796 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
807 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 797 nilfs_mdt_clear_dirty(nilfs->ns_dat);
808} 798}
809 799
810static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -848,9 +838,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
848 raw_cp->cp_snapshot_list.ssl_next = 0; 838 raw_cp->cp_snapshot_list.ssl_next = 0;
849 raw_cp->cp_snapshot_list.ssl_prev = 0; 839 raw_cp->cp_snapshot_list.ssl_prev = 0;
850 raw_cp->cp_inodes_count = 840 raw_cp->cp_inodes_count =
851 cpu_to_le64(atomic_read(&sbi->s_inodes_count)); 841 cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
852 raw_cp->cp_blocks_count = 842 raw_cp->cp_blocks_count =
853 cpu_to_le64(atomic_read(&sbi->s_blocks_count)); 843 cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
854 raw_cp->cp_nblk_inc = 844 raw_cp->cp_nblk_inc =
855 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); 845 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
856 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); 846 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +851,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
861 else 851 else
862 nilfs_checkpoint_set_minor(raw_cp); 852 nilfs_checkpoint_set_minor(raw_cp);
863 853
864 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); 854 nilfs_write_inode_common(sci->sc_root->ifile,
855 &raw_cp->cp_ifile_inode, 1);
865 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); 856 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
866 return 0; 857 return 0;
867 858
@@ -886,13 +877,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
886 } 877 }
887} 878}
888 879
889static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, 880static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
890 struct inode *ifile)
891{ 881{
892 struct nilfs_inode_info *ii; 882 struct nilfs_inode_info *ii;
893 883
894 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { 884 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
895 nilfs_fill_in_file_bmap(ifile, ii); 885 nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
896 set_bit(NILFS_I_COLLECTED, &ii->i_state); 886 set_bit(NILFS_I_COLLECTED, &ii->i_state);
897 } 887 }
898} 888}
@@ -913,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
913 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 903 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
914 raw_sr->sr_flags = 0; 904 raw_sr->sr_flags = 0;
915 905
916 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
917 NILFS_SR_DAT_OFFSET(isz), 1); 907 NILFS_SR_DAT_OFFSET(isz), 1);
918 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + 908 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
919 NILFS_SR_CPFILE_OFFSET(isz), 1); 909 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1135,7 +1125,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1135 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; 1125 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1136 /* Fall through */ 1126 /* Fall through */
1137 case NILFS_ST_IFILE: 1127 case NILFS_ST_IFILE:
1138 err = nilfs_segctor_scan_file(sci, sbi->s_ifile, 1128 err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
1139 &nilfs_sc_file_ops); 1129 &nilfs_sc_file_ops);
1140 if (unlikely(err)) 1130 if (unlikely(err))
1141 break; 1131 break;
@@ -1169,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1169 sci->sc_stage.scnt++; /* Fall through */ 1159 sci->sc_stage.scnt++; /* Fall through */
1170 case NILFS_ST_DAT: 1160 case NILFS_ST_DAT:
1171 dat_stage: 1161 dat_stage:
1172 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), 1162 err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
1173 &nilfs_sc_dat_ops); 1163 &nilfs_sc_dat_ops);
1174 if (unlikely(err)) 1164 if (unlikely(err))
1175 break; 1165 break;
@@ -1553,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1553 return 0; 1543 return 0;
1554 1544
1555 failed_bmap: 1545 failed_bmap:
1556 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1557 return err; 1546 return err;
1558} 1547}
1559 1548
@@ -1599,7 +1588,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1599 kunmap_atomic(kaddr, KM_USER0); 1588 kunmap_atomic(kaddr, KM_USER0);
1600 1589
1601 if (!TestSetPageWriteback(clone_page)) 1590 if (!TestSetPageWriteback(clone_page))
1602 inc_zone_page_state(clone_page, NR_WRITEBACK); 1591 account_page_writeback(clone_page);
1603 unlock_page(clone_page); 1592 unlock_page(clone_page);
1604 1593
1605 return 0; 1594 return 0;
@@ -1773,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1773 if (!err) { 1762 if (!err) {
1774 set_buffer_uptodate(bh); 1763 set_buffer_uptodate(bh);
1775 clear_buffer_dirty(bh); 1764 clear_buffer_dirty(bh);
1765 clear_buffer_delay(bh);
1776 clear_buffer_nilfs_volatile(bh); 1766 clear_buffer_nilfs_volatile(bh);
1777 } 1767 }
1778 brelse(bh); /* for b_assoc_buffers */ 1768 brelse(bh); /* for b_assoc_buffers */
@@ -1899,7 +1889,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1899 b_assoc_buffers) { 1889 b_assoc_buffers) {
1900 set_buffer_uptodate(bh); 1890 set_buffer_uptodate(bh);
1901 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1892 clear_buffer_delay(bh);
1902 clear_buffer_nilfs_volatile(bh); 1893 clear_buffer_nilfs_volatile(bh);
1894 clear_buffer_nilfs_redirected(bh);
1903 if (bh == segbuf->sb_super_root) { 1895 if (bh == segbuf->sb_super_root) {
1904 if (bh->b_page != bd_page) { 1896 if (bh->b_page != bd_page) {
1905 end_page_writeback(bd_page); 1897 end_page_writeback(bd_page);
@@ -1936,11 +1928,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1936 1928
1937 nilfs_drop_collected_inodes(&sci->sc_dirty_files); 1929 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
1938 1930
1939 if (nilfs_doing_gc()) { 1931 if (nilfs_doing_gc())
1940 nilfs_drop_collected_inodes(&sci->sc_gc_inodes); 1932 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
1941 if (update_sr) 1933 else
1942 nilfs_commit_gcdat_inode(nilfs);
1943 } else
1944 nilfs->ns_nongc_ctime = sci->sc_seg_ctime; 1934 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
1945 1935
1946 sci->sc_nblk_inc += sci->sc_nblk_this_inc; 1936 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1976,7 +1966,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1976 struct nilfs_sb_info *sbi) 1966 struct nilfs_sb_info *sbi)
1977{ 1967{
1978 struct nilfs_inode_info *ii, *n; 1968 struct nilfs_inode_info *ii, *n;
1979 __u64 cno = sbi->s_nilfs->ns_cno; 1969 struct inode *ifile = sci->sc_root->ifile;
1980 1970
1981 spin_lock(&sbi->s_inode_lock); 1971 spin_lock(&sbi->s_inode_lock);
1982 retry: 1972 retry:
@@ -1987,14 +1977,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1987 1977
1988 spin_unlock(&sbi->s_inode_lock); 1978 spin_unlock(&sbi->s_inode_lock);
1989 err = nilfs_ifile_get_inode_block( 1979 err = nilfs_ifile_get_inode_block(
1990 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); 1980 ifile, ii->vfs_inode.i_ino, &ibh);
1991 if (unlikely(err)) { 1981 if (unlikely(err)) {
1992 nilfs_warning(sbi->s_super, __func__, 1982 nilfs_warning(sbi->s_super, __func__,
1993 "failed to get inode block.\n"); 1983 "failed to get inode block.\n");
1994 return err; 1984 return err;
1995 } 1985 }
1996 nilfs_mdt_mark_buffer_dirty(ibh); 1986 nilfs_mdt_mark_buffer_dirty(ibh);
1997 nilfs_mdt_mark_dirty(sbi->s_ifile); 1987 nilfs_mdt_mark_dirty(ifile);
1998 spin_lock(&sbi->s_inode_lock); 1988 spin_lock(&sbi->s_inode_lock);
1999 if (likely(!ii->i_bh)) 1989 if (likely(!ii->i_bh))
2000 ii->i_bh = ibh; 1990 ii->i_bh = ibh;
@@ -2002,7 +1992,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2002 brelse(ibh); 1992 brelse(ibh);
2003 goto retry; 1993 goto retry;
2004 } 1994 }
2005 ii->i_cno = cno;
2006 1995
2007 clear_bit(NILFS_I_QUEUED, &ii->i_state); 1996 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2008 set_bit(NILFS_I_BUSY, &ii->i_state); 1997 set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2011,8 +2000,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2011 } 2000 }
2012 spin_unlock(&sbi->s_inode_lock); 2001 spin_unlock(&sbi->s_inode_lock);
2013 2002
2014 NILFS_I(sbi->s_ifile)->i_cno = cno;
2015
2016 return 0; 2003 return 0;
2017} 2004}
2018 2005
@@ -2021,19 +2008,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021{ 2008{
2022 struct nilfs_transaction_info *ti = current->journal_info; 2009 struct nilfs_transaction_info *ti = current->journal_info;
2023 struct nilfs_inode_info *ii, *n; 2010 struct nilfs_inode_info *ii, *n;
2024 __u64 cno = sbi->s_nilfs->ns_cno;
2025 2011
2026 spin_lock(&sbi->s_inode_lock); 2012 spin_lock(&sbi->s_inode_lock);
2027 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2013 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2028 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2014 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2029 test_bit(NILFS_I_DIRTY, &ii->i_state)) { 2015 test_bit(NILFS_I_DIRTY, &ii->i_state))
2030 /* The current checkpoint number (=nilfs->ns_cno) is
2031 changed between check-in and check-out only if the
2032 super root is written out. So, we can update i_cno
2033 for the inodes that remain in the dirty list. */
2034 ii->i_cno = cno;
2035 continue; 2016 continue;
2036 } 2017
2037 clear_bit(NILFS_I_BUSY, &ii->i_state); 2018 clear_bit(NILFS_I_BUSY, &ii->i_state);
2038 brelse(ii->i_bh); 2019 brelse(ii->i_bh);
2039 ii->i_bh = NULL; 2020 ii->i_bh = NULL;
@@ -2054,12 +2035,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2054 int err; 2035 int err;
2055 2036
2056 sci->sc_stage.scnt = NILFS_ST_INIT; 2037 sci->sc_stage.scnt = NILFS_ST_INIT;
2038 sci->sc_cno = nilfs->ns_cno;
2057 2039
2058 err = nilfs_segctor_check_in_files(sci, sbi); 2040 err = nilfs_segctor_check_in_files(sci, sbi);
2059 if (unlikely(err)) 2041 if (unlikely(err))
2060 goto out; 2042 goto out;
2061 2043
2062 if (nilfs_test_metadata_dirty(sbi)) 2044 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
2063 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2045 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2064 2046
2065 if (nilfs_segctor_clean(sci)) 2047 if (nilfs_segctor_clean(sci))
@@ -2091,7 +2073,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2091 goto failed; 2073 goto failed;
2092 2074
2093 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2075 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2094 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2076 nilfs_segctor_fill_in_file_bmap(sci);
2095 2077
2096 if (mode == SC_LSEG_SR && 2078 if (mode == SC_LSEG_SR &&
2097 sci->sc_stage.scnt >= NILFS_ST_CPFILE) { 2079 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2452,9 +2434,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2452 list_for_each_entry_safe(ii, n, head, i_dirty) { 2434 list_for_each_entry_safe(ii, n, head, i_dirty) {
2453 if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) 2435 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2454 continue; 2436 continue;
2455 hlist_del_init(&ii->vfs_inode.i_hash);
2456 list_del_init(&ii->i_dirty); 2437 list_del_init(&ii->i_dirty);
2457 nilfs_clear_gcinode(&ii->vfs_inode); 2438 iput(&ii->vfs_inode);
2458 } 2439 }
2459} 2440}
2460 2441
@@ -2472,13 +2453,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2472 2453
2473 nilfs_transaction_lock(sbi, &ti, 1); 2454 nilfs_transaction_lock(sbi, &ti, 1);
2474 2455
2475 err = nilfs_init_gcdat_inode(nilfs); 2456 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2476 if (unlikely(err)) 2457 if (unlikely(err))
2477 goto out_unlock; 2458 goto out_unlock;
2478 2459
2479 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); 2460 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2480 if (unlikely(err)) 2461 if (unlikely(err)) {
2462 nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
2481 goto out_unlock; 2463 goto out_unlock;
2464 }
2482 2465
2483 sci->sc_freesegs = kbufs[4]; 2466 sci->sc_freesegs = kbufs[4];
2484 sci->sc_nfreesegs = argv[4].v_nmembs; 2467 sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2510,7 +2493,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2510 out_unlock: 2493 out_unlock:
2511 sci->sc_freesegs = NULL; 2494 sci->sc_freesegs = NULL;
2512 sci->sc_nfreesegs = 0; 2495 sci->sc_nfreesegs = 0;
2513 nilfs_clear_gcdat_inode(nilfs); 2496 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2514 nilfs_transaction_unlock(sbi); 2497 nilfs_transaction_unlock(sbi);
2515 return err; 2498 return err;
2516} 2499}
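nilfs_clean_segments() above replaces the dedicated GC DAT inode with a shadow map on the regular DAT: save the DAT's state before preparing garbage collection, restore it if preparation fails, and clear the shadow on every exit path. The bracketing, condensed from the hunk:

    /* Shadow-map bracketing around GC preparation (schematic). */
    err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);      /* snapshot DAT */
    if (!err) {
            err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
            if (err)
                    nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); /* roll back */
            /* ... run the constructor ... */
            nilfs_mdt_clear_shadow_map(nilfs->ns_dat);      /* drop snapshot */
    }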
@@ -2672,6 +2655,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2672} 2655}
2673 2656
2674static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) 2657static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2658 __acquires(&sci->sc_state_lock)
2659 __releases(&sci->sc_state_lock)
2675{ 2660{
2676 sci->sc_state |= NILFS_SEGCTOR_QUIT; 2661 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2677 2662
@@ -2686,7 +2671,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2686/* 2671/*
2687 * Setup & clean-up functions 2672 * Setup & clean-up functions
2688 */ 2673 */
2689static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) 2674static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2675 struct nilfs_root *root)
2690{ 2676{
2691 struct nilfs_sc_info *sci; 2677 struct nilfs_sc_info *sci;
2692 2678
@@ -2697,6 +2683,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2697 sci->sc_sbi = sbi; 2683 sci->sc_sbi = sbi;
2698 sci->sc_super = sbi->s_super; 2684 sci->sc_super = sbi->s_super;
2699 2685
2686 nilfs_get_root(root);
2687 sci->sc_root = root;
2688
2700 init_waitqueue_head(&sci->sc_wait_request); 2689 init_waitqueue_head(&sci->sc_wait_request);
2701 init_waitqueue_head(&sci->sc_wait_daemon); 2690 init_waitqueue_head(&sci->sc_wait_daemon);
2702 init_waitqueue_head(&sci->sc_wait_task); 2691 init_waitqueue_head(&sci->sc_wait_task);
@@ -2771,6 +2760,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2771 WARN_ON(!list_empty(&sci->sc_segbufs)); 2760 WARN_ON(!list_empty(&sci->sc_segbufs));
2772 WARN_ON(!list_empty(&sci->sc_write_logs)); 2761 WARN_ON(!list_empty(&sci->sc_write_logs));
2773 2762
2763 nilfs_put_root(sci->sc_root);
2764
2774 down_write(&sbi->s_nilfs->ns_segctor_sem); 2765 down_write(&sbi->s_nilfs->ns_segctor_sem);
2775 2766
2776 del_timer_sync(&sci->sc_timer); 2767 del_timer_sync(&sci->sc_timer);
@@ -2780,6 +2771,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2780/** 2771/**
2781 * nilfs_attach_segment_constructor - attach a segment constructor 2772 * nilfs_attach_segment_constructor - attach a segment constructor
2782 * @sbi: nilfs_sb_info 2773 * @sbi: nilfs_sb_info
2774 * @root: root object of the current filesystem tree
2783 * 2775 *
2784 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2776 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2785 * initializes it, and starts the segment constructor. 2777 * initializes it, and starts the segment constructor.
@@ -2789,9 +2781,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2789 * 2781 *
2790 * %-ENOMEM - Insufficient memory available. 2782 * %-ENOMEM - Insufficient memory available.
2791 */ 2783 */
2792int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) 2784int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
2785 struct nilfs_root *root)
2793{ 2786{
2794 struct the_nilfs *nilfs = sbi->s_nilfs;
2795 int err; 2787 int err;
2796 2788
2797 if (NILFS_SC(sbi)) { 2789 if (NILFS_SC(sbi)) {
@@ -2803,14 +2795,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2803 nilfs_detach_segment_constructor(sbi); 2795 nilfs_detach_segment_constructor(sbi);
2804 } 2796 }
2805 2797
2806 sbi->s_sc_info = nilfs_segctor_new(sbi); 2798 sbi->s_sc_info = nilfs_segctor_new(sbi, root);
2807 if (!sbi->s_sc_info) 2799 if (!sbi->s_sc_info)
2808 return -ENOMEM; 2800 return -ENOMEM;
2809 2801
2810 nilfs_attach_writer(nilfs, sbi);
2811 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2802 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2812 if (err) { 2803 if (err) {
2813 nilfs_detach_writer(nilfs, sbi);
2814 kfree(sbi->s_sc_info); 2804 kfree(sbi->s_sc_info);
2815 sbi->s_sc_info = NULL; 2805 sbi->s_sc_info = NULL;
2816 } 2806 }
@@ -2847,5 +2837,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2847 up_write(&nilfs->ns_segctor_sem); 2837 up_write(&nilfs->ns_segctor_sem);
2848 2838
2849 nilfs_dispose_list(sbi, &garbage_list, 1); 2839 nilfs_dispose_list(sbi, &garbage_list, 1);
2850 nilfs_detach_writer(nilfs, sbi);
2851} 2840}
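A thread running through the segment.c hunks above: nilfs_segctor_do_construct() latches nilfs-&gt;ns_cno into sci-&gt;sc_cno once per construction, so every finfo written into the log is stamped consistently even if ns_cno advances meanwhile. The per-inode selection in nilfs_segctor_end_finfo() then becomes:

    /* Checkpoint number recorded per file in the segment summary. */
    if (test_bit(NILFS_I_GCINODE, &ii->i_state))
            cno = ii->i_cno;        /* GC inode: cno the data originated from */
    else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
            cno = 0;                /* DAT/cpfile/sufile: not per-checkpoint */
    else
            cno = sci->sc_cno;      /* regular file: cno of this log */

This is also why the old i_cno bookkeeping in check-in/check-out could be deleted.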
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 17c487bd8152..cd8056e7cbed 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "sb.h"
31 31
32struct nilfs_root;
33
32/** 34/**
33 * struct nilfs_recovery_info - Recovery information 35 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 36 * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
87 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct 91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree
90 * @sc_nblk_inc: Block count of current generation 93 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written 94 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
107 * @sc_datablk_cnt: Data block count of a file 110 * @sc_datablk_cnt: Data block count of a file
108 * @sc_nblk_this_inc: Number of blocks included in the current logical segment 111 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
109 * @sc_seg_ctime: Creation time 112 * @sc_seg_ctime: Creation time
113 * @sc_cno: checkpoint number of current log
110 * @sc_flags: Internal flags 114 * @sc_flags: Internal flags
111 * @sc_state_lock: spinlock for sc_state and so on 115 * @sc_state_lock: spinlock for sc_state and so on
112 * @sc_state: Segctord state flags 116 * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
128struct nilfs_sc_info { 132struct nilfs_sc_info {
129 struct super_block *sc_super; 133 struct super_block *sc_super;
130 struct nilfs_sb_info *sc_sbi; 134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root;
131 136
132 unsigned long sc_nblk_inc; 137 unsigned long sc_nblk_inc;
133 138
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
156 unsigned long sc_datablk_cnt; 161 unsigned long sc_datablk_cnt;
157 unsigned long sc_nblk_this_inc; 162 unsigned long sc_nblk_this_inc;
158 time_t sc_seg_ctime; 163 time_t sc_seg_ctime;
159 164 __u64 sc_cno;
160 unsigned long sc_flags; 165 unsigned long sc_flags;
161 166
162 spinlock_t sc_state_lock; 167 spinlock_t sc_state_lock;
@@ -230,7 +235,8 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
230extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
231 void **); 236 void **);
232 237
233extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); 238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
239 struct nilfs_root *root);
234extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
235 241
236/* recovery.c */ 242/* recovery.c */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2e..1d6f488ccae8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
505{ 505{
506 struct buffer_head *header_bh; 506 struct buffer_head *header_bh;
507 struct nilfs_sufile_header *header; 507 struct nilfs_sufile_header *header;
508 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 508 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
509 void *kaddr; 509 void *kaddr;
510 int ret; 510 int ret;
511 511
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
583 struct nilfs_segment_usage *su; 583 struct nilfs_segment_usage *su;
584 struct nilfs_suinfo *si = buf; 584 struct nilfs_suinfo *si = buf;
585 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 585 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
586 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 586 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
587 void *kaddr; 587 void *kaddr;
588 unsigned long nsegs, segusages_per_block; 588 unsigned long nsegs, segusages_per_block;
589 ssize_t n; 589 ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
635} 635}
636 636
637/** 637/**
638 * nilfs_sufile_read - read sufile inode 638 * nilfs_sufile_read - read or get sufile inode
639 * @sufile: sufile inode 639 * @sb: super block instance
640 * @susize: size of a segment usage entry
640 * @raw_inode: on-disk sufile inode 641 * @raw_inode: on-disk sufile inode
642 * @inodep: buffer to store the inode
641 */ 643 */
642int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode) 644int nilfs_sufile_read(struct super_block *sb, size_t susize,
645 struct nilfs_inode *raw_inode, struct inode **inodep)
643{ 646{
644 struct nilfs_sufile_info *sui = NILFS_SUI(sufile); 647 struct inode *sufile;
648 struct nilfs_sufile_info *sui;
645 struct buffer_head *header_bh; 649 struct buffer_head *header_bh;
646 struct nilfs_sufile_header *header; 650 struct nilfs_sufile_header *header;
647 void *kaddr; 651 void *kaddr;
648 int ret; 652 int err;
649 653
650 ret = nilfs_read_inode_common(sufile, raw_inode); 654 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
651 if (ret < 0) 655 if (unlikely(!sufile))
652 return ret; 656 return -ENOMEM;
657 if (!(sufile->i_state & I_NEW))
658 goto out;
653 659
654 ret = nilfs_sufile_get_header_block(sufile, &header_bh); 660 err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
655 if (!ret) { 661 if (err)
656 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 662 goto failed;
657 header = kaddr + bh_offset(header_bh);
658 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
659 kunmap_atomic(kaddr, KM_USER0);
660 brelse(header_bh);
661 }
662 return ret;
663}
664 663
665/** 664 nilfs_mdt_set_entry_size(sufile, susize,
666 * nilfs_sufile_new - create sufile 665 sizeof(struct nilfs_sufile_header));
667 * @nilfs: nilfs object 666
668 * @susize: size of a segment usage entry 667 err = nilfs_read_inode_common(sufile, raw_inode);
669 */ 668 if (err)
670struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize) 669 goto failed;
671{ 670
672 struct inode *sufile; 671 err = nilfs_sufile_get_header_block(sufile, &header_bh);
672 if (err)
673 goto failed;
673 674
674 sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO, 675 sui = NILFS_SUI(sufile);
675 sizeof(struct nilfs_sufile_info)); 676 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
676 if (sufile) 677 header = kaddr + bh_offset(header_bh);
677 nilfs_mdt_set_entry_size(sufile, susize, 678 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
678 sizeof(struct nilfs_sufile_header)); 679 kunmap_atomic(kaddr, KM_USER0);
679 return sufile; 680 brelse(header_bh);
681
682 unlock_new_inode(sufile);
683 out:
684 *inodep = sufile;
685 return 0;
686 failed:
687 iget_failed(sufile);
688 return err;
680} 689}
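nilfs_sufile_read() above folds the old read/new pair into the standard iget_locked() protocol: look the inode up in the icache, initialize from the raw on-disk inode only when I_NEW is set, and route failures through iget_failed() so concurrent waiters see a dead inode. A generic sketch of that pattern (SOME_INO and init_from_disk() are hypothetical placeholders):

    /* Generic read-or-get skeleton for a singleton metadata inode. */
    struct inode *inode = nilfs_iget_locked(sb, NULL, SOME_INO);
    if (!inode)
            return -ENOMEM;
    if (!(inode->i_state & I_NEW))
            goto out;                          /* already initialized */

    err = init_from_disk(inode, raw_inode);    /* hypothetical initializer */
    if (err) {
            iget_failed(inode);                /* mark bad, unlock, iput */
            return err;
    }
    unlock_new_inode(inode);                   /* clear I_NEW, wake waiters */
    out:
    *inodep = inode;
    return 0;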
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7d..a943fbacb45b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
31 31
32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
33{ 33{
34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; 34 return NILFS_I_NILFS(sufile)->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); 37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *); 62 struct buffer_head *);
63 63
64int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode); 64int nilfs_sufile_read(struct super_block *sb, size_t susize,
65struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize); 65 struct nilfs_inode *raw_inode, struct inode **inodep);
66 66
67/** 67/**
68 * nilfs_sufile_scrap - make a segment garbage 68 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..58fd707174e1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,14 +45,12 @@
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h> 46#include <linux/random.h>
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h> 48#include <linux/vfs.h>
50#include <linux/writeback.h> 49#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include <linux/seq_file.h> 50#include <linux/seq_file.h>
54#include <linux/mount.h> 51#include <linux/mount.h>
55#include "nilfs.h" 52#include "nilfs.h"
53#include "export.h"
56#include "mdt.h" 54#include "mdt.h"
57#include "alloc.h" 55#include "alloc.h"
58#include "btree.h" 56#include "btree.h"
@@ -69,11 +67,12 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
69 "(NILFS)"); 67 "(NILFS)");
70MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
71 69
72struct kmem_cache *nilfs_inode_cachep; 70static struct kmem_cache *nilfs_inode_cachep;
73struct kmem_cache *nilfs_transaction_cachep; 71struct kmem_cache *nilfs_transaction_cachep;
74struct kmem_cache *nilfs_segbuf_cachep; 72struct kmem_cache *nilfs_segbuf_cachep;
75struct kmem_cache *nilfs_btree_path_cache; 73struct kmem_cache *nilfs_btree_path_cache;
76 74
75static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
77static int nilfs_remount(struct super_block *sb, int *flags, char *data); 76static int nilfs_remount(struct super_block *sb, int *flags, char *data);
78 77
79static void nilfs_set_error(struct nilfs_sb_info *sbi) 78static void nilfs_set_error(struct nilfs_sb_info *sbi)
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
111 const char *fmt, ...) 110 const char *fmt, ...)
112{ 111{
113 struct nilfs_sb_info *sbi = NILFS_SB(sb); 112 struct nilfs_sb_info *sbi = NILFS_SB(sb);
113 struct va_format vaf;
114 va_list args; 114 va_list args;
115 115
116 va_start(args, fmt); 116 va_start(args, fmt);
117 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); 117
118 vprintk(fmt, args); 118 vaf.fmt = fmt;
119 printk("\n"); 119 vaf.va = &args;
120
121 printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
122 sb->s_id, function, &vaf);
123
120 va_end(args); 124 va_end(args);
121 125
122 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,18 +140,22 @@ void nilfs_error(struct super_block *sb, const char *function,
136void nilfs_warning(struct super_block *sb, const char *function, 140void nilfs_warning(struct super_block *sb, const char *function,
137 const char *fmt, ...) 141 const char *fmt, ...)
138{ 142{
143 struct va_format vaf;
139 va_list args; 144 va_list args;
140 145
141 va_start(args, fmt); 146 va_start(args, fmt);
142 printk(KERN_WARNING "NILFS warning (device %s): %s: ", 147
143 sb->s_id, function); 148 vaf.fmt = fmt;
144 vprintk(fmt, args); 149 vaf.va = &args;
145 printk("\n"); 150
151 printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
152 sb->s_id, function, &vaf);
153
146 va_end(args); 154 va_end(args);
147} 155}
148 156
149 157
150struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 158struct inode *nilfs_alloc_inode(struct super_block *sb)
151{ 159{
152 struct nilfs_inode_info *ii; 160 struct nilfs_inode_info *ii;
153 161
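Both printing helpers above switch to the %pV / struct va_format idiom (available since 2.6.36), which folds a format string plus va_list into a single printk() so the prefix and message stay atomic. The generic shape of such a wrapper, mirroring the hunk:

    /* Generic %pV wrapper pattern (kernel >= 2.6.36); "myfs" is a placeholder. */
    void myfs_warning(struct super_block *sb, const char *function,
                      const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            printk(KERN_WARNING "myfs warning (device %s): %s: %pV\n",
                   sb->s_id, function, &vaf);
            va_end(args);
    }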
@@ -156,19 +164,29 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
156 return NULL; 164 return NULL;
157 ii->i_bh = NULL; 165 ii->i_bh = NULL;
158 ii->i_state = 0; 166 ii->i_state = 0;
167 ii->i_cno = 0;
159 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
160 nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); 169 nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
161 return &ii->vfs_inode; 170 return &ii->vfs_inode;
162} 171}
163 172
164struct inode *nilfs_alloc_inode(struct super_block *sb) 173static void nilfs_i_callback(struct rcu_head *head)
165{ 174{
166 return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs); 175 struct inode *inode = container_of(head, struct inode, i_rcu);
176 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
177
178 INIT_LIST_HEAD(&inode->i_dentry);
179
180 if (mdi) {
181 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
182 kfree(mdi);
183 }
184 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
167} 185}
168 186
169void nilfs_destroy_inode(struct inode *inode) 187void nilfs_destroy_inode(struct inode *inode)
170{ 188{
171 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 189 call_rcu(&inode->i_rcu, nilfs_i_callback);
172} 190}
173 191
174static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
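nilfs_destroy_inode() above defers the actual free through call_rcu() on inode-&gt;i_rcu, so lock-free (RCU-walk) path lookup can still dereference the inode for a grace period; the callback also releases the mdt private data. The INIT_LIST_HEAD(&inode-&gt;i_dentry) in the callback suggests i_rcu shares storage with i_dentry in that era's struct inode. The bare pattern, with nilfs specifics stripped ("myfs" names are placeholders):

    /* Generic RCU-deferred inode free, as used above. */
    static void myfs_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);
            kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
    }

    void myfs_destroy_inode(struct inode *inode)
    {
            call_rcu(&inode->i_rcu, myfs_i_callback);
    }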
@@ -178,17 +196,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
178 196
179 retry: 197 retry:
180 set_buffer_dirty(nilfs->ns_sbh[0]); 198 set_buffer_dirty(nilfs->ns_sbh[0]);
181
182 if (nilfs_test_opt(sbi, BARRIER)) { 199 if (nilfs_test_opt(sbi, BARRIER)) {
183 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 200 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
184 WRITE_SYNC | WRITE_BARRIER); 201 WRITE_SYNC | WRITE_FLUSH_FUA);
185 if (err == -EOPNOTSUPP) {
186 nilfs_warning(sbi->s_super, __func__,
187 "barrier-based sync failed. "
188 "disabling barriers\n");
189 nilfs_clear_opt(sbi, BARRIER);
190 goto retry;
191 }
192 } else { 202 } else {
193 err = sync_dirty_buffer(nilfs->ns_sbh[0]); 203 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
194 } 204 }
@@ -342,8 +352,6 @@ static void nilfs_put_super(struct super_block *sb)
342 struct nilfs_sb_info *sbi = NILFS_SB(sb); 352 struct nilfs_sb_info *sbi = NILFS_SB(sb);
343 struct the_nilfs *nilfs = sbi->s_nilfs; 353 struct the_nilfs *nilfs = sbi->s_nilfs;
344 354
345 lock_kernel();
346
347 nilfs_detach_segment_constructor(sbi); 355 nilfs_detach_segment_constructor(sbi);
348 356
349 if (!(sb->s_flags & MS_RDONLY)) { 357 if (!(sb->s_flags & MS_RDONLY)) {
@@ -351,18 +359,15 @@ static void nilfs_put_super(struct super_block *sb)
351 nilfs_cleanup_super(sbi); 359 nilfs_cleanup_super(sbi);
352 up_write(&nilfs->ns_sem); 360 up_write(&nilfs->ns_sem);
353 } 361 }
354 down_write(&nilfs->ns_super_sem);
355 if (nilfs->ns_current == sbi)
356 nilfs->ns_current = NULL;
357 up_write(&nilfs->ns_super_sem);
358 362
359 nilfs_detach_checkpoint(sbi); 363 iput(nilfs->ns_sufile);
360 put_nilfs(sbi->s_nilfs); 364 iput(nilfs->ns_cpfile);
365 iput(nilfs->ns_dat);
366
367 destroy_nilfs(nilfs);
361 sbi->s_super = NULL; 368 sbi->s_super = NULL;
362 sb->s_fs_info = NULL; 369 sb->s_fs_info = NULL;
363 nilfs_put_sbinfo(sbi); 370 kfree(sbi);
364
365 unlock_kernel();
366} 371}
367 372
368static int nilfs_sync_fs(struct super_block *sb, int wait) 373static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -389,21 +394,22 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
389 return err; 394 return err;
390} 395}
391 396
392int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) 397int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
398 struct nilfs_root **rootp)
393{ 399{
394 struct the_nilfs *nilfs = sbi->s_nilfs; 400 struct the_nilfs *nilfs = sbi->s_nilfs;
401 struct nilfs_root *root;
395 struct nilfs_checkpoint *raw_cp; 402 struct nilfs_checkpoint *raw_cp;
396 struct buffer_head *bh_cp; 403 struct buffer_head *bh_cp;
397 int err; 404 int err = -ENOMEM;
398 405
399 down_write(&nilfs->ns_super_sem); 406 root = nilfs_find_or_create_root(
400 list_add(&sbi->s_list, &nilfs->ns_supers); 407 nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
401 up_write(&nilfs->ns_super_sem); 408 if (!root)
409 return err;
402 410
403 err = -ENOMEM; 411 if (root->ifile)
404 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); 412 goto reuse; /* already attached checkpoint */
405 if (!sbi->s_ifile)
406 goto delist;
407 413
408 down_read(&nilfs->ns_segctor_sem); 414 down_read(&nilfs->ns_segctor_sem);
409 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 415 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -419,45 +425,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
419 } 425 }
420 goto failed; 426 goto failed;
421 } 427 }
422 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); 428
423 if (unlikely(err)) 429 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
430 &raw_cp->cp_ifile_inode, &root->ifile);
431 if (err)
424 goto failed_bh; 432 goto failed_bh;
425 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); 433
426 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); 434 atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
435 atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
427 436
428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 437 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
438
439 reuse:
440 *rootp = root;
429 return 0; 441 return 0;
430 442
431 failed_bh: 443 failed_bh:
432 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 444 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
433 failed: 445 failed:
434 nilfs_mdt_destroy(sbi->s_ifile); 446 nilfs_put_root(root);
435 sbi->s_ifile = NULL;
436 447
437 delist: 448 return err;
438 down_write(&nilfs->ns_super_sem); 449}
439 list_del_init(&sbi->s_list);
440 up_write(&nilfs->ns_super_sem);
441 450
451static int nilfs_freeze(struct super_block *sb)
452{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb);
454 struct the_nilfs *nilfs = sbi->s_nilfs;
455 int err;
456
457 if (sb->s_flags & MS_RDONLY)
458 return 0;
459
460 /* Mark super block clean */
461 down_write(&nilfs->ns_sem);
462 err = nilfs_cleanup_super(sbi);
463 up_write(&nilfs->ns_sem);
442 return err; 464 return err;
443} 465}
444 466
445void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) 467static int nilfs_unfreeze(struct super_block *sb)
446{ 468{
469 struct nilfs_sb_info *sbi = NILFS_SB(sb);
447 struct the_nilfs *nilfs = sbi->s_nilfs; 470 struct the_nilfs *nilfs = sbi->s_nilfs;
448 471
449 nilfs_mdt_destroy(sbi->s_ifile); 472 if (sb->s_flags & MS_RDONLY)
450 sbi->s_ifile = NULL; 473 return 0;
451 down_write(&nilfs->ns_super_sem); 474
452 list_del_init(&sbi->s_list); 475 down_write(&nilfs->ns_sem);
453 up_write(&nilfs->ns_super_sem); 476 nilfs_setup_super(sbi, false);
477 up_write(&nilfs->ns_sem);
478 return 0;
454} 479}
455 480
456static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 481static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
457{ 482{
458 struct super_block *sb = dentry->d_sb; 483 struct super_block *sb = dentry->d_sb;
459 struct nilfs_sb_info *sbi = NILFS_SB(sb); 484 struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
460 struct the_nilfs *nilfs = sbi->s_nilfs; 485 struct the_nilfs *nilfs = root->nilfs;
461 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 486 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
462 unsigned long long blocks; 487 unsigned long long blocks;
463 unsigned long overhead; 488 unsigned long overhead;
@@ -493,7 +518,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
493 buf->f_bfree = nfreeblocks; 518 buf->f_bfree = nfreeblocks;
494 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? 519 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
495 (buf->f_bfree - nrsvblocks) : 0; 520 (buf->f_bfree - nrsvblocks) : 0;
496 buf->f_files = atomic_read(&sbi->s_inodes_count); 521 buf->f_files = atomic_read(&root->inodes_count);
497 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 522 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
498 buf->f_namelen = NILFS_NAME_LEN; 523 buf->f_namelen = NILFS_NAME_LEN;
499 buf->f_fsid.val[0] = (u32)id; 524 buf->f_fsid.val[0] = (u32)id;
@@ -506,12 +531,12 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
506{ 531{
507 struct super_block *sb = vfs->mnt_sb; 532 struct super_block *sb = vfs->mnt_sb;
508 struct nilfs_sb_info *sbi = NILFS_SB(sb); 533 struct nilfs_sb_info *sbi = NILFS_SB(sb);
534 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
509 535
510 if (!nilfs_test_opt(sbi, BARRIER)) 536 if (!nilfs_test_opt(sbi, BARRIER))
511 seq_puts(seq, ",nobarrier"); 537 seq_puts(seq, ",nobarrier");
512 if (nilfs_test_opt(sbi, SNAPSHOT)) 538 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
513 seq_printf(seq, ",cp=%llu", 539 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
514 (unsigned long long int)sbi->s_snapshot_cno);
515 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 540 if (nilfs_test_opt(sbi, ERRORS_PANIC))
516 seq_puts(seq, ",errors=panic"); 541 seq_puts(seq, ",errors=panic");
517 if (nilfs_test_opt(sbi, ERRORS_CONT)) 542 if (nilfs_test_opt(sbi, ERRORS_CONT))
@@ -537,6 +562,8 @@ static const struct super_operations nilfs_sops = {
537 .put_super = nilfs_put_super, 562 .put_super = nilfs_put_super,
538 /* .write_super = nilfs_write_super, */ 563 /* .write_super = nilfs_write_super, */
539 .sync_fs = nilfs_sync_fs, 564 .sync_fs = nilfs_sync_fs,
565 .freeze_fs = nilfs_freeze,
566 .unfreeze_fs = nilfs_unfreeze,
540 /* .write_super_lockfs */ 567 /* .write_super_lockfs */
541 /* .unlockfs */ 568 /* .unlockfs */
542 .statfs = nilfs_statfs, 569 .statfs = nilfs_statfs,
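Wiring up .freeze_fs/.unfreeze_fs above lets freeze_bdev() quiesce nilfs, for instance under a device-mapper snapshot. The VFS blocks writers and syncs before invoking the hook, so the handlers only have to manage the on-disk clean/dirty state, as nilfs_freeze() and nilfs_unfreeze() do earlier in this patch. Schematically, with hypothetical myfs_* names:

	#include <linux/fs.h>

	static int myfs_freeze(struct super_block *sb)
	{
		/* Writers are already blocked; record a clean state on disk. */
		return 0;
	}

	static int myfs_unfreeze(struct super_block *sb)
	{
		/* Undo the clean marking; the filesystem is writable again. */
		return 0;
	}

	static const struct super_operations myfs_sops = {
		.freeze_fs	= myfs_freeze,
		.unfreeze_fs	= myfs_unfreeze,
	};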
@@ -545,48 +572,6 @@ static const struct super_operations nilfs_sops = {
545 .show_options = nilfs_show_options 572 .show_options = nilfs_show_options
546}; 573};
547 574
548static struct inode *
549nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
550{
551 struct inode *inode;
552
553 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
554 ino != NILFS_SKETCH_INO)
555 return ERR_PTR(-ESTALE);
556
557 inode = nilfs_iget(sb, ino);
558 if (IS_ERR(inode))
559 return ERR_CAST(inode);
560 if (generation && inode->i_generation != generation) {
561 iput(inode);
562 return ERR_PTR(-ESTALE);
563 }
564
565 return inode;
566}
567
568static struct dentry *
569nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct dentry *
577nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
578 int fh_type)
579{
580 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
581 nilfs_nfs_get_inode);
582}
583
584static const struct export_operations nilfs_export_ops = {
585 .fh_to_dentry = nilfs_fh_to_dentry,
586 .fh_to_parent = nilfs_fh_to_parent,
587 .get_parent = nilfs_get_parent,
588};
589
590enum { 575enum {
591 Opt_err_cont, Opt_err_panic, Opt_err_ro, 576 Opt_err_cont, Opt_err_panic, Opt_err_ro,
592 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 577 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
@@ -612,7 +597,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
612 struct nilfs_sb_info *sbi = NILFS_SB(sb); 597 struct nilfs_sb_info *sbi = NILFS_SB(sb);
613 char *p; 598 char *p;
614 substring_t args[MAX_OPT_ARGS]; 599 substring_t args[MAX_OPT_ARGS];
615 int option;
616 600
617 if (!options) 601 if (!options)
618 return 1; 602 return 1;
@@ -650,30 +634,12 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 634 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
651 break; 635 break;
652 case Opt_snapshot: 636 case Opt_snapshot:
653 if (match_int(&args[0], &option) || option <= 0)
654 return 0;
655 if (is_remount) { 637 if (is_remount) {
656 if (!nilfs_test_opt(sbi, SNAPSHOT)) { 638 printk(KERN_ERR
657 printk(KERN_ERR 639 "NILFS: \"%s\" option is invalid "
658 "NILFS: cannot change regular " 640 "for remount.\n", p);
659 "mount to snapshot.\n");
660 return 0;
661 } else if (option != sbi->s_snapshot_cno) {
662 printk(KERN_ERR
663 "NILFS: cannot remount to a "
664 "different snapshot.\n");
665 return 0;
666 }
667 break;
668 }
669 if (!(sb->s_flags & MS_RDONLY)) {
670 printk(KERN_ERR "NILFS: cannot mount snapshot "
671 "read/write. A read-only option is "
672 "required.\n");
673 return 0; 641 return 0;
674 } 642 }
675 sbi->s_snapshot_cno = option;
676 nilfs_set_opt(sbi, SNAPSHOT);
677 break; 643 break;
678 case Opt_norecovery: 644 case Opt_norecovery:
679 nilfs_set_opt(sbi, NORECOVERY); 645 nilfs_set_opt(sbi, NORECOVERY);
@@ -701,7 +667,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
701 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 667 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
702} 668}
703 669
704static int nilfs_setup_super(struct nilfs_sb_info *sbi) 670static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
705{ 671{
706 struct the_nilfs *nilfs = sbi->s_nilfs; 672 struct the_nilfs *nilfs = sbi->s_nilfs;
707 struct nilfs_super_block **sbp; 673 struct nilfs_super_block **sbp;
@@ -713,6 +679,9 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
713 if (!sbp) 679 if (!sbp)
714 return -EIO; 680 return -EIO;
715 681
682 if (!is_mount)
683 goto skip_mount_setup;
684
716 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); 685 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
717 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); 686 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
718 687
@@ -729,11 +698,14 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
729 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); 698 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
730 699
731 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); 700 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
701 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
702
703skip_mount_setup:
732 sbp[0]->s_state = 704 sbp[0]->s_state =
733 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
734 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
735 /* synchronize sbp[1] with sbp[0] */ 706 /* synchronize sbp[1] with sbp[0] */
736 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 707 if (sbp[1])
708 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
737 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 709 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
738} 710}
739 711
@@ -798,22 +770,156 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
798 return 0; 770 return 0;
799} 771}
800 772
773static int nilfs_get_root_dentry(struct super_block *sb,
774 struct nilfs_root *root,
775 struct dentry **root_dentry)
776{
777 struct inode *inode;
778 struct dentry *dentry;
779 int ret = 0;
780
781 inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
782 if (IS_ERR(inode)) {
783 printk(KERN_ERR "NILFS: get root inode failed\n");
784 ret = PTR_ERR(inode);
785 goto out;
786 }
787 if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
788 iput(inode);
789 printk(KERN_ERR "NILFS: corrupt root inode.\n");
790 ret = -EINVAL;
791 goto out;
792 }
793
794 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
795 dentry = d_find_alias(inode);
796 if (!dentry) {
797 dentry = d_alloc_root(inode);
798 if (!dentry) {
799 iput(inode);
800 ret = -ENOMEM;
801 goto failed_dentry;
802 }
803 } else {
804 iput(inode);
805 }
806 } else {
807 dentry = d_obtain_alias(inode);
808 if (IS_ERR(dentry)) {
809 ret = PTR_ERR(dentry);
810 goto failed_dentry;
811 }
812 }
813 *root_dentry = dentry;
814 out:
815 return ret;
816
817 failed_dentry:
818 printk(KERN_ERR "NILFS: get root dentry failed\n");
819 goto out;
820}
821
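nilfs_get_root_dentry() above distinguishes the current tree, which needs the canonical root dentry, from snapshot trees, which receive disconnected dentries via d_obtain_alias(), the same kind used for NFS file-handle lookup. The reference counting is the subtle part; condensed into a hypothetical wrapper using this tree's pre-3.4 dcache API:

	#include <linux/dcache.h>
	#include <linux/fs.h>

	static struct dentry *get_tree_root(struct inode *inode, int is_current)
	{
		struct dentry *dentry;

		if (!is_current)
			return d_obtain_alias(inode);	/* consumes the inode
							 * ref, even on failure */

		dentry = d_find_alias(inode);	/* referenced dentry or NULL */
		if (dentry) {
			iput(inode);		/* the alias pins the inode */
			return dentry;
		}
		return d_alloc_root(inode);	/* consumes the ref on success;
						 * on NULL the caller iput()s */
	}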
822static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
823 struct dentry **root_dentry)
824{
825 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
826 struct nilfs_root *root;
827 int ret;
828
829 down_read(&nilfs->ns_segctor_sem);
830 ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
831 up_read(&nilfs->ns_segctor_sem);
832 if (ret < 0) {
833 ret = (ret == -ENOENT) ? -EINVAL : ret;
834 goto out;
835 } else if (!ret) {
836 printk(KERN_ERR "NILFS: The specified checkpoint is "
837 "not a snapshot (checkpoint number=%llu).\n",
838 (unsigned long long)cno);
839 ret = -EINVAL;
840 goto out;
841 }
842
843 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
844 if (ret) {
845 printk(KERN_ERR "NILFS: error loading snapshot "
846 "(checkpoint number=%llu).\n",
847 (unsigned long long)cno);
848 goto out;
849 }
850 ret = nilfs_get_root_dentry(s, root, root_dentry);
851 nilfs_put_root(root);
852 out:
853 return ret;
854}
855
856static int nilfs_tree_was_touched(struct dentry *root_dentry)
857{
858 return root_dentry->d_count > 1;
859}
860
861/**
862 * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
863 * @root_dentry: root dentry of the tree to be shrunk
864 *
865 * This function returns true if the tree was in use.
866 */
867static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
868{
869 if (have_submounts(root_dentry))
870 return true;
871 shrink_dcache_parent(root_dentry);
872 return nilfs_tree_was_touched(root_dentry);
873}
874
875int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
876{
877 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
878 struct nilfs_root *root;
879 struct inode *inode;
880 struct dentry *dentry;
881 int ret;
882
883 if (cno < 0 || cno > nilfs->ns_cno)
884 return false;
885
886 if (cno >= nilfs_last_cno(nilfs))
887 return true; /* protect recent checkpoints */
888
889 ret = false;
890 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
891 if (root) {
892 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
893 if (inode) {
894 dentry = d_find_alias(inode);
895 if (dentry) {
896 if (nilfs_tree_was_touched(dentry))
897 ret = nilfs_try_to_shrink_tree(dentry);
898 dput(dentry);
899 }
900 iput(inode);
901 }
902 nilfs_put_root(root);
903 }
904 return ret;
905}
906
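nilfs_checkpoint_is_mounted() above walks root object, root inode, then root dentry, and uses the dentry reference count as the busy test: d_count == 1 means the lookup itself holds the only reference. Shrinking first matters because otherwise child dentries that nobody uses still pin the root; the test reduces to this sketch:

	#include <linux/dcache.h>

	static int tree_is_busy(struct dentry *root_dentry)
	{
		if (have_submounts(root_dentry))
			return 1;		/* something is mounted below */
		shrink_dcache_parent(root_dentry);	/* drop unused children */
		return root_dentry->d_count > 1;	/* any ref besides ours? */
	}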
801/** 907/**
802 * nilfs_fill_super() - initialize a super block instance 908 * nilfs_fill_super() - initialize a super block instance
803 * @sb: super_block 909 * @sb: super_block
804 * @data: mount options 910 * @data: mount options
805 * @silent: silent mode flag 911 * @silent: silent mode flag
806 * @nilfs: the_nilfs struct
807 * 912 *
808 * This function is called exclusively by nilfs->ns_mount_mutex. 913 * This function is called exclusively by nilfs->ns_mount_mutex.
809 * So, the recovery process is protected from other simultaneous mounts. 914 * So, the recovery process is protected from other simultaneous mounts.
810 */ 915 */
811static int 916static int
812nilfs_fill_super(struct super_block *sb, void *data, int silent, 917nilfs_fill_super(struct super_block *sb, void *data, int silent)
813 struct the_nilfs *nilfs)
814{ 918{
919 struct the_nilfs *nilfs;
815 struct nilfs_sb_info *sbi; 920 struct nilfs_sb_info *sbi;
816 struct inode *root; 921 struct nilfs_root *fsroot;
922 struct backing_dev_info *bdi;
817 __u64 cno; 923 __u64 cno;
818 int err; 924 int err;
819 925
@@ -822,19 +928,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
822 return -ENOMEM; 928 return -ENOMEM;
823 929
824 sb->s_fs_info = sbi; 930 sb->s_fs_info = sbi;
931 sbi->s_super = sb;
825 932
826 get_nilfs(nilfs); 933 nilfs = alloc_nilfs(sb->s_bdev);
934 if (!nilfs) {
935 err = -ENOMEM;
936 goto failed_sbi;
937 }
827 sbi->s_nilfs = nilfs; 938 sbi->s_nilfs = nilfs;
828 sbi->s_super = sb;
829 atomic_set(&sbi->s_count, 1);
830 939
831 err = init_nilfs(nilfs, sbi, (char *)data); 940 err = init_nilfs(nilfs, sbi, (char *)data);
832 if (err) 941 if (err)
833 goto failed_sbi; 942 goto failed_nilfs;
834 943
835 spin_lock_init(&sbi->s_inode_lock); 944 spin_lock_init(&sbi->s_inode_lock);
836 INIT_LIST_HEAD(&sbi->s_dirty_files); 945 INIT_LIST_HEAD(&sbi->s_dirty_files);
837 INIT_LIST_HEAD(&sbi->s_list);
838 946
839 /* 947 /*
840 * Following initialization is overlapped because 948 * Following initialization is overlapped because
@@ -850,94 +958,59 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
850 sb->s_export_op = &nilfs_export_ops; 958 sb->s_export_op = &nilfs_export_ops;
851 sb->s_root = NULL; 959 sb->s_root = NULL;
852 sb->s_time_gran = 1; 960 sb->s_time_gran = 1;
853 sb->s_bdi = nilfs->ns_bdi; 961
962 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
963 sb->s_bdi = bdi ? : &default_backing_dev_info;
854 964
855 err = load_nilfs(nilfs, sbi); 965 err = load_nilfs(nilfs, sbi);
856 if (err) 966 if (err)
857 goto failed_sbi; 967 goto failed_nilfs;
858 968
859 cno = nilfs_last_cno(nilfs); 969 cno = nilfs_last_cno(nilfs);
860 970 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
861 if (sb->s_flags & MS_RDONLY) {
862 if (nilfs_test_opt(sbi, SNAPSHOT)) {
863 down_read(&nilfs->ns_segctor_sem);
864 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
865 sbi->s_snapshot_cno);
866 up_read(&nilfs->ns_segctor_sem);
867 if (err < 0) {
868 if (err == -ENOENT)
869 err = -EINVAL;
870 goto failed_sbi;
871 }
872 if (!err) {
873 printk(KERN_ERR
874 "NILFS: The specified checkpoint is "
875 "not a snapshot "
876 "(checkpoint number=%llu).\n",
877 (unsigned long long)sbi->s_snapshot_cno);
878 err = -EINVAL;
879 goto failed_sbi;
880 }
881 cno = sbi->s_snapshot_cno;
882 }
883 }
884
885 err = nilfs_attach_checkpoint(sbi, cno);
886 if (err) { 971 if (err) {
887 printk(KERN_ERR "NILFS: error loading a checkpoint" 972 printk(KERN_ERR "NILFS: error loading last checkpoint "
888 " (checkpoint number=%llu).\n", (unsigned long long)cno); 973 "(checkpoint number=%llu).\n", (unsigned long long)cno);
889 goto failed_sbi; 974 goto failed_unload;
890 } 975 }
891 976
892 if (!(sb->s_flags & MS_RDONLY)) { 977 if (!(sb->s_flags & MS_RDONLY)) {
893 err = nilfs_attach_segment_constructor(sbi); 978 err = nilfs_attach_segment_constructor(sbi, fsroot);
894 if (err) 979 if (err)
895 goto failed_checkpoint; 980 goto failed_checkpoint;
896 } 981 }
897 982
898 root = nilfs_iget(sb, NILFS_ROOT_INO); 983 err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
899 if (IS_ERR(root)) { 984 if (err)
900 printk(KERN_ERR "NILFS: get root inode failed\n");
901 err = PTR_ERR(root);
902 goto failed_segctor;
903 }
904 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
905 iput(root);
906 printk(KERN_ERR "NILFS: corrupt root inode.\n");
907 err = -EINVAL;
908 goto failed_segctor;
909 }
910 sb->s_root = d_alloc_root(root);
911 if (!sb->s_root) {
912 iput(root);
913 printk(KERN_ERR "NILFS: get root dentry failed\n");
914 err = -ENOMEM;
915 goto failed_segctor; 985 goto failed_segctor;
916 } 986
987 nilfs_put_root(fsroot);
917 988
918 if (!(sb->s_flags & MS_RDONLY)) { 989 if (!(sb->s_flags & MS_RDONLY)) {
919 down_write(&nilfs->ns_sem); 990 down_write(&nilfs->ns_sem);
920 nilfs_setup_super(sbi); 991 nilfs_setup_super(sbi, true);
921 up_write(&nilfs->ns_sem); 992 up_write(&nilfs->ns_sem);
922 } 993 }
923 994
924 down_write(&nilfs->ns_super_sem);
925 if (!nilfs_test_opt(sbi, SNAPSHOT))
926 nilfs->ns_current = sbi;
927 up_write(&nilfs->ns_super_sem);
928
929 return 0; 995 return 0;
930 996
931 failed_segctor: 997 failed_segctor:
932 nilfs_detach_segment_constructor(sbi); 998 nilfs_detach_segment_constructor(sbi);
933 999
934 failed_checkpoint: 1000 failed_checkpoint:
935 nilfs_detach_checkpoint(sbi); 1001 nilfs_put_root(fsroot);
1002
1003 failed_unload:
1004 iput(nilfs->ns_sufile);
1005 iput(nilfs->ns_cpfile);
1006 iput(nilfs->ns_dat);
1007
1008 failed_nilfs:
1009 destroy_nilfs(nilfs);
936 1010
937 failed_sbi: 1011 failed_sbi:
938 put_nilfs(nilfs);
939 sb->s_fs_info = NULL; 1012 sb->s_fs_info = NULL;
940 nilfs_put_sbinfo(sbi); 1013 kfree(sbi);
941 return err; 1014 return err;
942} 1015}
943 1016
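The rewritten failure path of nilfs_fill_super() (failed_segctor → failed_checkpoint → failed_unload → failed_nilfs → failed_sbi) is the standard kernel goto unwind: each label releases exactly what had been set up before the failing step, in reverse order. Its skeleton, with stub steps for illustration only:

	static int setup_a(void) { return 0; }	/* stand-ins for real steps */
	static int setup_b(void) { return 0; }
	static void teardown_a(void) { }

	static int myfs_setup(void)
	{
		int err;

		err = setup_a();
		if (err)
			goto fail;
		err = setup_b();
		if (err)
			goto out_undo_a;	/* unwind only what succeeded */
		return 0;

	out_undo_a:
		teardown_a();
	fail:
		return err;
	}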
@@ -946,16 +1019,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
946 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1019 struct nilfs_sb_info *sbi = NILFS_SB(sb);
947 struct the_nilfs *nilfs = sbi->s_nilfs; 1020 struct the_nilfs *nilfs = sbi->s_nilfs;
948 unsigned long old_sb_flags; 1021 unsigned long old_sb_flags;
949 struct nilfs_mount_options old_opts; 1022 unsigned long old_mount_opt;
950 int was_snapshot, err; 1023 int err;
951
952 lock_kernel();
953 1024
954 down_write(&nilfs->ns_super_sem);
955 old_sb_flags = sb->s_flags; 1025 old_sb_flags = sb->s_flags;
956 old_opts.mount_opt = sbi->s_mount_opt; 1026 old_mount_opt = sbi->s_mount_opt;
957 old_opts.snapshot_cno = sbi->s_snapshot_cno;
958 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
959 1027
960 if (!parse_options(data, sb, 1)) { 1028 if (!parse_options(data, sb, 1)) {
961 err = -EINVAL; 1029 err = -EINVAL;
@@ -964,11 +1032,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
964 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 1032 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
965 1033
966 err = -EINVAL; 1034 err = -EINVAL;
967 if (was_snapshot && !(*flags & MS_RDONLY)) {
968 printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
969 "read/write.\n", sb->s_id);
970 goto restore_opts;
971 }
972 1035
973 if (!nilfs_valid_fs(nilfs)) { 1036 if (!nilfs_valid_fs(nilfs)) {
974 printk(KERN_WARNING "NILFS (device %s): couldn't " 1037 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -993,6 +1056,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
993 up_write(&nilfs->ns_sem); 1056 up_write(&nilfs->ns_sem);
994 } else { 1057 } else {
995 __u64 features; 1058 __u64 features;
1059 struct nilfs_root *root;
996 1060
997 /* 1061 /*
998 * Mounting a RDONLY partition read-write, so reread and 1062 * Mounting a RDONLY partition read-write, so reread and
@@ -1014,25 +1078,21 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1014 1078
1015 sb->s_flags &= ~MS_RDONLY; 1079 sb->s_flags &= ~MS_RDONLY;
1016 1080
1017 err = nilfs_attach_segment_constructor(sbi); 1081 root = NILFS_I(sb->s_root->d_inode)->i_root;
1082 err = nilfs_attach_segment_constructor(sbi, root);
1018 if (err) 1083 if (err)
1019 goto restore_opts; 1084 goto restore_opts;
1020 1085
1021 down_write(&nilfs->ns_sem); 1086 down_write(&nilfs->ns_sem);
1022 nilfs_setup_super(sbi); 1087 nilfs_setup_super(sbi, true);
1023 up_write(&nilfs->ns_sem); 1088 up_write(&nilfs->ns_sem);
1024 } 1089 }
1025 out: 1090 out:
1026 up_write(&nilfs->ns_super_sem);
1027 unlock_kernel();
1028 return 0; 1091 return 0;
1029 1092
1030 restore_opts: 1093 restore_opts:
1031 sb->s_flags = old_sb_flags; 1094 sb->s_flags = old_sb_flags;
1032 sbi->s_mount_opt = old_opts.mount_opt; 1095 sbi->s_mount_opt = old_mount_opt;
1033 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1034 up_write(&nilfs->ns_super_sem);
1035 unlock_kernel();
1036 return err; 1096 return err;
1037} 1097}
1038 1098
@@ -1052,7 +1112,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1052{ 1112{
1053 char *p, *options = data; 1113 char *p, *options = data;
1054 substring_t args[MAX_OPT_ARGS]; 1114 substring_t args[MAX_OPT_ARGS];
1055 int option, token; 1115 int token;
1056 int ret = 0; 1116 int ret = 0;
1057 1117
1058 do { 1118 do {
@@ -1060,16 +1120,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1060 if (p != NULL && *p) { 1120 if (p != NULL && *p) {
1061 token = match_token(p, tokens, args); 1121 token = match_token(p, tokens, args);
1062 if (token == Opt_snapshot) { 1122 if (token == Opt_snapshot) {
1063 if (!(sd->flags & MS_RDONLY)) 1123 if (!(sd->flags & MS_RDONLY)) {
1064 ret++; 1124 ret++;
1065 else { 1125 } else {
1066 ret = match_int(&args[0], &option); 1126 sd->cno = simple_strtoull(args[0].from,
1067 if (!ret) { 1127 NULL, 0);
1068 if (option > 0) 1128 /*
1069 sd->cno = option; 1129 * No need to see the end pointer;
1070 else 1130 * match_token() has done syntax
1071 ret++; 1131 * checking.
1072 } 1132 */
1133 if (sd->cno == 0)
1134 ret++;
1073 } 1135 }
1074 } 1136 }
1075 if (ret) 1137 if (ret)
@@ -1086,43 +1148,33 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1086 1148
1087static int nilfs_set_bdev_super(struct super_block *s, void *data) 1149static int nilfs_set_bdev_super(struct super_block *s, void *data)
1088{ 1150{
1089 struct nilfs_super_data *sd = data; 1151 s->s_bdev = data;
1090
1091 s->s_bdev = sd->bdev;
1092 s->s_dev = s->s_bdev->bd_dev; 1152 s->s_dev = s->s_bdev->bd_dev;
1093 return 0; 1153 return 0;
1094} 1154}
1095 1155
1096static int nilfs_test_bdev_super(struct super_block *s, void *data) 1156static int nilfs_test_bdev_super(struct super_block *s, void *data)
1097{ 1157{
1098 struct nilfs_super_data *sd = data; 1158 return (void *)s->s_bdev == data;
1099
1100 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1101} 1159}
1102 1160
1103static int 1161static struct dentry *
1104nilfs_get_sb(struct file_system_type *fs_type, int flags, 1162nilfs_mount(struct file_system_type *fs_type, int flags,
1105 const char *dev_name, void *data, struct vfsmount *mnt) 1163 const char *dev_name, void *data)
1106{ 1164{
1107 struct nilfs_super_data sd; 1165 struct nilfs_super_data sd;
1108 struct super_block *s; 1166 struct super_block *s;
1109 fmode_t mode = FMODE_READ; 1167 fmode_t mode = FMODE_READ | FMODE_EXCL;
1110 struct the_nilfs *nilfs; 1168 struct dentry *root_dentry;
1111 int err, need_to_close = 1; 1169 int err, s_new = false;
1112 1170
1113 if (!(flags & MS_RDONLY)) 1171 if (!(flags & MS_RDONLY))
1114 mode |= FMODE_WRITE; 1172 mode |= FMODE_WRITE;
1115 1173
1116 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1174 sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1117 if (IS_ERR(sd.bdev)) 1175 if (IS_ERR(sd.bdev))
1118 return PTR_ERR(sd.bdev); 1176 return ERR_CAST(sd.bdev);
1119 1177
1120 /*
1121 * To get mount instance using sget() vfs-routine, NILFS needs
1122 * much more information than normal filesystems to identify mount
1123 * instance. For snapshot mounts, not only a mount type (ro-mount
1124 * or rw-mount) but also a checkpoint number is required.
1125 */
1126 sd.cno = 0; 1178 sd.cno = 0;
1127 sd.flags = flags; 1179 sd.flags = flags;
1128 if (nilfs_identify((char *)data, &sd)) { 1180 if (nilfs_identify((char *)data, &sd)) {
@@ -1130,101 +1182,91 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1130 goto failed; 1182 goto failed;
1131 } 1183 }
1132 1184
1133 nilfs = find_or_create_nilfs(sd.bdev);
1134 if (!nilfs) {
1135 err = -ENOMEM;
1136 goto failed;
1137 }
1138
1139 mutex_lock(&nilfs->ns_mount_mutex);
1140
1141 if (!sd.cno) {
1142 /*
1143 * Check if an exclusive mount exists or not.
1144 * Snapshot mounts coexist with a current mount
1145 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1146 * ro-mount are mutually exclusive.
1147 */
1148 down_read(&nilfs->ns_super_sem);
1149 if (nilfs->ns_current &&
1150 ((nilfs->ns_current->s_super->s_flags ^ flags)
1151 & MS_RDONLY)) {
1152 up_read(&nilfs->ns_super_sem);
1153 err = -EBUSY;
1154 goto failed_unlock;
1155 }
1156 up_read(&nilfs->ns_super_sem);
1157 }
1158
1159 /*
1160 * Find existing nilfs_sb_info struct
1161 */
1162 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1163
1164 /* 1185 /*
1165 * Get super block instance holding the nilfs_sb_info struct. 1186 * once the super is inserted into the list by sget, s_umount
1166 * A new instance is allocated if no existing mount is present or 1187 * will protect the lockfs code from trying to start a snapshot
1167 * existing instance has been unmounted. 1188 * while we are mounting
1168 */ 1189 */
1169 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); 1190 mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
1170 if (sd.sbi) 1191 if (sd.bdev->bd_fsfreeze_count > 0) {
1171 nilfs_put_sbinfo(sd.sbi); 1192 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1172 1193 err = -EBUSY;
1194 goto failed;
1195 }
1196 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
1197 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1173 if (IS_ERR(s)) { 1198 if (IS_ERR(s)) {
1174 err = PTR_ERR(s); 1199 err = PTR_ERR(s);
1175 goto failed_unlock; 1200 goto failed;
1176 } 1201 }
1177 1202
1178 if (!s->s_root) { 1203 if (!s->s_root) {
1179 char b[BDEVNAME_SIZE]; 1204 char b[BDEVNAME_SIZE];
1180 1205
1206 s_new = true;
1207
1181 /* New superblock instance created */ 1208 /* New superblock instance created */
1182 s->s_flags = flags; 1209 s->s_flags = flags;
1183 s->s_mode = mode; 1210 s->s_mode = mode;
1184 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1211 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1185 sb_set_blocksize(s, block_size(sd.bdev)); 1212 sb_set_blocksize(s, block_size(sd.bdev));
1186 1213
1187 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, 1214 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1188 nilfs);
1189 if (err) 1215 if (err)
1190 goto cancel_new; 1216 goto failed_super;
1191 1217
1192 s->s_flags |= MS_ACTIVE; 1218 s->s_flags |= MS_ACTIVE;
1193 need_to_close = 0; 1219 } else if (!sd.cno) {
1220 int busy = false;
1221
1222 if (nilfs_tree_was_touched(s->s_root)) {
1223 busy = nilfs_try_to_shrink_tree(s->s_root);
1224 if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
1225 printk(KERN_ERR "NILFS: the device already "
1226 "has a %s mount.\n",
1227 (s->s_flags & MS_RDONLY) ?
1228 "read-only" : "read/write");
1229 err = -EBUSY;
1230 goto failed_super;
1231 }
1232 }
1233 if (!busy) {
1234 /*
1235 * Try remount to setup mount states if the current
1236 * tree is not mounted and only snapshots use this sb.
1237 */
1238 err = nilfs_remount(s, &flags, data);
1239 if (err)
1240 goto failed_super;
1241 }
1194 } 1242 }
1195 1243
1196 mutex_unlock(&nilfs->ns_mount_mutex); 1244 if (sd.cno) {
1197 put_nilfs(nilfs); 1245 err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
1198 if (need_to_close) 1246 if (err)
1199 close_bdev_exclusive(sd.bdev, mode); 1247 goto failed_super;
1200 simple_set_mnt(mnt, s); 1248 } else {
1201 return 0; 1249 root_dentry = dget(s->s_root);
1250 }
1202 1251
1203 failed_unlock: 1252 if (!s_new)
1204 mutex_unlock(&nilfs->ns_mount_mutex); 1253 blkdev_put(sd.bdev, mode);
1205 put_nilfs(nilfs);
1206 failed:
1207 close_bdev_exclusive(sd.bdev, mode);
1208 1254
1209 return err; 1255 return root_dentry;
1210 1256
1211 cancel_new: 1257 failed_super:
1212 /* Abandoning the newly allocated superblock */
1213 mutex_unlock(&nilfs->ns_mount_mutex);
1214 put_nilfs(nilfs);
1215 deactivate_locked_super(s); 1258 deactivate_locked_super(s);
1216 /* 1259
1217 * deactivate_locked_super() invokes close_bdev_exclusive(). 1260 failed:
1218 * We must finish all post-cleaning before this call; 1261 if (!s_new)
1219 * put_nilfs() needs the block device. 1262 blkdev_put(sd.bdev, mode);
1220 */ 1263 return ERR_PTR(err);
1221 return err;
1222} 1264}
1223 1265
1224struct file_system_type nilfs_fs_type = { 1266struct file_system_type nilfs_fs_type = {
1225 .owner = THIS_MODULE, 1267 .owner = THIS_MODULE,
1226 .name = "nilfs2", 1268 .name = "nilfs2",
1227 .get_sb = nilfs_get_sb, 1269 .mount = nilfs_mount,
1228 .kill_sb = kill_block_super, 1270 .kill_sb = kill_block_super,
1229 .fs_flags = FS_REQUIRES_DEV, 1271 .fs_flags = FS_REQUIRES_DEV,
1230}; 1272};
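nilfs_mount() above also adopts the exclusive-open API: blkdev_get_by_path() with FMODE_EXCL replaces open_bdev_exclusive(), the fs_type pointer serves as the exclusion holder, and every successful get must be balanced by blkdev_put() with the same mode, which is why the !s_new branches drop the extra reference once an existing superblock already holds its own. The pairing in isolation, as a sketch (the probe function and path are illustrative):

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	static int probe_backing_dev(const char *path,
				     struct file_system_type *fs_type)
	{
		fmode_t mode = FMODE_READ | FMODE_EXCL;
		struct block_device *bdev;

		bdev = blkdev_get_by_path(path, mode, fs_type);
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);
		/* ... inspect the device exclusively ... */
		blkdev_put(bdev, mode);	/* must pass the same mode bits */
		return 0;
	}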
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917fc..ad4ac607cf57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,9 +35,6 @@
35#include "segbuf.h" 35#include "segbuf.h"
36 36
37 37
38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock);
40
41static int nilfs_valid_sb(struct nilfs_super_block *sbp); 38static int nilfs_valid_sb(struct nilfs_super_block *sbp);
42 39
43void nilfs_set_last_segment(struct the_nilfs *nilfs, 40void nilfs_set_last_segment(struct the_nilfs *nilfs,
@@ -61,16 +58,13 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
61} 58}
62 59
63/** 60/**
64 * alloc_nilfs - allocate the_nilfs structure 61 * alloc_nilfs - allocate a nilfs object
65 * @bdev: block device to which the_nilfs is related 62 * @bdev: block device to which the_nilfs is related
66 * 63 *
67 * alloc_nilfs() allocates memory for the_nilfs and
68 * initializes its reference count and locks.
69 *
70 * Return Value: On success, pointer to the_nilfs is returned. 64 * Return Value: On success, pointer to the_nilfs is returned.
71 * On error, NULL is returned. 65 * On error, NULL is returned.
72 */ 66 */
73static struct the_nilfs *alloc_nilfs(struct block_device *bdev) 67struct the_nilfs *alloc_nilfs(struct block_device *bdev)
74{ 68{
75 struct the_nilfs *nilfs; 69 struct the_nilfs *nilfs;
76 70
@@ -79,103 +73,38 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
79 return NULL; 73 return NULL;
80 74
81 nilfs->ns_bdev = bdev; 75 nilfs->ns_bdev = bdev;
82 atomic_set(&nilfs->ns_count, 1);
83 atomic_set(&nilfs->ns_ndirtyblks, 0); 76 atomic_set(&nilfs->ns_ndirtyblks, 0);
84 init_rwsem(&nilfs->ns_sem); 77 init_rwsem(&nilfs->ns_sem);
85 init_rwsem(&nilfs->ns_super_sem); 78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
86 mutex_init(&nilfs->ns_mount_mutex);
87 init_rwsem(&nilfs->ns_writer_sem);
88 INIT_LIST_HEAD(&nilfs->ns_list);
89 INIT_LIST_HEAD(&nilfs->ns_supers);
90 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
91 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock);
92 init_rwsem(&nilfs->ns_segctor_sem); 82 init_rwsem(&nilfs->ns_segctor_sem);
93 83
94 return nilfs; 84 return nilfs;
95} 85}
96 86
97/** 87/**
98 * find_or_create_nilfs - find or create nilfs object 88 * destroy_nilfs - destroy nilfs object
99 * @bdev: block device to which the_nilfs is related 89 * @nilfs: nilfs object to be released
100 *
101 * find_nilfs() looks up an existent nilfs object created on the
102 * device and gets the reference count of the object. If no nilfs object
103 * is found on the device, a new nilfs object is allocated.
104 *
105 * Return Value: On success, pointer to the nilfs object is returned.
106 * On error, NULL is returned.
107 */
108struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
109{
110 struct the_nilfs *nilfs, *new = NULL;
111
112 retry:
113 spin_lock(&nilfs_lock);
114 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
115 if (nilfs->ns_bdev == bdev) {
116 get_nilfs(nilfs);
117 spin_unlock(&nilfs_lock);
118 if (new)
119 put_nilfs(new);
120 return nilfs; /* existing object */
121 }
122 }
123 if (new) {
124 list_add_tail(&new->ns_list, &nilfs_objects);
125 spin_unlock(&nilfs_lock);
126 return new; /* new object */
127 }
128 spin_unlock(&nilfs_lock);
129
130 new = alloc_nilfs(bdev);
131 if (new)
132 goto retry;
133 return NULL; /* insufficient memory */
134}
135
136/**
137 * put_nilfs - release a reference to the_nilfs
138 * @nilfs: the_nilfs structure to be released
139 *
140 * put_nilfs() decrements a reference counter of the_nilfs.
141 * If the reference count reaches zero, the_nilfs is freed.
142 */ 90 */
143void put_nilfs(struct the_nilfs *nilfs) 91void destroy_nilfs(struct the_nilfs *nilfs)
144{ 92{
145 spin_lock(&nilfs_lock);
146 if (!atomic_dec_and_test(&nilfs->ns_count)) {
147 spin_unlock(&nilfs_lock);
148 return;
149 }
150 list_del_init(&nilfs->ns_list);
151 spin_unlock(&nilfs_lock);
152
153 /*
154 * Increment of ns_count never occurs below because the caller
155 * of get_nilfs() holds at least one reference to the_nilfs.
156 * Thus its exclusion control is not required here.
157 */
158
159 might_sleep(); 93 might_sleep();
160 if (nilfs_loaded(nilfs)) {
161 nilfs_mdt_destroy(nilfs->ns_sufile);
162 nilfs_mdt_destroy(nilfs->ns_cpfile);
163 nilfs_mdt_destroy(nilfs->ns_dat);
164 nilfs_mdt_destroy(nilfs->ns_gc_dat);
165 }
166 if (nilfs_init(nilfs)) { 94 if (nilfs_init(nilfs)) {
167 nilfs_destroy_gccache(nilfs);
168 brelse(nilfs->ns_sbh[0]); 95 brelse(nilfs->ns_sbh[0]);
169 brelse(nilfs->ns_sbh[1]); 96 brelse(nilfs->ns_sbh[1]);
170 } 97 }
171 kfree(nilfs); 98 kfree(nilfs);
172} 99}
173 100
174static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block) 101static int nilfs_load_super_root(struct the_nilfs *nilfs,
102 struct super_block *sb, sector_t sr_block)
175{ 103{
176 struct buffer_head *bh_sr; 104 struct buffer_head *bh_sr;
177 struct nilfs_super_root *raw_sr; 105 struct nilfs_super_root *raw_sr;
178 struct nilfs_super_block **sbp = nilfs->ns_sbp; 106 struct nilfs_super_block **sbp = nilfs->ns_sbp;
107 struct nilfs_inode *rawi;
179 unsigned dat_entry_size, segment_usage_size, checkpoint_size; 108 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
180 unsigned inode_size; 109 unsigned inode_size;
181 int err; 110 int err;
@@ -192,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
192 121
193 inode_size = nilfs->ns_inode_size; 122 inode_size = nilfs->ns_inode_size;
194 123
195 err = -ENOMEM; 124 rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
196 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); 125 err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
197 if (unlikely(!nilfs->ns_dat)) 126 if (err)
198 goto failed; 127 goto failed;
199 128
200 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); 129 rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
201 if (unlikely(!nilfs->ns_gc_dat)) 130 err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
131 if (err)
202 goto failed_dat; 132 goto failed_dat;
203 133
204 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); 134 rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
205 if (unlikely(!nilfs->ns_cpfile)) 135 err = nilfs_sufile_read(sb, segment_usage_size, rawi,
206 goto failed_gc_dat; 136 &nilfs->ns_sufile);
207 137 if (err)
208 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
209 if (unlikely(!nilfs->ns_sufile))
210 goto failed_cpfile; 138 goto failed_cpfile;
211 139
212 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
213
214 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
215 NILFS_SR_DAT_OFFSET(inode_size));
216 if (unlikely(err))
217 goto failed_sufile;
218
219 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
220 NILFS_SR_CPFILE_OFFSET(inode_size));
221 if (unlikely(err))
222 goto failed_sufile;
223
224 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
225 NILFS_SR_SUFILE_OFFSET(inode_size));
226 if (unlikely(err))
227 goto failed_sufile;
228
229 raw_sr = (struct nilfs_super_root *)bh_sr->b_data; 140 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
230 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); 141 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
231 142
@@ -233,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
233 brelse(bh_sr); 144 brelse(bh_sr);
234 return err; 145 return err;
235 146
236 failed_sufile:
237 nilfs_mdt_destroy(nilfs->ns_sufile);
238
239 failed_cpfile: 147 failed_cpfile:
240 nilfs_mdt_destroy(nilfs->ns_cpfile); 148 iput(nilfs->ns_cpfile);
241
242 failed_gc_dat:
243 nilfs_mdt_destroy(nilfs->ns_gc_dat);
244 149
245 failed_dat: 150 failed_dat:
246 nilfs_mdt_destroy(nilfs->ns_dat); 151 iput(nilfs->ns_dat);
247 goto failed; 152 goto failed;
248} 153}
249 154
@@ -306,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
306 int valid_fs = nilfs_valid_fs(nilfs); 211 int valid_fs = nilfs_valid_fs(nilfs);
307 int err; 212 int err;
308 213
309 if (nilfs_loaded(nilfs)) {
310 if (valid_fs ||
311 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
312 return 0;
313 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
314 "recovery state.\n");
315 return -EINVAL;
316 }
317
318 if (!valid_fs) { 214 if (!valid_fs) {
319 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); 215 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
320 if (s_flags & MS_RDONLY) { 216 if (s_flags & MS_RDONLY) {
@@ -375,7 +271,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
375 goto scan_error; 271 goto scan_error;
376 } 272 }
377 273
378 err = nilfs_load_super_root(nilfs, ri.ri_super_root); 274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
379 if (unlikely(err)) { 275 if (unlikely(err)) {
380 printk(KERN_ERR "NILFS: error loading super root.\n"); 276 printk(KERN_ERR "NILFS: error loading super root.\n");
381 goto failed; 277 goto failed;
@@ -433,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
433 printk(KERN_INFO "NILFS: recovery complete.\n"); 329 printk(KERN_INFO "NILFS: recovery complete.\n");
434 330
435 skip_recovery: 331 skip_recovery:
436 set_nilfs_loaded(nilfs);
437 nilfs_clear_recovery_info(&ri); 332 nilfs_clear_recovery_info(&ri);
438 sbi->s_super->s_flags = s_flags; 333 sbi->s_super->s_flags = s_flags;
439 return 0; 334 return 0;
@@ -443,10 +338,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
443 goto failed; 338 goto failed;
444 339
445 failed_unload: 340 failed_unload:
446 nilfs_mdt_destroy(nilfs->ns_cpfile); 341 iput(nilfs->ns_cpfile);
447 nilfs_mdt_destroy(nilfs->ns_sufile); 342 iput(nilfs->ns_sufile);
448 nilfs_mdt_destroy(nilfs->ns_dat); 343 iput(nilfs->ns_dat);
449 nilfs_mdt_destroy(nilfs->ns_gc_dat);
450 344
451 failed: 345 failed:
452 nilfs_clear_recovery_info(&ri); 346 nilfs_clear_recovery_info(&ri);
@@ -468,8 +362,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
468static int nilfs_store_disk_layout(struct the_nilfs *nilfs, 362static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
469 struct nilfs_super_block *sbp) 363 struct nilfs_super_block *sbp)
470{ 364{
471 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { 365 if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
472 printk(KERN_ERR "NILFS: revision mismatch " 366 printk(KERN_ERR "NILFS: unsupported revision "
473 "(superblock rev.=%d.%d, current rev.=%d.%d). " 367 "(superblock rev.=%d.%d, current rev.=%d.%d). "
474 "Please check the version of mkfs.nilfs.\n", 368 "Please check the version of mkfs.nilfs.\n",
475 le32_to_cpu(sbp->s_rev_level), 369 le32_to_cpu(sbp->s_rev_level),
@@ -631,12 +525,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
631 * 525 *
632 * init_nilfs() performs common initialization per block device (e.g. 526 * init_nilfs() performs common initialization per block device (e.g.
633 * reading the super block, getting disk layout information, initializing 527 * reading the super block, getting disk layout information, initializing
634 * shared fields in the_nilfs). It takes on some portion of the jobs 528 * shared fields in the_nilfs).
635 * typically done by a fill_super() routine. This division arises from
636 * the nature that multiple NILFS instances may be simultaneously
637 * mounted on a device.
638 * For multiple mounts on the same device, only the first mount
639 * invokes these tasks.
640 * 529 *
641 * Return Value: On success, 0 is returned. On error, a negative error 530 * Return Value: On success, 0 is returned. On error, a negative error
642 * code is returned. 531 * code is returned.
@@ -645,32 +534,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
645{ 534{
646 struct super_block *sb = sbi->s_super; 535 struct super_block *sb = sbi->s_super;
647 struct nilfs_super_block *sbp; 536 struct nilfs_super_block *sbp;
648 struct backing_dev_info *bdi;
649 int blocksize; 537 int blocksize;
650 int err; 538 int err;
651 539
652 down_write(&nilfs->ns_sem); 540 down_write(&nilfs->ns_sem);
653 if (nilfs_init(nilfs)) {
654 /* Load values from existing the_nilfs */
655 sbp = nilfs->ns_sbp[0];
656 err = nilfs_store_magic_and_option(sb, sbp, data);
657 if (err)
658 goto out;
659
660 err = nilfs_check_feature_compatibility(sb, sbp);
661 if (err)
662 goto out;
663
664 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
665 if (sb->s_blocksize != blocksize &&
666 !sb_set_blocksize(sb, blocksize)) {
667 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
668 blocksize);
669 err = -EINVAL;
670 }
671 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
672 goto out;
673 }
674 541
675 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); 542 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
676 if (!blocksize) { 543 if (!blocksize) {
@@ -729,18 +596,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
729 596
730 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 597 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
731 598
732 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
733 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
734
735 err = nilfs_store_log_cursor(nilfs, sbp); 599 err = nilfs_store_log_cursor(nilfs, sbp);
736 if (err) 600 if (err)
737 goto failed_sbh; 601 goto failed_sbh;
738 602
739 /* Initialize gcinode cache */
740 err = nilfs_init_gccache(nilfs);
741 if (err)
742 goto failed_sbh;
743
744 set_nilfs_init(nilfs); 603 set_nilfs_init(nilfs);
745 err = 0; 604 err = 0;
746 out: 605 out:
@@ -775,9 +634,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
775 ret = blkdev_issue_discard(nilfs->ns_bdev, 634 ret = blkdev_issue_discard(nilfs->ns_bdev,
776 start * sects_per_block, 635 start * sects_per_block,
777 nblocks * sects_per_block, 636 nblocks * sects_per_block,
778 GFP_NOFS, 637 GFP_NOFS, 0);
779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
781 if (ret < 0) 638 if (ret < 0)
782 return ret; 639 return ret;
783 nblocks = 0; 640 nblocks = 0;
@@ -787,19 +644,17 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
787 ret = blkdev_issue_discard(nilfs->ns_bdev, 644 ret = blkdev_issue_discard(nilfs->ns_bdev,
788 start * sects_per_block, 645 start * sects_per_block,
789 nblocks * sects_per_block, 646 nblocks * sects_per_block,
790 GFP_NOFS, 647 GFP_NOFS, 0);
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
792 return ret; 648 return ret;
793} 649}
794 650
795int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 651int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
796{ 652{
797 struct inode *dat = nilfs_dat_inode(nilfs);
798 unsigned long ncleansegs; 653 unsigned long ncleansegs;
799 654
800 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 655 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
801 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); 656 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
802 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 657 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
803 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 658 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
804 return 0; 659 return 0;
805} 660}
@@ -815,79 +670,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
815 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; 670 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
816} 671}
817 672
818/** 673struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
819 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
820 * @nilfs: nilfs object
821 * @rw_mount: mount type (non-zero value for read/write mount)
822 * @cno: checkpoint number (zero for read-only mount)
823 *
824 * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
825 * @rw_mount and @cno (in case of snapshots) matched. If no instance
826 * was found, NULL is returned. Although the super block instance can
827 * be unmounted after this function returns, the nilfs_sb_info struct
828 * is kept on memory until nilfs_put_sbinfo() is called.
829 */
830struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
831 int rw_mount, __u64 cno)
832{ 674{
833 struct nilfs_sb_info *sbi; 675 struct rb_node *n;
834 676 struct nilfs_root *root;
835 down_read(&nilfs->ns_super_sem); 677
836 /* 678 spin_lock(&nilfs->ns_cptree_lock);
837 * The SNAPSHOT flag and sb->s_flags are supposed to be 679 n = nilfs->ns_cptree.rb_node;
838 * protected with nilfs->ns_super_sem. 680 while (n) {
839 */ 681 root = rb_entry(n, struct nilfs_root, rb_node);
840 sbi = nilfs->ns_current; 682
841 if (rw_mount) { 683 if (cno < root->cno) {
842 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) 684 n = n->rb_left;
843 goto found; /* read/write mount */ 685 } else if (cno > root->cno) {
844 else 686 n = n->rb_right;
845 goto out; 687 } else {
846 } else if (cno == 0) { 688 atomic_inc(&root->count);
847 if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) 689 spin_unlock(&nilfs->ns_cptree_lock);
848 goto found; /* read-only mount */ 690 return root;
849 else 691 }
850 goto out;
851 } 692 }
693 spin_unlock(&nilfs->ns_cptree_lock);
852 694
853 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
854 if (nilfs_test_opt(sbi, SNAPSHOT) &&
855 sbi->s_snapshot_cno == cno)
856 goto found; /* snapshot mount */
857 }
858 out:
859 up_read(&nilfs->ns_super_sem);
860 return NULL; 695 return NULL;
861
862 found:
863 atomic_inc(&sbi->s_count);
864 up_read(&nilfs->ns_super_sem);
865 return sbi;
866} 696}
867 697
868int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 698struct nilfs_root *
869 int snapshot_mount) 699nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
870{ 700{
871 struct nilfs_sb_info *sbi; 701 struct rb_node **p, *parent;
872 int ret = 0; 702 struct nilfs_root *root, *new;
873 703
874 down_read(&nilfs->ns_super_sem); 704 root = nilfs_lookup_root(nilfs, cno);
875 if (cno == 0 || cno > nilfs->ns_cno) 705 if (root)
876 goto out_unlock; 706 return root;
877 707
878 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { 708 new = kmalloc(sizeof(*root), GFP_KERNEL);
879 if (sbi->s_snapshot_cno == cno && 709 if (!new)
880 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { 710 return NULL;
881 /* exclude read-only mounts */ 711
882 ret++; 712 spin_lock(&nilfs->ns_cptree_lock);
883 break; 713
714 p = &nilfs->ns_cptree.rb_node;
715 parent = NULL;
716
717 while (*p) {
718 parent = *p;
719 root = rb_entry(parent, struct nilfs_root, rb_node);
720
721 if (cno < root->cno) {
722 p = &(*p)->rb_left;
723 } else if (cno > root->cno) {
724 p = &(*p)->rb_right;
725 } else {
726 atomic_inc(&root->count);
727 spin_unlock(&nilfs->ns_cptree_lock);
728 kfree(new);
729 return root;
884 } 730 }
885 } 731 }
886 /* for protecting recent checkpoints */
887 if (cno >= nilfs_last_cno(nilfs))
888 ret++;
889 732
890 out_unlock: 733 new->cno = cno;
891 up_read(&nilfs->ns_super_sem); 734 new->ifile = NULL;
892 return ret; 735 new->nilfs = nilfs;
736 atomic_set(&new->count, 1);
737 atomic_set(&new->inodes_count, 0);
738 atomic_set(&new->blocks_count, 0);
739
740 rb_link_node(&new->rb_node, parent, p);
741 rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
742
743 spin_unlock(&nilfs->ns_cptree_lock);
744
745 return new;
746}
747
748void nilfs_put_root(struct nilfs_root *root)
749{
750 if (atomic_dec_and_test(&root->count)) {
751 struct the_nilfs *nilfs = root->nilfs;
752
753 spin_lock(&nilfs->ns_cptree_lock);
754 rb_erase(&root->rb_node, &nilfs->ns_cptree);
755 spin_unlock(&nilfs->ns_cptree_lock);
756 if (root->ifile)
757 iput(root->ifile);
758
759 kfree(root);
760 }
893} 761}
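nilfs_find_or_create_root() above is a lock-friendly find-or-create: the candidate is allocated with GFP_KERNEL before the spinlock is taken (the allocation may sleep), the search runs under the lock, and the spare is discarded if another thread inserted the same checkpoint first. Stripped to the rbtree mechanics, as a sketch with a plain refcount managed under the lock:

	#include <linux/rbtree.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct cnode {
		__u64 key;
		struct rb_node rb;
		int count;	/* refcount, managed under *lock here */
	};

	static struct cnode *find_or_create(struct rb_root *tree,
					    spinlock_t *lock, __u64 key)
	{
		struct rb_node **p, *parent = NULL;
		struct cnode *n, *new;

		new = kmalloc(sizeof(*new), GFP_KERNEL); /* may sleep: unlocked */
		if (!new)
			return NULL;

		spin_lock(lock);
		p = &tree->rb_node;
		while (*p) {
			parent = *p;
			n = rb_entry(parent, struct cnode, rb);
			if (key < n->key) {
				p = &(*p)->rb_left;
			} else if (key > n->key) {
				p = &(*p)->rb_right;
			} else {
				n->count++;		/* lost the race: reuse */
				spin_unlock(lock);
				kfree(new);		/* discard the spare */
				return n;
			}
		}
		new->key = key;
		new->count = 1;
		rb_link_node(&new->rb, parent, p);	/* link at the free slot */
		rb_insert_color(&new->rb, tree);	/* rebalance */
		spin_unlock(lock);
		return new;
	}

The matching put side, as nilfs_put_root() shows, erases the node from the tree under the same lock once the last reference is dropped.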
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f785a7b0ab99..fd85e4c05c6b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/rbtree.h>
29#include <linux/fs.h> 30#include <linux/fs.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
@@ -35,8 +36,6 @@
35/* the_nilfs struct */ 36/* the_nilfs struct */
36enum { 37enum {
37 THE_NILFS_INIT = 0, /* Information from super_block is set */ 38 THE_NILFS_INIT = 0, /* Information from super_block is set */
38 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
39 the latest checkpoint was loaded */
40 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
41 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
42 THE_NILFS_SB_DIRTY, /* super block is dirty */ 41 THE_NILFS_SB_DIRTY, /* super block is dirty */
@@ -45,22 +44,13 @@ enum {
45/** 44/**
46 * struct the_nilfs - struct to supervise multiple nilfs mount points 45 * struct the_nilfs - struct to supervise multiple nilfs mount points
47 * @ns_flags: flags 46 * @ns_flags: flags
48 * @ns_count: reference count
49 * @ns_list: list head for nilfs_list
50 * @ns_bdev: block device 47 * @ns_bdev: block device
51 * @ns_bdi: backing dev info
52 * @ns_writer: back pointer to writable nilfs_sb_info
53 * @ns_sem: semaphore for shared states 48 * @ns_sem: semaphore for shared states
54 * @ns_super_sem: semaphore for global operations across super block instances
55 * @ns_mount_mutex: mutex protecting mount process of nilfs
56 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 49 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 50 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super block 51 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block 52 * @ns_sbwcount: write count of super block
62 * @ns_sbsize: size of valid data in super block 53 * @ns_sbsize: size of valid data in super block
63 * @ns_supers: list of nilfs super block structs
64 * @ns_seg_seq: segment sequence counter 54 * @ns_seg_seq: segment sequence counter
65 * @ns_segnum: index number of the latest full segment. 55 * @ns_segnum: index number of the latest full segment.
66 * @ns_nextnum: index number of the full segment index to be used next 56 * @ns_nextnum: index number of the full segment index to be used next
@@ -79,9 +69,9 @@ enum {
79 * @ns_dat: DAT file inode 69 * @ns_dat: DAT file inode
80 * @ns_cpfile: checkpoint file inode 70 * @ns_cpfile: checkpoint file inode
81 * @ns_sufile: segusage file inode 71 * @ns_sufile: segusage file inode
82 * @ns_gc_dat: shadow inode of the DAT file inode for GC 72 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
73 * @ns_cptree_lock: lock protecting @ns_cptree
83 * @ns_gc_inodes: dummy inodes to keep live blocks 74 * @ns_gc_inodes: dummy inodes to keep live blocks
84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
85 * @ns_blocksize_bits: bit length of block size 75 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size 76 * @ns_blocksize: block size
87 * @ns_nsegments: number of segments in filesystem 77 * @ns_nsegments: number of segments in filesystem
@@ -95,22 +85,9 @@ enum {
95 */ 85 */
96struct the_nilfs { 86struct the_nilfs {
97 unsigned long ns_flags; 87 unsigned long ns_flags;
98 atomic_t ns_count;
99 struct list_head ns_list;
100 88
101 struct block_device *ns_bdev; 89 struct block_device *ns_bdev;
102 struct backing_dev_info *ns_bdi;
103 struct nilfs_sb_info *ns_writer;
104 struct rw_semaphore ns_sem; 90 struct rw_semaphore ns_sem;
105 struct rw_semaphore ns_super_sem;
106 struct mutex ns_mount_mutex;
107 struct rw_semaphore ns_writer_sem;
108
109 /*
110 * components protected by ns_super_sem
111 */
112 struct nilfs_sb_info *ns_current;
113 struct list_head ns_supers;
114 91
115 /* 92 /*
116 * used for 93 * used for
@@ -163,11 +140,13 @@ struct the_nilfs {
163 struct inode *ns_dat; 140 struct inode *ns_dat;
164 struct inode *ns_cpfile; 141 struct inode *ns_cpfile;
165 struct inode *ns_sufile; 142 struct inode *ns_sufile;
166 struct inode *ns_gc_dat;
167 143
168 /* GC inode list and hash table head */ 144 /* Checkpoint tree */
145 struct rb_root ns_cptree;
146 spinlock_t ns_cptree_lock;
147
148 /* GC inode list */
169 struct list_head ns_gc_inodes; 149 struct list_head ns_gc_inodes;
170 struct hlist_head *ns_gc_inodes_h;
171 150
172 /* Disk layout information (static) */ 151 /* Disk layout information (static) */
173 unsigned int ns_blocksize_bits; 152 unsigned int ns_blocksize_bits;
@@ -182,9 +161,6 @@ struct the_nilfs {
182 u32 ns_crc_seed; 161 u32 ns_crc_seed;
183}; 162};
184 163
185#define NILFS_GCINODE_HASH_BITS 8
186#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
187
188#define THE_NILFS_FNS(bit, name) \ 164#define THE_NILFS_FNS(bit, name) \
189static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ 165static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
190{ \ 166{ \
@@ -200,11 +176,36 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
200} 176}
201 177
202THE_NILFS_FNS(INIT, init) 178THE_NILFS_FNS(INIT, init)
203THE_NILFS_FNS(LOADED, loaded)
204THE_NILFS_FNS(DISCONTINUED, discontinued) 179THE_NILFS_FNS(DISCONTINUED, discontinued)
205THE_NILFS_FNS(GC_RUNNING, gc_running) 180THE_NILFS_FNS(GC_RUNNING, gc_running)
206THE_NILFS_FNS(SB_DIRTY, sb_dirty) 181THE_NILFS_FNS(SB_DIRTY, sb_dirty)
207 182
183/**
184 * struct nilfs_root - nilfs root object
185 * @cno: checkpoint number
186 * @rb_node: red-black tree node
187 * @count: refcount of this structure
188 * @nilfs: nilfs object
189 * @ifile: inode file
190 * @root: root inode
191 * @inodes_count: number of inodes
192 * @blocks_count: number of blocks (Reserved)
193 */
194struct nilfs_root {
195 __u64 cno;
196 struct rb_node rb_node;
197
198 atomic_t count;
199 struct the_nilfs *nilfs;
200 struct inode *ifile;
201
202 atomic_t inodes_count;
203 atomic_t blocks_count;
204};
205
206/* Special checkpoint number */
207#define NILFS_CPTREE_CURRENT_CNO 0
208
208/* Minimum interval of periodical update of superblocks (in seconds) */ 209/* Minimum interval of periodical update of superblocks (in seconds) */
209#define NILFS_SB_FREQ 10 210#define NILFS_SB_FREQ 10
210 211
@@ -221,46 +222,25 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
221} 222}
222 223
223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 224void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
224struct the_nilfs *find_or_create_nilfs(struct block_device *); 225struct the_nilfs *alloc_nilfs(struct block_device *bdev);
225void put_nilfs(struct the_nilfs *); 226void destroy_nilfs(struct the_nilfs *nilfs);
226int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 227int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
227int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 228int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
228int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 229int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
229int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 230int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
231struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
232struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
233 __u64 cno);
234void nilfs_put_root(struct nilfs_root *root);
230struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 235struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
231int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
232int nilfs_near_disk_full(struct the_nilfs *); 236int nilfs_near_disk_full(struct the_nilfs *);
233void nilfs_fall_back_super_block(struct the_nilfs *); 237void nilfs_fall_back_super_block(struct the_nilfs *);
234void nilfs_swap_super_block(struct the_nilfs *); 238void nilfs_swap_super_block(struct the_nilfs *);
235 239
236 240
237static inline void get_nilfs(struct the_nilfs *nilfs) 241static inline void nilfs_get_root(struct nilfs_root *root)
238{
239 /* Caller must have at least one reference of the_nilfs. */
240 atomic_inc(&nilfs->ns_count);
241}
242
243static inline void
244nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
245{
246 down_write(&nilfs->ns_writer_sem);
247 nilfs->ns_writer = sbi;
248 up_write(&nilfs->ns_writer_sem);
249}
250
251static inline void
252nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
253{
254 down_write(&nilfs->ns_writer_sem);
255 if (sbi == nilfs->ns_writer)
256 nilfs->ns_writer = NULL;
257 up_write(&nilfs->ns_writer_sem);
258}
259
260static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
261{ 242{
262 if (atomic_dec_and_test(&sbi->s_count)) 243 atomic_inc(&root->count);
263 kfree(sbi);
264} 244}
265 245
266static inline int nilfs_valid_fs(struct the_nilfs *nilfs) 246static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
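
The the_nilfs.h changes above replace the old global mount bookkeeping (ns_count, ns_list, ns_supers, ns_current, ns_writer) with a per-device red-black tree of mounted checkpoints, each represented by a refcounted struct nilfs_root. A minimal sketch of a lookup over that tree, assuming the ns_cptree_lock shown above and the stock <linux/rbtree.h> helpers; it mirrors the nilfs_lookup_root() prototype but is illustrative rather than the driver's verbatim code:

/*
 * Sketch (as it would live in the_nilfs.c): find a mounted checkpoint
 * by number in ns_cptree and take a reference under ns_cptree_lock.
 */
static struct nilfs_root *lookup_root_sketch(struct the_nilfs *nilfs,
					     __u64 cno)
{
	struct rb_node *n;
	struct nilfs_root *root;

	spin_lock(&nilfs->ns_cptree_lock);
	n = nilfs->ns_cptree.rb_node;
	while (n) {
		root = rb_entry(n, struct nilfs_root, rb_node);
		if (cno < root->cno) {
			n = n->rb_left;
		} else if (cno > root->cno) {
			n = n->rb_right;
		} else {
			atomic_inc(&root->count);  /* i.e. nilfs_get_root() */
			spin_unlock(&nilfs->ns_cptree_lock);
			return root;
		}
	}
	spin_unlock(&nilfs->ns_cptree_lock);
	return NULL;
}

NILFS_CPTREE_CURRENT_CNO (0) gives the current writable mount a fixed key, so it and the read-only snapshot mounts at real checkpoint numbers share the same tree.
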
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d3467..6e40e42a43de 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
19 19
20const struct file_operations def_blk_fops = { 20const struct file_operations def_blk_fops = {
21 .open = no_blkdev_open, 21 .open = no_blkdev_open,
22 .llseek = noop_llseek,
22}; 23};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index b388443c3a09..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6#source "fs/notify/fanotify/Kconfig" 6source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b9..7dceff005a67 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
6 ---help--- 6 ---help---
 7 Say Y here to enable fanotify support. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 and open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
11 11
12 If unsure, say Y. 12 If unsure, say Y.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 85366c78cc37..f35794b97e8e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
92 92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94 94
95 wait_event(group->fanotify_data.access_waitq, event->response); 95 wait_event(group->fanotify_data.access_waitq, event->response ||
96 atomic_read(&group->fanotify_data.bypass_perm));
97
98 if (!event->response) /* bypass_perm set */
99 return 0;
96 100
97 /* userspace responded, convert to something usable */ 101 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock); 102 spin_lock(&event->lock);
@@ -131,6 +135,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); 135 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); 136 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); 137 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
138 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
134 139
135 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 140 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
136 141
@@ -160,20 +165,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
160 __u32 event_mask, void *data, int data_type) 165 __u32 event_mask, void *data, int data_type)
161{ 166{
162 __u32 marks_mask, marks_ignored_mask; 167 __u32 marks_mask, marks_ignored_mask;
168 struct path *path = data;
163 169
164 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 170 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 171 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type); 172 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167 173
168 /* sorry, fanotify only gives a damn about files and dirs */
169 if (!S_ISREG(to_tell->i_mode) &&
170 !S_ISDIR(to_tell->i_mode))
171 return false;
172
173 /* if we don't have enough info to send an event to userspace say no */ 174 /* if we don't have enough info to send an event to userspace say no */
174 if (data_type != FSNOTIFY_EVENT_PATH) 175 if (data_type != FSNOTIFY_EVENT_PATH)
175 return false; 176 return false;
176 177
178 /* sorry, fanotify only gives a damn about files and dirs */
179 if (!S_ISREG(path->dentry->d_inode->i_mode) &&
180 !S_ISDIR(path->dentry->d_inode->i_mode))
181 return false;
182
177 if (inode_mark && vfsmnt_mark) { 183 if (inode_mark && vfsmnt_mark) {
178 marks_mask = (vfsmnt_mark->mask | inode_mark->mask); 184 marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
179 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); 185 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
@@ -194,16 +200,29 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
194 BUG(); 200 BUG();
195 } 201 }
196 202
203 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
204 (marks_ignored_mask & FS_ISDIR))
205 return false;
206
197 if (event_mask & marks_mask & ~marks_ignored_mask) 207 if (event_mask & marks_mask & ~marks_ignored_mask)
198 return true; 208 return true;
199 209
200 return false; 210 return false;
201} 211}
202 212
213static void fanotify_free_group_priv(struct fsnotify_group *group)
214{
215 struct user_struct *user;
216
217 user = group->fanotify_data.user;
218 atomic_dec(&user->fanotify_listeners);
219 free_uid(user);
220}
221
203const struct fsnotify_ops fanotify_fsnotify_ops = { 222const struct fsnotify_ops fanotify_fsnotify_ops = {
204 .handle_event = fanotify_handle_event, 223 .handle_event = fanotify_handle_event,
205 .should_send_event = fanotify_should_send_event, 224 .should_send_event = fanotify_should_send_event,
206 .free_group_priv = NULL, 225 .free_group_priv = fanotify_free_group_priv,
207 .free_event_priv = NULL, 226 .free_event_priv = NULL,
208 .freeing_mark = NULL, 227 .freeing_mark = NULL,
209}; 228};
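
fanotify_should_send_event() now derives the inode from the passed-in path, which is why the data_type check moves ahead of the S_ISREG/S_ISDIR test, and it honors an FS_ISDIR bit in the ignored mask so directory events can be suppressed. A sketch of the resulting decision, with the mark bookkeeping flattened into plain mask arguments; the helper name is illustrative:

/*
 * Sketch of the check fanotify_should_send_event() now applies; the
 * real code reads the masks out of the inode and vfsmount marks.
 */
static bool should_send_sketch(__u32 event_mask, bool is_dir,
			       __u32 marks_mask, __u32 marks_ignored_mask)
{
	/* a directory can be opted out wholesale via the ignored mask */
	if (is_dir && (marks_ignored_mask & FS_ISDIR))
		return false;

	/* deliver only bits that are both watched and not ignored */
	return (event_mask & marks_mask & ~marks_ignored_mask) != 0;
}
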
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5ed8e58d7bfc..8b61220cffc5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,10 @@
16 16
17#include <asm/ioctls.h> 17#include <asm/ioctls.h>
18 18
19#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
20#define FANOTIFY_DEFAULT_MAX_MARKS 8192
21#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
22
19extern const struct fsnotify_ops fanotify_fsnotify_ops; 23extern const struct fsnotify_ops fanotify_fsnotify_ops;
20 24
21static struct kmem_cache *fanotify_mark_cache __read_mostly; 25static struct kmem_cache *fanotify_mark_cache __read_mostly;
@@ -102,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
102 return client_fd; 106 return client_fd;
103} 107}
104 108
105static ssize_t fill_event_metadata(struct fsnotify_group *group, 109static int fill_event_metadata(struct fsnotify_group *group,
106 struct fanotify_event_metadata *metadata, 110 struct fanotify_event_metadata *metadata,
107 struct fsnotify_event *event) 111 struct fsnotify_event *event)
108{ 112{
113 int ret = 0;
114
109 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
110 group, metadata, event); 116 group, metadata, event);
111 117
112 metadata->event_len = FAN_EVENT_METADATA_LEN; 118 metadata->event_len = FAN_EVENT_METADATA_LEN;
119 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
113 metadata->vers = FANOTIFY_METADATA_VERSION; 120 metadata->vers = FANOTIFY_METADATA_VERSION;
114 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 121 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
115 metadata->pid = pid_vnr(event->tgid); 122 metadata->pid = pid_vnr(event->tgid);
116 metadata->fd = create_fd(group, event); 123 if (unlikely(event->mask & FAN_Q_OVERFLOW))
124 metadata->fd = FAN_NOFD;
125 else {
126 metadata->fd = create_fd(group, event);
127 if (metadata->fd < 0)
128 ret = metadata->fd;
129 }
117 130
118 return metadata->fd; 131 return ret;
119} 132}
120 133
121#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 134#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -196,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
196 209
197 mutex_lock(&group->fanotify_data.access_mutex); 210 mutex_lock(&group->fanotify_data.access_mutex);
198 211
199 if (group->fanotify_data.bypass_perm) { 212 if (atomic_read(&group->fanotify_data.bypass_perm)) {
200 mutex_unlock(&group->fanotify_data.access_mutex); 213 mutex_unlock(&group->fanotify_data.access_mutex);
201 kmem_cache_free(fanotify_response_event_cache, re); 214 kmem_cache_free(fanotify_response_event_cache, re);
202 event->response = FAN_ALLOW; 215 event->response = FAN_ALLOW;
@@ -253,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
253 266
254 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 267 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
255 268
256 fd = fill_event_metadata(group, &fanotify_event_metadata, event); 269 ret = fill_event_metadata(group, &fanotify_event_metadata, event);
257 if (fd < 0) 270 if (ret < 0)
258 return fd; 271 goto out;
259 272
273 fd = fanotify_event_metadata.fd;
260 ret = prepare_for_access_response(group, event, fd); 274 ret = prepare_for_access_response(group, event, fd);
261 if (ret) 275 if (ret)
262 goto out_close_fd; 276 goto out_close_fd;
263 277
264 ret = -EFAULT; 278 ret = -EFAULT;
265 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) 279 if (copy_to_user(buf, &fanotify_event_metadata,
280 fanotify_event_metadata.event_len))
266 goto out_kill_access_response; 281 goto out_kill_access_response;
267 282
268 return FAN_EVENT_METADATA_LEN; 283 return fanotify_event_metadata.event_len;
269 284
270out_kill_access_response: 285out_kill_access_response:
271 remove_access_response(group, event, fd); 286 remove_access_response(group, event, fd);
272out_close_fd: 287out_close_fd:
273 sys_close(fd); 288 if (fd != FAN_NOFD)
289 sys_close(fd);
290out:
291#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
292 if (event->mask & FAN_ALL_PERM_EVENTS) {
293 event->response = FAN_DENY;
294 wake_up(&group->fanotify_data.access_waitq);
295 }
296#endif
274 return ret; 297 return ret;
275} 298}
276 299
@@ -326,7 +349,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
326 ret = -EAGAIN; 349 ret = -EAGAIN;
327 if (file->f_flags & O_NONBLOCK) 350 if (file->f_flags & O_NONBLOCK)
328 break; 351 break;
329 ret = -EINTR; 352 ret = -ERESTARTSYS;
330 if (signal_pending(current)) 353 if (signal_pending(current))
331 break; 354 break;
332 355
@@ -372,14 +395,13 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
372static int fanotify_release(struct inode *ignored, struct file *file) 395static int fanotify_release(struct inode *ignored, struct file *file)
373{ 396{
374 struct fsnotify_group *group = file->private_data; 397 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
376
377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
378 398
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 399#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
400 struct fanotify_response_event *re, *lre;
401
380 mutex_lock(&group->fanotify_data.access_mutex); 402 mutex_lock(&group->fanotify_data.access_mutex);
381 403
382 group->fanotify_data.bypass_perm = true; 404 atomic_inc(&group->fanotify_data.bypass_perm);
383 405
384 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 406 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
385 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 407 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -433,6 +455,7 @@ static const struct file_operations fanotify_fops = {
433 .release = fanotify_release, 455 .release = fanotify_release,
434 .unlocked_ioctl = fanotify_ioctl, 456 .unlocked_ioctl = fanotify_ioctl,
435 .compat_ioctl = fanotify_ioctl, 457 .compat_ioctl = fanotify_ioctl,
458 .llseek = noop_llseek,
436}; 459};
437 460
438static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) 461static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
@@ -553,18 +576,24 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
553 __u32 mask, 576 __u32 mask,
554 unsigned int flags) 577 unsigned int flags)
555{ 578{
556 __u32 oldmask; 579 __u32 oldmask = -1;
557 580
558 spin_lock(&fsn_mark->lock); 581 spin_lock(&fsn_mark->lock);
559 if (!(flags & FAN_MARK_IGNORED_MASK)) { 582 if (!(flags & FAN_MARK_IGNORED_MASK)) {
560 oldmask = fsn_mark->mask; 583 oldmask = fsn_mark->mask;
561 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); 584 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
562 } else { 585 } else {
563 oldmask = fsn_mark->ignored_mask; 586 __u32 tmask = fsn_mark->ignored_mask | mask;
564 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask)); 587 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
565 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 588 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
566 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 589 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
567 } 590 }
591
592 if (!(flags & FAN_MARK_ONDIR)) {
593 __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
594 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
595 }
596
568 spin_unlock(&fsn_mark->lock); 597 spin_unlock(&fsn_mark->lock);
569 598
570 return mask & ~oldmask; 599 return mask & ~oldmask;
@@ -576,10 +605,12 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
576{ 605{
577 struct fsnotify_mark *fsn_mark; 606 struct fsnotify_mark *fsn_mark;
578 __u32 added; 607 __u32 added;
608 int ret = 0;
579 609
580 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 610 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
581 if (!fsn_mark) { 611 if (!fsn_mark) {
582 int ret; 612 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
613 return -ENOSPC;
583 614
584 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 615 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
585 if (!fsn_mark) 616 if (!fsn_mark)
@@ -587,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
587 618
588 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 619 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
589 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); 620 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
590 if (ret) { 621 if (ret)
591 fanotify_free_mark(fsn_mark); 622 goto err;
592 return ret;
593 }
594 } 623 }
595 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 624 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
596 fsnotify_put_mark(fsn_mark); 625
597 if (added & ~mnt->mnt_fsnotify_mask) 626 if (added & ~mnt->mnt_fsnotify_mask)
598 fsnotify_recalc_vfsmount_mask(mnt); 627 fsnotify_recalc_vfsmount_mask(mnt);
599 628err:
600 return 0; 629 fsnotify_put_mark(fsn_mark);
630 return ret;
601} 631}
602 632
603static int fanotify_add_inode_mark(struct fsnotify_group *group, 633static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -606,12 +636,24 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
606{ 636{
607 struct fsnotify_mark *fsn_mark; 637 struct fsnotify_mark *fsn_mark;
608 __u32 added; 638 __u32 added;
639 int ret = 0;
609 640
610 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 641 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
611 642
643 /*
644 * If some other task has this inode open for write we should not add
645 * an ignored mark, unless that ignored mark is supposed to survive
646 * modification changes anyway.
647 */
648 if ((flags & FAN_MARK_IGNORED_MASK) &&
649 !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
650 (atomic_read(&inode->i_writecount) > 0))
651 return 0;
652
612 fsn_mark = fsnotify_find_inode_mark(group, inode); 653 fsn_mark = fsnotify_find_inode_mark(group, inode);
613 if (!fsn_mark) { 654 if (!fsn_mark) {
614 int ret; 655 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
656 return -ENOSPC;
615 657
616 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 658 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
617 if (!fsn_mark) 659 if (!fsn_mark)
@@ -619,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
619 661
620 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 662 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
621 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); 663 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
622 if (ret) { 664 if (ret)
623 fanotify_free_mark(fsn_mark); 665 goto err;
624 return ret;
625 }
626 } 666 }
627 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 667 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
628 fsnotify_put_mark(fsn_mark); 668
629 if (added & ~inode->i_fsnotify_mask) 669 if (added & ~inode->i_fsnotify_mask)
630 fsnotify_recalc_inode_mask(inode); 670 fsnotify_recalc_inode_mask(inode);
631 return 0; 671err:
672 fsnotify_put_mark(fsn_mark);
673 return ret;
632} 674}
633 675
634/* fanotify syscalls */ 676/* fanotify syscalls */
@@ -636,6 +678,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
636{ 678{
637 struct fsnotify_group *group; 679 struct fsnotify_group *group;
638 int f_flags, fd; 680 int f_flags, fd;
681 struct user_struct *user;
639 682
640 pr_debug("%s: flags=%d event_f_flags=%d\n", 683 pr_debug("%s: flags=%d event_f_flags=%d\n",
641 __func__, flags, event_f_flags); 684 __func__, flags, event_f_flags);
@@ -646,6 +689,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
646 if (flags & ~FAN_ALL_INIT_FLAGS) 689 if (flags & ~FAN_ALL_INIT_FLAGS)
647 return -EINVAL; 690 return -EINVAL;
648 691
692 user = get_current_user();
693 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
694 free_uid(user);
695 return -EMFILE;
696 }
697
649 f_flags = O_RDWR | FMODE_NONOTIFY; 698 f_flags = O_RDWR | FMODE_NONOTIFY;
650 if (flags & FAN_CLOEXEC) 699 if (flags & FAN_CLOEXEC)
651 f_flags |= O_CLOEXEC; 700 f_flags |= O_CLOEXEC;
@@ -654,15 +703,53 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
654 703
655 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 704 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
656 group = fsnotify_alloc_group(&fanotify_fsnotify_ops); 705 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
657 if (IS_ERR(group)) 706 if (IS_ERR(group)) {
707 free_uid(user);
658 return PTR_ERR(group); 708 return PTR_ERR(group);
709 }
710
711 group->fanotify_data.user = user;
712 atomic_inc(&user->fanotify_listeners);
659 713
660 group->fanotify_data.f_flags = event_f_flags; 714 group->fanotify_data.f_flags = event_f_flags;
661#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 715#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
662 mutex_init(&group->fanotify_data.access_mutex); 716 mutex_init(&group->fanotify_data.access_mutex);
663 init_waitqueue_head(&group->fanotify_data.access_waitq); 717 init_waitqueue_head(&group->fanotify_data.access_waitq);
664 INIT_LIST_HEAD(&group->fanotify_data.access_list); 718 INIT_LIST_HEAD(&group->fanotify_data.access_list);
719 atomic_set(&group->fanotify_data.bypass_perm, 0);
665#endif 720#endif
721 switch (flags & FAN_ALL_CLASS_BITS) {
722 case FAN_CLASS_NOTIF:
723 group->priority = FS_PRIO_0;
724 break;
725 case FAN_CLASS_CONTENT:
726 group->priority = FS_PRIO_1;
727 break;
728 case FAN_CLASS_PRE_CONTENT:
729 group->priority = FS_PRIO_2;
730 break;
731 default:
732 fd = -EINVAL;
733 goto out_put_group;
734 }
735
736 if (flags & FAN_UNLIMITED_QUEUE) {
737 fd = -EPERM;
738 if (!capable(CAP_SYS_ADMIN))
739 goto out_put_group;
740 group->max_events = UINT_MAX;
741 } else {
742 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
743 }
744
745 if (flags & FAN_UNLIMITED_MARKS) {
746 fd = -EPERM;
747 if (!capable(CAP_SYS_ADMIN))
748 goto out_put_group;
749 group->fanotify_data.max_marks = UINT_MAX;
750 } else {
751 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
752 }
666 753
667 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); 754 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
668 if (fd < 0) 755 if (fd < 0)
@@ -696,13 +783,21 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
696 if (flags & ~FAN_ALL_MARK_FLAGS) 783 if (flags & ~FAN_ALL_MARK_FLAGS)
697 return -EINVAL; 784 return -EINVAL;
698 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 785 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
699 case FAN_MARK_ADD: 786 case FAN_MARK_ADD: /* fallthrough */
700 case FAN_MARK_REMOVE: 787 case FAN_MARK_REMOVE:
788 if (!mask)
789 return -EINVAL;
701 case FAN_MARK_FLUSH: 790 case FAN_MARK_FLUSH:
702 break; 791 break;
703 default: 792 default:
704 return -EINVAL; 793 return -EINVAL;
705 } 794 }
795
796 if (mask & FAN_ONDIR) {
797 flags |= FAN_MARK_ONDIR;
798 mask &= ~FAN_ONDIR;
799 }
800
706#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 801#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
707 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) 802 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
708#else 803#else
@@ -718,6 +813,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
718 ret = -EINVAL; 813 ret = -EINVAL;
719 if (unlikely(filp->f_op != &fanotify_fops)) 814 if (unlikely(filp->f_op != &fanotify_fops))
720 goto fput_and_out; 815 goto fput_and_out;
816 group = filp->private_data;
817
818 /*
819 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
820 * allowed to set permissions events.
821 */
822 ret = -EINVAL;
823 if (mask & FAN_ALL_PERM_EVENTS &&
824 group->priority == FS_PRIO_0)
825 goto fput_and_out;
721 826
722 ret = fanotify_find_path(dfd, pathname, &path, flags); 827 ret = fanotify_find_path(dfd, pathname, &path, flags);
723 if (ret) 828 if (ret)
@@ -728,7 +833,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
728 inode = path.dentry->d_inode; 833 inode = path.dentry->d_inode;
729 else 834 else
730 mnt = path.mnt; 835 mnt = path.mnt;
731 group = filp->private_data;
732 836
733 /* create/update an inode mark */ 837 /* create/update an inode mark */
734 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 838 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
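
Taken together, the fanotify_init() changes introduce listener classes (FAN_CLASS_NOTIF, FAN_CLASS_CONTENT and FAN_CLASS_PRE_CONTENT, mapped to FS_PRIO_0/1/2), a per-user listener limit, and capped event queues and mark counts. A userspace sketch of how a content-class listener would drive this, assuming a libc that ships the <sys/fanotify.h> wrappers and a kernel carrying these patches (FAN_NOFD in particular is new here); error handling is abbreviated and CAP_SYS_ADMIN is required:

/*
 * Sketch: a FAN_CLASS_CONTENT listener that allows every FAN_OPEN_PERM
 * request on the mount containing /tmp. Version checks and poll() are
 * omitted for brevity.
 */
#include <fcntl.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	struct fanotify_event_metadata ev;
	struct fanotify_response resp;
	int fd;

	fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_CONTENT, O_RDONLY);
	if (fd < 0)
		return 1;

	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			  FAN_OPEN_PERM, AT_FDCWD, "/tmp") < 0)
		return 1;

	/* one event per read(); event_len == sizeof(ev) in this version */
	while (read(fd, &ev, sizeof(ev)) == (ssize_t)sizeof(ev)) {
		if (ev.fd < 0)		/* FAN_NOFD: queue overflow event */
			continue;
		resp.fd = ev.fd;
		resp.response = FAN_ALLOW;	/* or FAN_DENY to block */
		write(fd, &resp, sizeof(resp));
		close(ev.fd);
	}
	return 0;
}

A group created with plain FAN_CLASS_NOTIF would get -EINVAL from the fanotify_mark() call above, per the new FS_PRIO_0 check in the fanotify_mark syscall.
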
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..79b47cbb5cd8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
59 /* determine if the children should tell inode about their events */ 59 /* determine if the children should tell inode about their events */
60 watched = fsnotify_inode_watches_children(inode); 60 watched = fsnotify_inode_watches_children(inode);
61 61
62 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
63 /* run all of the dentries associated with this inode. Since this is a 63 /* run all of the dentries associated with this inode. Since this is a
64 * directory, there damn well better only be one item on this list */ 64 * directory, there damn well better only be one item on this list */
65 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 65 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -68,75 +68,57 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
68 /* run all of the children of the original inode and fix their 68 /* run all of the children of the original inode and fix their
69 * d_flags to indicate parental interest (their parent is the 69 * d_flags to indicate parental interest (their parent is the
70 * original inode) */ 70 * original inode) */
71 spin_lock(&alias->d_lock);
71 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { 72 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
72 if (!child->d_inode) 73 if (!child->d_inode)
73 continue; 74 continue;
74 75
75 spin_lock(&child->d_lock); 76 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
76 if (watched) 77 if (watched)
77 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; 78 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
78 else 79 else
79 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; 80 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
80 spin_unlock(&child->d_lock); 81 spin_unlock(&child->d_lock);
81 } 82 }
83 spin_unlock(&alias->d_lock);
82 } 84 }
83 spin_unlock(&dcache_lock); 85 spin_unlock(&inode->i_lock);
84} 86}
85 87
86/* Notify this dentry's parent about a child's events. */ 88/* Notify this dentry's parent about a child's events. */
87void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) 89int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
88{ 90{
89 struct dentry *parent; 91 struct dentry *parent;
90 struct inode *p_inode; 92 struct inode *p_inode;
91 bool send = false; 93 int ret = 0;
92 bool should_update_children = false;
93 94
94 if (!dentry) 95 if (!dentry)
95 dentry = path->dentry; 96 dentry = path->dentry;
96 97
97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 98 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
98 return; 99 return 0;
99 100
100 spin_lock(&dentry->d_lock); 101 parent = dget_parent(dentry);
101 parent = dentry->d_parent;
102 p_inode = parent->d_inode; 102 p_inode = parent->d_inode;
103 103
104 if (fsnotify_inode_watches_children(p_inode)) { 104 if (unlikely(!fsnotify_inode_watches_children(p_inode)))
105 if (p_inode->i_fsnotify_mask & mask) { 105 __fsnotify_update_child_dentry_flags(p_inode);
106 dget(parent); 106 else if (p_inode->i_fsnotify_mask & mask) {
107 send = true;
108 }
109 } else {
110 /*
111 * The parent doesn't care about events on it's children but
112 * at least one child thought it did. We need to run all the
113 * children and update their d_flags to let them know p_inode
114 * doesn't care about them any more.
115 */
116 dget(parent);
117 should_update_children = true;
118 }
119
120 spin_unlock(&dentry->d_lock);
121
122 if (send) {
123 /* we are notifying a parent so come up with the new mask which 107 /* we are notifying a parent so come up with the new mask which
124 * specifies these are events which came from a child. */ 108 * specifies these are events which came from a child. */
125 mask |= FS_EVENT_ON_CHILD; 109 mask |= FS_EVENT_ON_CHILD;
126 110
127 if (path) 111 if (path)
128 fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, 112 ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
129 dentry->d_name.name, 0); 113 dentry->d_name.name, 0);
130 else 114 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 115 ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0); 116 dentry->d_name.name, 0);
133 dput(parent);
134 } 117 }
135 118
136 if (unlikely(should_update_children)) { 119 dput(parent);
137 __fsnotify_update_child_dentry_flags(p_inode); 120
138 dput(parent); 121 return ret;
139 }
140} 122}
141EXPORT_SYMBOL_GPL(__fsnotify_parent); 123EXPORT_SYMBOL_GPL(__fsnotify_parent);
142 124
@@ -275,20 +257,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
275 257
276 if (inode_group > vfsmount_group) { 258 if (inode_group > vfsmount_group) {
277 /* handle inode */ 259 /* handle inode */
278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, 260 ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
279 data_is, cookie, file_name, &event); 261 data_is, cookie, file_name, &event);
280 /* we didn't use the vfsmount_mark */ 262 /* we didn't use the vfsmount_mark */
281 vfsmount_group = NULL; 263 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) { 264 } else if (vfsmount_group > inode_group) {
283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, 265 ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
284 data_is, cookie, file_name, &event); 266 data_is, cookie, file_name, &event);
285 inode_group = NULL; 267 inode_group = NULL;
286 } else { 268 } else {
287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, 269 ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
288 mask, data, data_is, cookie, file_name, 270 mask, data, data_is, cookie, file_name,
289 &event); 271 &event);
290 } 272 }
291 273
274 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
275 goto out;
276
292 if (inode_group) 277 if (inode_group)
293 inode_node = srcu_dereference(inode_node->next, 278 inode_node = srcu_dereference(inode_node->next,
294 &fsnotify_mark_srcu); 279 &fsnotify_mark_srcu);
@@ -296,7 +281,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
296 vfsmount_node = srcu_dereference(vfsmount_node->next, 281 vfsmount_node = srcu_dereference(vfsmount_node->next,
297 &fsnotify_mark_srcu); 282 &fsnotify_mark_srcu);
298 } 283 }
299 284 ret = 0;
285out:
300 srcu_read_unlock(&fsnotify_mark_srcu, idx); 286 srcu_read_unlock(&fsnotify_mark_srcu, idx);
301 /* 287 /*
302 * fsnotify_create_event() took a reference so the event can't be cleaned 288 * fsnotify_create_event() took a reference so the event can't be cleaned
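
With fsnotify() and __fsnotify_parent() now returning the group's verdict, and permission events (ALL_FSNOTIFY_PERM_EVENTS) short-circuiting the mark walk as soon as a group objects, callers can fail the syscall that raised the event. A sketch of the caller-side contract this enables; the hook name is illustrative, not something this patch adds:

/*
 * Sketch of a permission hook built on the new return values.
 */
static inline int open_perm_sketch(struct file *file)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	int ret;

	ret = __fsnotify_parent(&file->f_path, NULL, FS_OPEN_PERM);
	if (ret)
		return ret;	/* a parent watcher denied the open */

	return fsnotify(inode, FS_OPEN_PERM, &file->f_path,
			FSNOTIFY_EVENT_PATH, NULL, 0);
}
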
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..4c29fcf557d1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
177 * Attach an initialized mark to a given inode. 177 * Attach an initialized mark to a given inode.
178 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
179 * event types should be delivered to which group and for which inodes. These 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory. 180 * marks are ordered according to priority, highest number first, and then by
181 * the group's location in memory.
181 */ 182 */
182int fsnotify_add_inode_mark(struct fsnotify_mark *mark, 183int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
183 struct fsnotify_group *group, struct inode *inode, 184 struct fsnotify_group *group, struct inode *inode,
@@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
211 goto out; 212 goto out;
212 } 213 }
213 214
214 if (mark->group < lmark->group) 215 if (mark->group->priority < lmark->group->priority)
216 continue;
217
218 if ((mark->group->priority == lmark->group->priority) &&
219 (mark->group < lmark->group))
215 continue; 220 continue;
216 221
217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); 222 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
@@ -240,6 +245,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
240{ 245{
241 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
242 247
248 spin_lock(&inode_lock);
243 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
244 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
245 251
@@ -297,4 +303,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
297 303
298 spin_lock(&inode_lock); 304 spin_lock(&inode_lock);
299 } 305 }
306 spin_unlock(&inode_lock);
300} 307}
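
Both this hunk and the matching vfsmount_mark.c change below switch the mark lists from ordering by bare group address to ordering by group priority first, which is what guarantees that permission-class (higher-priority) groups are walked before plain notifiers. The rule, restated as a sketch comparator, assuming names not present in the patch:

/*
 * Sketch of the shared ordering rule; a group appears at most once per
 * list, so the address tie-break never compares equal pointers.
 */
static bool mark_goes_before(struct fsnotify_mark *new,
			     struct fsnotify_mark *old)
{
	if (new->group->priority != old->group->priority)
		return new->group->priority > old->group->priority;
	return new->group > old->group;
}
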
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bf7f6d776c31..4cd5d5d78f9f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -344,6 +344,7 @@ static const struct file_operations inotify_fops = {
344 .release = inotify_release, 344 .release = inotify_release,
345 .unlocked_ioctl = inotify_ioctl, 345 .unlocked_ioctl = inotify_ioctl,
346 .compat_ioctl = inotify_ioctl, 346 .compat_ioctl = inotify_ioctl,
347 .llseek = noop_llseek,
347}; 348};
348 349
349 350
@@ -751,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
751 if (ret >= 0) 752 if (ret >= 0)
752 return ret; 753 return ret;
753 754
755 fsnotify_put_group(group);
754 atomic_dec(&user->inotify_devs); 756 atomic_dec(&user->inotify_devs);
755out_free_uid: 757out_free_uid:
756 free_uid(user); 758 free_uid(user);
@@ -861,7 +863,7 @@ static int __init inotify_user_setup(void)
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); 863 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); 864 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); 865 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); 866 BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); 867 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866 868
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 869 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 56772b578fbd..85eebff6d0d7 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
169 goto out; 169 goto out;
170 } 170 }
171 171
172 if (mark->group < lmark->group) 172 if (mark->group->priority < lmark->group->priority)
173 continue;
174
175 if ((mark->group->priority == lmark->group->priority) &&
176 (mark->group < lmark->group))
173 continue; 177 continue;
174 178
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); 179 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be992544..4ff028fcfd6e 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a4..f4b1057abdd2 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour. 1381 * single-segment behaviour.
1382 * 1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both 1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * when atomic and when not atomic. This is ok because 1384 * atomic and when not atomic. This is ok because it calls
1385 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() 1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * and it is ok to call this when non-atomic. 1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * Infact, the only difference between __copy_from_user_inatomic() and
1388 * __copy_from_user() is that the latter calls might_sleep() and the former 1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1389 * should not zero the tail of the buffer on error. And on many 1388 * should not zero the tail of the buffer on error. And on many architectures
1390 * architectures __copy_from_user_inatomic() is just defined to 1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1391 * __copy_from_user() so it makes no difference at all on those architectures. 1390 * makes no difference at all on those architectures.
1392 */ 1391 */
1393static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1394 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1409 if (unlikely(copied != len)) { 1408 if (unlikely(copied != len)) {
1410 /* Do it the slow way. */ 1409 /* Do it the slow way. */
1411 addr = kmap(*pages); 1410 addr = kmap(*pages);
1412 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1413 *iov, *iov_ofs, len); 1412 ofs, *iov, *iov_ofs, len);
1414 /*
1415 * Zero the rest of the target like __copy_from_user().
1416 */
1417 memset(addr + ofs + copied, 0, len - copied);
1418 kunmap(*pages);
1419 if (unlikely(copied != len)) 1413 if (unlikely(copied != len))
1420 goto err_out; 1414 goto err_out;
1415 kunmap(*pages);
1421 } 1416 }
1422 total += len; 1417 total += len;
1418 ntfs_set_next_iovec(iov, iov_ofs, len);
1423 bytes -= len; 1419 bytes -= len;
1424 if (!bytes) 1420 if (!bytes)
1425 break; 1421 break;
1426 ntfs_set_next_iovec(iov, iov_ofs, len);
1427 ofs = 0; 1422 ofs = 0;
1428 } while (++pages < last_page); 1423 } while (++pages < last_page);
1429out: 1424out:
1430 return total; 1425 return total;
1431err_out: 1426err_out:
1432 total += copied; 1427 BUG_ON(copied > len);
1433 /* Zero the rest of the target like __copy_from_user(). */ 1428 /* Zero the rest of the target like __copy_from_user(). */
1429 memset(addr + ofs + copied, 0, len - copied);
1430 kunmap(*pages);
1431 total += copied;
1432 ntfs_set_next_iovec(iov, iov_ofs, copied);
1434 while (++pages < last_page) { 1433 while (++pages < last_page) {
1435 bytes -= len; 1434 bytes -= len;
1436 if (!bytes) 1435 if (!bytes)
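
The reworked loop depends on the fast/slow copy pattern the surrounding comment describes: attempt the atomic copy first, and only on a short copy retake the mapping with kmap() (which may sleep) before zeroing the uncopied tail, as the fixed err_out path now does. A condensed sketch of that pattern, with illustrative names and the two-argument kmap_atomic() of this kernel generation:

/*
 * Sketch of the fast/slow user-copy pattern; not the driver's code.
 */
static size_t copy_page_from_user_sketch(struct page *page, unsigned ofs,
					 const char __user *buf, size_t len)
{
	char *addr;
	size_t copied;

	addr = kmap_atomic(page, KM_USER0);
	copied = len - __copy_from_user_inatomic(addr + ofs, buf, len);
	kunmap_atomic(addr, KM_USER0);
	if (unlikely(copied != len)) {
		addr = kmap(page);	/* slow path: may sleep */
		copied = len - __copy_from_user(addr + ofs, buf, len);
		if (copied != len)	/* zero the tail, as err_out does */
			memset(addr + ofs + copied, 0, len - copied);
		kunmap(page);
	}
	return copied;
}
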
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc7..a627ed82c0a3 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
332 return NULL; 332 return NULL;
333} 333}
334 334
335static void ntfs_i_callback(struct rcu_head *head)
336{
337 struct inode *inode = container_of(head, struct inode, i_rcu);
338 INIT_LIST_HEAD(&inode->i_dentry);
339 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
340}
341
335void ntfs_destroy_big_inode(struct inode *inode) 342void ntfs_destroy_big_inode(struct inode *inode)
336{ 343{
337 ntfs_inode *ni = NTFS_I(inode); 344 ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
340 BUG_ON(ni->page); 347 BUG_ON(ni->page);
341 if (!atomic_dec_and_test(&ni->count)) 348 if (!atomic_dec_and_test(&ni->count))
342 BUG(); 349 BUG();
343 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); 350 call_rcu(&inode->i_rcu, ntfs_i_callback);
344} 351}
345 352
346static inline ntfs_inode *ntfs_alloc_extent_inode(void) 353static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b6727181..326e7475a22a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
2576 flush_dcache_page(page); 2576 flush_dcache_page(page);
2577 SetPageUptodate(page); 2577 SetPageUptodate(page);
2578 if (base_ni) { 2578 if (base_ni) {
2579 MFT_RECORD *m_tmp;
2580
2579 /* 2581 /*
2580 * Setup the base mft record in the extent mft record. This 2582 * Setup the base mft record in the extent mft record. This
2581 * completes initialization of the allocated extent mft record 2583 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
2588 * attach it to the base inode @base_ni and map, pin, and lock 2590 * attach it to the base inode @base_ni and map, pin, and lock
2589 * its, i.e. the allocated, mft record. 2591 * its, i.e. the allocated, mft record.
2590 */ 2592 */
2591 m = map_extent_mft_record(base_ni, bit, &ni); 2593 m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2592 if (IS_ERR(m)) { 2594 if (IS_ERR(m_tmp)) {
2593 ntfs_error(vol->sb, "Failed to map allocated extent " 2595 ntfs_error(vol->sb, "Failed to map allocated extent "
2594 "mft record 0x%llx.", (long long)bit); 2596 "mft record 0x%llx.", (long long)bit);
2595 err = PTR_ERR(m); 2597 err = PTR_ERR(m_tmp);
2596 /* Set the mft record itself not in use. */ 2598 /* Set the mft record itself not in use. */
2597 m->flags &= cpu_to_le16( 2599 m->flags &= cpu_to_le16(
2598 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2600 ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
2603 ntfs_unmap_page(page); 2605 ntfs_unmap_page(page);
2604 goto undo_mftbmp_alloc; 2606 goto undo_mftbmp_alloc;
2605 } 2607 }
2608 BUG_ON(m != m_tmp);
2606 /* 2609 /*
2607 * Make sure the allocated mft record is written out to disk. 2610 * Make sure the allocated mft record is written out to disk.
2608 * No need to set the inode dirty because the caller is going 2611 * No need to set the inode dirty because the caller is going
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 512806171bfa..29099a07b9fe 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2001,2002 Richard Russon 5 * Copyright (c) 2001,2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -30,7 +30,6 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h>
34#include <linux/bitmap.h> 33#include <linux/bitmap.h>
35 34
36#include "sysctl.h" 35#include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
445 444
446 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
447 446
448 lock_kernel();
449#ifndef NTFS_RW 447#ifndef NTFS_RW
450 /* For read-only compiled driver, enforce read-only flag. */ 448 /* For read-only compiled driver, enforce read-only flag. */
451 *flags |= MS_RDONLY; 449 *flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
469 if (NVolErrors(vol)) { 467 if (NVolErrors(vol)) {
470 ntfs_error(sb, "Volume has errors and is read-only%s", 468 ntfs_error(sb, "Volume has errors and is read-only%s",
471 es); 469 es);
472 unlock_kernel();
473 return -EROFS; 470 return -EROFS;
474 } 471 }
475 if (vol->vol_flags & VOLUME_IS_DIRTY) { 472 if (vol->vol_flags & VOLUME_IS_DIRTY) {
476 ntfs_error(sb, "Volume is dirty and read-only%s", es); 473 ntfs_error(sb, "Volume is dirty and read-only%s", es);
477 unlock_kernel();
478 return -EROFS; 474 return -EROFS;
479 } 475 }
480 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { 476 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
481 ntfs_error(sb, "Volume has been modified by chkdsk " 477 ntfs_error(sb, "Volume has been modified by chkdsk "
482 "and is read-only%s", es); 478 "and is read-only%s", es);
483 unlock_kernel();
484 return -EROFS; 479 return -EROFS;
485 } 480 }
486 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 481 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
488 "(0x%x) and is read-only%s", 483 "(0x%x) and is read-only%s",
489 (unsigned)le16_to_cpu(vol->vol_flags), 484 (unsigned)le16_to_cpu(vol->vol_flags),
490 es); 485 es);
491 unlock_kernel();
492 return -EROFS; 486 return -EROFS;
493 } 487 }
494 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 488 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
495 ntfs_error(sb, "Failed to set dirty bit in volume " 489 ntfs_error(sb, "Failed to set dirty bit in volume "
496 "information flags%s", es); 490 "information flags%s", es);
497 unlock_kernel();
498 return -EROFS; 491 return -EROFS;
499 } 492 }
500#if 0 493#if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
514 ntfs_error(sb, "Failed to empty journal $LogFile%s", 507 ntfs_error(sb, "Failed to empty journal $LogFile%s",
515 es); 508 es);
516 NVolSetErrors(vol); 509 NVolSetErrors(vol);
517 unlock_kernel();
518 return -EROFS; 510 return -EROFS;
519 } 511 }
520 if (!ntfs_mark_quotas_out_of_date(vol)) { 512 if (!ntfs_mark_quotas_out_of_date(vol)) {
521 ntfs_error(sb, "Failed to mark quotas out of date%s", 513 ntfs_error(sb, "Failed to mark quotas out of date%s",
522 es); 514 es);
523 NVolSetErrors(vol); 515 NVolSetErrors(vol);
524 unlock_kernel();
525 return -EROFS; 516 return -EROFS;
526 } 517 }
527 if (!ntfs_stamp_usnjrnl(vol)) { 518 if (!ntfs_stamp_usnjrnl(vol)) {
528 ntfs_error(sb, "Failed to stamp transation log " 519 ntfs_error(sb, "Failed to stamp transation log "
529 "($UsnJrnl)%s", es); 520 "($UsnJrnl)%s", es);
530 NVolSetErrors(vol); 521 NVolSetErrors(vol);
531 unlock_kernel();
532 return -EROFS; 522 return -EROFS;
533 } 523 }
534 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 524 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
544 534
545 // TODO: Deal with *flags. 535 // TODO: Deal with *flags.
546 536
547 if (!parse_options(vol, opt)) { 537 if (!parse_options(vol, opt))
548 unlock_kernel();
549 return -EINVAL; 538 return -EINVAL;
550 } 539
551 unlock_kernel();
552 ntfs_debug("Done."); 540 ntfs_debug("Done.");
553 return 0; 541 return 0;
554} 542}
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
2261 2249
2262 ntfs_debug("Entering."); 2250 ntfs_debug("Entering.");
2263 2251
2264 lock_kernel();
2265
2266#ifdef NTFS_RW 2252#ifdef NTFS_RW
2267 /* 2253 /*
2268 * Commit all inodes while they are still open in case some of them 2254 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
2433 2419
2434 sb->s_fs_info = NULL; 2420 sb->s_fs_info = NULL;
2435 kfree(vol); 2421 kfree(vol);
2436
2437 unlock_kernel();
2438} 2422}
2439 2423
2440/** 2424/**
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2772 init_rwsem(&vol->mftbmp_lock); 2756 init_rwsem(&vol->mftbmp_lock);
2773 init_rwsem(&vol->lcnbmp_lock); 2757 init_rwsem(&vol->lcnbmp_lock);
2774 2758
2775 unlock_kernel();
2776
2777 /* By default, enable sparse support. */ 2759 /* By default, enable sparse support. */
2778 NVolSetSparseEnabled(vol); 2760 NVolSetSparseEnabled(vol);
2779 2761
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2929 goto unl_upcase_iput_tmp_ino_err_out_now; 2911 goto unl_upcase_iput_tmp_ino_err_out_now;
2930 } 2912 }
2931 if ((sb->s_root = d_alloc_root(vol->root_ino))) { 2913 if ((sb->s_root = d_alloc_root(vol->root_ino))) {
2932 /* We increment i_count simulating an ntfs_iget(). */ 2914 /* We grab a reference, simulating an ntfs_iget(). */
2933 atomic_inc(&vol->root_ino->i_count); 2915 ihold(vol->root_ino);
2934 ntfs_debug("Exiting, status successful."); 2916 ntfs_debug("Exiting, status successful.");
2935 /* Release the default upcase if it has no users. */ 2917 /* Release the default upcase if it has no users. */
2936 mutex_lock(&ntfs_lock); 2918 mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2940 } 2922 }
2941 mutex_unlock(&ntfs_lock); 2923 mutex_unlock(&ntfs_lock);
2942 sb->s_export_op = &ntfs_export_ops; 2924 sb->s_export_op = &ntfs_export_ops;
2943 lock_kernel();
2944 lockdep_on(); 2925 lockdep_on();
2945 return 0; 2926 return 0;
2946 } 2927 }
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
3040 if (vol->mft_ino && vol->mft_ino != tmp_ino) 3021 if (vol->mft_ino && vol->mft_ino != tmp_ino)
3041 iput(vol->mft_ino); 3022 iput(vol->mft_ino);
3042 vol->mft_ino = NULL; 3023 vol->mft_ino = NULL;
3043 /*
3044 * This is needed to get ntfs_clear_extent_inode() called for each
3045 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
3046 * leak resources and B) a subsequent mount fails automatically due to
3047 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
3048 * method again... FIXME: Do we need to do this twice now because of
3049 * attribute inodes? I think not, so leave as is for now... (AIA)
3050 */
3051 if (invalidate_inodes(sb)) {
3052 ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
3053 "driver bug.");
3054 /* Copied from fs/super.c. I just love this message. (-; */
3055 printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
3056 "seconds. Have a nice day...\n");
3057 }
3058 /* Errors at this stage are irrelevant. */ 3024 /* Errors at this stage are irrelevant. */
3059err_out_now: 3025err_out_now:
3060 lock_kernel();
3061 sb->s_fs_info = NULL; 3026 sb->s_fs_info = NULL;
3062 kfree(vol); 3027 kfree(vol);
3063 ntfs_debug("Failed, returning -EINVAL."); 3028 ntfs_debug("Failed, returning -EINVAL.");
@@ -3094,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
3094/* Driver wide mutex. */ 3059/* Driver wide mutex. */
3095DEFINE_MUTEX(ntfs_lock); 3060DEFINE_MUTEX(ntfs_lock);
3096 3061
3097static int ntfs_get_sb(struct file_system_type *fs_type, 3062static struct dentry *ntfs_mount(struct file_system_type *fs_type,
3098 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3063 int flags, const char *dev_name, void *data)
3099{ 3064{
3100 return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, 3065 return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
3101 mnt);
3102} 3066}
3103 3067
3104static struct file_system_type ntfs_fs_type = { 3068static struct file_system_type ntfs_fs_type = {
3105 .owner = THIS_MODULE, 3069 .owner = THIS_MODULE,
3106 .name = "ntfs", 3070 .name = "ntfs",
3107 .get_sb = ntfs_get_sb, 3071 .mount = ntfs_mount,
3108 .kill_sb = kill_block_super, 3072 .kill_sb = kill_block_super,
3109 .fs_flags = FS_REQUIRES_DEV, 3073 .fs_flags = FS_REQUIRES_DEV,
3110}; 3074};
@@ -3229,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
3229 ntfs_sysctl(0); 3193 ntfs_sysctl(0);
3230} 3194}
3231 3195
3232MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); 3196MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3233MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); 3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
3234MODULE_VERSION(NTFS_VERSION); 3198MODULE_VERSION(NTFS_VERSION);
3235MODULE_LICENSE("GPL"); 3199MODULE_LICENSE("GPL");
3236#ifdef DEBUG 3200#ifdef DEBUG
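
Note: the hunks above move ntfs from the removed ->get_sb() callback to the newer ->mount() superblock API; the filesystem now returns the root dentry produced by mount_bdev() instead of filling in a vfsmount. A minimal sketch of the same shape for a hypothetical "examplefs" (every name here is invented for illustration; this is not ntfs code):

/*
 * Illustrative skeleton only: a hypothetical "examplefs" using the
 * same mount_bdev()-based ->mount() shape as the ntfs hunk above.
 */
#include <linux/fs.h>
#include <linux/module.h>

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* A real fill_super reads the on-disk superblock and sets
	 * sb->s_root; this stub just refuses the mount cleanly. */
	return -EINVAL;
}

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	/* mount_bdev() opens the device, creates or reuses the
	 * superblock, and returns the root dentry on success. */
	return mount_bdev(fs_type, flags, dev_name, data, examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};

static int __init examplefs_init(void)
{
	return register_filesystem(&examplefs_fs_type);
}

static void __exit examplefs_exit(void)
{
	unregister_filesystem(&examplefs_fs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_LICENSE("GPL");

kill_block_super() pairs with mount_bdev() for FS_REQUIRES_DEV filesystems, which is why the ntfs hunk can leave .kill_sb untouched.
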
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698e..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
1config OCFS2_FS 1config OCFS2_FS
2 tristate "OCFS2 file system support" 2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS 3 depends on NET && SYSFS && CONFIGFS_FS
4 select CONFIGFS_FS
5 select JBD2 4 select JBD2
6 select CRC32 5 select CRC32
7 select QUOTA 6 select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
51 50
52config OCFS2_FS_STATS 51config OCFS2_FS_STATS
53 bool "OCFS2 statistics" 52 bool "OCFS2 statistics"
54 depends on OCFS2_FS 53 depends on OCFS2_FS && DEBUG_FS
55 default y 54 default y
56 help 55 help
57 This option allows some fs statistics to be captured. Enabling 56 This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe1..704f6b1742f3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
291 return ret; 291 return ret;
292} 292}
293 293
294int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
295{ 295{
296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 296 struct ocfs2_super *osb;
297 struct buffer_head *di_bh = NULL; 297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl; 298 struct posix_acl *acl;
299 int ret = -EAGAIN; 299 int ret = -EAGAIN;
300 300
301 if (flags & IPERM_FLAG_RCU)
302 return -ECHILD;
303
304 osb = OCFS2_SB(inode->i_sb);
301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 305 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret; 306 return ret;
303 307
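
Context for the IPERM_FLAG_RCU check added above: the flag is set when the VFS is doing an RCU-mode path walk, where permission callbacks must not sleep. ocfs2_check_acl() may need cluster locks and buffer reads, so it returns -ECHILD to make the VFS fall back to ref-walk mode and call in again under conditions where blocking is allowed; the OCFS2_SB() dereference is simply deferred until after that early return.
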
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f05853..4fe7c9cf4bfb 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int, unsigned int);
30extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
32 struct buffer_head *, struct buffer_head *, 32 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d1..e4984e259cb6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
565 return ret; 565 return ret;
566} 566}
567 567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 568static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570 struct ocfs2_extent_block *eb); 569 struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle, 570static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5858 5857
5859 ocfs2_journal_dirty(handle, tl_bh); 5858 ocfs2_journal_dirty(handle, tl_bh);
5860 5859
5860 osb->truncated_clusters += num_clusters;
5861bail: 5861bail:
5862 mlog_exit(status); 5862 mlog_exit(status);
5863 return status; 5863 return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5929 i--; 5929 i--;
5930 } 5930 }
5931 5931
5932 osb->truncated_clusters = 0;
5933
5932bail: 5934bail:
5933 mlog_exit(status); 5935 mlog_exit(status);
5934 return status; 5936 return status;
@@ -7139,64 +7141,6 @@ bail:
7139} 7141}
7140 7142
7141/* 7143/*
7142 * Expects the inode to already be locked.
7143 */
7144int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7145 struct inode *inode,
7146 struct buffer_head *fe_bh,
7147 struct ocfs2_truncate_context **tc)
7148{
7149 int status;
7150 unsigned int new_i_clusters;
7151 struct ocfs2_dinode *fe;
7152 struct ocfs2_extent_block *eb;
7153 struct buffer_head *last_eb_bh = NULL;
7154
7155 mlog_entry_void();
7156
7157 *tc = NULL;
7158
7159 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7160 i_size_read(inode));
7161 fe = (struct ocfs2_dinode *) fe_bh->b_data;
7162
7163 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7164 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7165 (unsigned long long)le64_to_cpu(fe->i_size));
7166
7167 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7168 if (!(*tc)) {
7169 status = -ENOMEM;
7170 mlog_errno(status);
7171 goto bail;
7172 }
7173 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7174
7175 if (fe->id2.i_list.l_tree_depth) {
7176 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7177 le64_to_cpu(fe->i_last_eb_blk),
7178 &last_eb_bh);
7179 if (status < 0) {
7180 mlog_errno(status);
7181 goto bail;
7182 }
7183 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7184 }
7185
7186 (*tc)->tc_last_eb_bh = last_eb_bh;
7187
7188 status = 0;
7189bail:
7190 if (status < 0) {
7191 if (*tc)
7192 ocfs2_free_truncate_context(*tc);
7193 *tc = NULL;
7194 }
7195 mlog_exit_void();
7196 return status;
7197}
7198
7199/*
7200 * 'start' is inclusive, 'end' is not. 7144 * 'start' is inclusive, 'end' is not.
7201 */ 7145 */
7202int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7146int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
7270out: 7214out:
7271 return ret; 7215 return ret;
7272} 7216}
7273
7274static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7275{
7276 /*
7277 * The caller is responsible for completing deallocation
7278 * before freeing the context.
7279 */
7280 if (tc->tc_dealloc.c_first_suballocator != NULL)
7281 mlog(ML_NOTICE,
7282 "Truncate completion has non-empty dealloc context\n");
7283
7284 brelse(tc->tc_last_eb_bh);
7285
7286 kfree(tc);
7287}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b99..3bd08a03251c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
228 228
229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
230 u64 range_start, u64 range_end); 230 u64 range_start, u64 range_end);
231int ocfs2_prepare_truncate(struct ocfs2_super *osb,
232 struct inode *inode,
233 struct buffer_head *fe_bh,
234 struct ocfs2_truncate_context **tc);
235int ocfs2_commit_truncate(struct ocfs2_super *osb, 231int ocfs2_commit_truncate(struct ocfs2_super *osb,
236 struct inode *inode, 232 struct inode *inode,
237 struct buffer_head *di_bh); 233 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..1fbb0e20131b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
165 * ocfs2 never allocates in this function - the only time we 165 * ocfs2 never allocates in this function - the only time we
166 * need to use BH_New is when we're extending i_size on a file 166 * need to use BH_New is when we're extending i_size on a file
167 * system which doesn't support holes, in which case BH_New 167 * system which doesn't support holes, in which case BH_New
168 * allows block_prepare_write() to zero. 168 * allows __block_write_begin() to zero.
169 * 169 *
170 * If we see this on a sparse file system, then a truncate has 170 * If we see this on a sparse file system, then a truncate has
171 * raced us and removed the cluster. In this case, we clear 171 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
407 return ret; 407 return ret;
408} 408}
409 409
410/*
411 * This is called from ocfs2_write_zero_page() which has handled it's
412 * own cluster locking and has ensured allocation exists for those
413 * blocks to be written.
414 */
415int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
416 unsigned from, unsigned to)
417{
418 int ret;
419
420 ret = block_prepare_write(page, from, to, ocfs2_get_block);
421
422 return ret;
423}
424
425/* Taken from ext3. We don't necessarily need the full blown 410/* Taken from ext3. We don't necessarily need the full blown
426 * functionality yet, but IMHO it's better to cut and paste the whole 411 * functionality yet, but IMHO it's better to cut and paste the whole
427 * thing so we can avoid introducing our own bugs (and easily pick up 412 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -588,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
588 /* this io's submitter should not have unlocked this before we could */ 573 /* this io's submitter should not have unlocked this before we could */
589 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 574 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
590 575
576 if (ocfs2_iocb_is_sem_locked(iocb)) {
577 up_read(&inode->i_alloc_sem);
578 ocfs2_iocb_clear_sem_locked(iocb);
579 }
580
591 ocfs2_iocb_clear_rw_locked(iocb); 581 ocfs2_iocb_clear_rw_locked(iocb);
592 582
593 level = ocfs2_iocb_rw_locked_level(iocb); 583 level = ocfs2_iocb_rw_locked_level(iocb);
594 if (!level)
595 up_read(&inode->i_alloc_sem);
596 ocfs2_rw_unlock(inode, level); 584 ocfs2_rw_unlock(inode, level);
597 585
598 if (is_async) 586 if (is_async)
@@ -732,7 +720,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
732} 720}
733 721
734/* 722/*
735 * Some of this taken from block_prepare_write(). We already have our 723 * Some of this taken from __block_write_begin(). We already have our
736 * mapping by now though, and the entire write will be allocating or 724 * mapping by now though, and the entire write will be allocating or
737 * it won't, so not much need to use BH_New. 725 * it won't, so not much need to use BH_New.
738 * 726 *
@@ -883,8 +871,8 @@ struct ocfs2_write_ctxt {
883 * out in so that future reads from that region will get 871 * out in so that future reads from that region will get
 884 * zeros. 872 * zeros.
885 */ 873 */
886 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
887 unsigned int w_num_pages; 874 unsigned int w_num_pages;
875 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
888 struct page *w_target_page; 876 struct page *w_target_page;
889 877
890 /* 878 /*
@@ -1642,13 +1630,51 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1642 return ret; 1630 return ret;
1643} 1631}
1644 1632
1645int ocfs2_write_begin_nolock(struct address_space *mapping, 1633/*
1634 * Try to flush truncate logs if we can free enough clusters from it.
1635 * As for return value, "< 0" means error, "0" no space and "1" means
1636 * we have freed enough spaces and let the caller try to allocate again.
1637 */
1638static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1639 unsigned int needed)
1640{
1641 tid_t target;
1642 int ret = 0;
1643 unsigned int truncated_clusters;
1644
1645 mutex_lock(&osb->osb_tl_inode->i_mutex);
1646 truncated_clusters = osb->truncated_clusters;
1647 mutex_unlock(&osb->osb_tl_inode->i_mutex);
1648
1649 /*
1650 * Check whether we can succeed in allocating if we free
1651 * the truncate log.
1652 */
1653 if (truncated_clusters < needed)
1654 goto out;
1655
1656 ret = ocfs2_flush_truncate_log(osb);
1657 if (ret) {
1658 mlog_errno(ret);
1659 goto out;
1660 }
1661
1662 if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1663 jbd2_log_wait_commit(osb->journal->j_journal, target);
1664 ret = 1;
1665 }
1666out:
1667 return ret;
1668}
1669
1670int ocfs2_write_begin_nolock(struct file *filp,
1671 struct address_space *mapping,
1646 loff_t pos, unsigned len, unsigned flags, 1672 loff_t pos, unsigned len, unsigned flags,
1647 struct page **pagep, void **fsdata, 1673 struct page **pagep, void **fsdata,
1648 struct buffer_head *di_bh, struct page *mmap_page) 1674 struct buffer_head *di_bh, struct page *mmap_page)
1649{ 1675{
1650 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; 1676 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1651 unsigned int clusters_to_alloc, extents_to_split; 1677 unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1652 struct ocfs2_write_ctxt *wc; 1678 struct ocfs2_write_ctxt *wc;
1653 struct inode *inode = mapping->host; 1679 struct inode *inode = mapping->host;
1654 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1657,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1657 struct ocfs2_alloc_context *meta_ac = NULL; 1683 struct ocfs2_alloc_context *meta_ac = NULL;
1658 handle_t *handle; 1684 handle_t *handle;
1659 struct ocfs2_extent_tree et; 1685 struct ocfs2_extent_tree et;
1686 int try_free = 1, ret1;
1660 1687
1688try_again:
1661 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1689 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1662 if (ret) { 1690 if (ret) {
1663 mlog_errno(ret); 1691 mlog_errno(ret);
@@ -1692,7 +1720,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1692 mlog_errno(ret); 1720 mlog_errno(ret);
1693 goto out; 1721 goto out;
1694 } else if (ret == 1) { 1722 } else if (ret == 1) {
1695 ret = ocfs2_refcount_cow(inode, di_bh, 1723 clusters_need = wc->w_clen;
1724 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1696 wc->w_cpos, wc->w_clen, UINT_MAX); 1725 wc->w_cpos, wc->w_clen, UINT_MAX);
1697 if (ret) { 1726 if (ret) {
1698 mlog_errno(ret); 1727 mlog_errno(ret);
@@ -1706,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1706 mlog_errno(ret); 1735 mlog_errno(ret);
1707 goto out; 1736 goto out;
1708 } 1737 }
1738 clusters_need += clusters_to_alloc;
1709 1739
1710 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1711 1741
@@ -1828,6 +1858,22 @@ out:
1828 ocfs2_free_alloc_context(data_ac); 1858 ocfs2_free_alloc_context(data_ac);
1829 if (meta_ac) 1859 if (meta_ac)
1830 ocfs2_free_alloc_context(meta_ac); 1860 ocfs2_free_alloc_context(meta_ac);
1861
1862 if (ret == -ENOSPC && try_free) {
1863 /*
1864 * Try to free some truncate log so that we can have enough
1865 * clusters to allocate.
1866 */
1867 try_free = 0;
1868
1869 ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1870 if (ret1 == 1)
1871 goto try_again;
1872
1873 if (ret1 < 0)
1874 mlog_errno(ret1);
1875 }
1876
1831 return ret; 1877 return ret;
1832} 1878}
1833 1879
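
The retry added above is bounded: on -ENOSPC the code clears try_free, flushes the truncate log at most once, and jumps back to try_again only when ocfs2_try_to_free_truncate_log() reports that enough clusters were freed. A self-contained userspace sketch of the same shape, with invented allocate()/reclaim() helpers standing in for the ocfs2 calls:

/* Userspace analogue of the retry-once-on-ENOSPC pattern above. */
#include <errno.h>
#include <stdio.h>

static int space = 0;                   /* free clusters (toy state) */

static int allocate(unsigned int need)
{
	if (space < (int)need)
		return -ENOSPC;
	space -= need;
	return 0;
}

static int reclaim(unsigned int need)
{
	/* Pretend flushing a log frees enough; return 1 like
	 * ocfs2_try_to_free_truncate_log() when a retry may succeed. */
	space += need;
	return 1;
}

int main(void)
{
	unsigned int need = 8;
	int try_free = 1, ret;

try_again:
	ret = allocate(need);
	if (ret == -ENOSPC && try_free) {
		try_free = 0;           /* retry at most once */
		if (reclaim(need) == 1)
			goto try_again;
	}
	printf("allocate: %d\n", ret);  /* 0 on the retried attempt */
	return 0;
}
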
@@ -1854,7 +1900,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1854 */ 1900 */
1855 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1901 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1856 1902
1857 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1903 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
1858 fsdata, di_bh, NULL); 1904 fsdata, di_bh, NULL);
1859 if (ret) { 1905 if (ret) {
1860 mlog_errno(ret); 1906 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..eceb456037c1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
22#ifndef OCFS2_AOPS_H 22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H 23#define OCFS2_AOPS_H
24 24
25int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
26 unsigned from, unsigned to);
27
28handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 25handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page, 26 struct page *page,
30 unsigned from, 27 unsigned from,
@@ -48,7 +45,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
48 loff_t pos, unsigned len, unsigned copied, 45 loff_t pos, unsigned len, unsigned copied,
49 struct page *page, void *fsdata); 46 struct page *page, void *fsdata);
50 47
51int ocfs2_write_begin_nolock(struct address_space *mapping, 48int ocfs2_write_begin_nolock(struct file *filp,
49 struct address_space *mapping,
52 loff_t pos, unsigned len, unsigned flags, 50 loff_t pos, unsigned len, unsigned flags,
53 struct page **pagep, void **fsdata, 51 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 52 struct buffer_head *di_bh, struct page *mmap_page);
@@ -70,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
70 else 68 else
71 clear_bit(1, (unsigned long *)&iocb->private); 69 clear_bit(1, (unsigned long *)&iocb->private);
72} 70}
71
72/*
 73 * Using a named enum representing lock types in terms of the bit number
 74 * stored in iocb->private, which is used for communication between
75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
76 */
77enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_NUM_LOCKS
82};
83
73#define ocfs2_iocb_clear_rw_locked(iocb) \ 84#define ocfs2_iocb_clear_rw_locked(iocb) \
74 clear_bit(0, (unsigned long *)&iocb->private) 85 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
75#define ocfs2_iocb_rw_locked_level(iocb) \ 86#define ocfs2_iocb_rw_locked_level(iocb) \
76 test_bit(1, (unsigned long *)&iocb->private) 87 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
88#define ocfs2_iocb_set_sem_locked(iocb) \
89 set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
90#define ocfs2_iocb_clear_sem_locked(iocb) \
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
77#endif /* OCFS2_FILE_H */ 94#endif /* OCFS2_FILE_H */
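
The enum above replaces the magic 0/1 bit indices with names and adds a third bit (OCFS2_IOCB_SEM) recording whether the submitter took i_alloc_sem, so ocfs2_dio_end_io() can drop it exactly once. A userspace sketch of the same bookkeeping, with invented LOCK_* macros playing the role of the kernel's set_bit()/clear_bit()/test_bit():

#include <stdio.h>

enum iocb_lock_bits {                   /* mirrors enum ocfs2_iocb_lock_bits */
	IOCB_RW_LOCK = 0,
	IOCB_RW_LOCK_LEVEL,
	IOCB_SEM,
};

#define LOCK_SET(w, b)    ((w) |=  (1UL << (b)))
#define LOCK_CLEAR(w, b)  ((w) &= ~(1UL << (b)))
#define LOCK_TEST(w, b)   (!!((w) & (1UL << (b))))

int main(void)
{
	unsigned long priv = 0;         /* plays the role of iocb->private */

	LOCK_SET(priv, IOCB_SEM);       /* taken before direct I/O is issued */
	if (LOCK_TEST(priv, IOCB_SEM)) {
		/* end_io path: release the semaphore exactly once */
		LOCK_CLEAR(priv, IOCB_SEM);
	}
	printf("sem bit now %d\n", LOCK_TEST(priv, IOCB_SEM)); /* 0 */
	return 0;
}
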
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..b108e863d8f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,53 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
62static LIST_HEAD(o2hb_node_events); 62static LIST_HEAD(o2hb_node_events);
63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
64 64
65/*
66 * In global heartbeat, we maintain a series of region bitmaps.
67 * - o2hb_region_bitmap allows us to limit the region number to max region.
68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
70 * heartbeat on it.
71 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
72 */
73static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
74static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
75static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
76static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
77
78#define O2HB_DB_TYPE_LIVENODES 0
79#define O2HB_DB_TYPE_LIVEREGIONS 1
80#define O2HB_DB_TYPE_QUORUMREGIONS 2
81#define O2HB_DB_TYPE_FAILEDREGIONS 3
82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85#define O2HB_DB_TYPE_REGION_PINNED 7
86struct o2hb_debug_buf {
87 int db_type;
88 int db_size;
89 int db_len;
90 void *db_data;
91};
92
93static struct o2hb_debug_buf *o2hb_db_livenodes;
94static struct o2hb_debug_buf *o2hb_db_liveregions;
95static struct o2hb_debug_buf *o2hb_db_quorumregions;
96static struct o2hb_debug_buf *o2hb_db_failedregions;
97
65#define O2HB_DEBUG_DIR "o2hb" 98#define O2HB_DEBUG_DIR "o2hb"
66#define O2HB_DEBUG_LIVENODES "livenodes" 99#define O2HB_DEBUG_LIVENODES "livenodes"
100#define O2HB_DEBUG_LIVEREGIONS "live_regions"
101#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
102#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
103#define O2HB_DEBUG_REGION_NUMBER "num"
104#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105#define O2HB_DEBUG_REGION_PINNED "pinned"
106
67static struct dentry *o2hb_debug_dir; 107static struct dentry *o2hb_debug_dir;
68static struct dentry *o2hb_debug_livenodes; 108static struct dentry *o2hb_debug_livenodes;
109static struct dentry *o2hb_debug_liveregions;
110static struct dentry *o2hb_debug_quorumregions;
111static struct dentry *o2hb_debug_failedregions;
69 112
70static LIST_HEAD(o2hb_all_regions); 113static LIST_HEAD(o2hb_all_regions);
71 114
@@ -77,7 +120,46 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
77 120
78#define O2HB_DEFAULT_BLOCK_BITS 9 121#define O2HB_DEFAULT_BLOCK_BITS 9
79 122
123enum o2hb_heartbeat_modes {
124 O2HB_HEARTBEAT_LOCAL = 0,
125 O2HB_HEARTBEAT_GLOBAL,
126 O2HB_HEARTBEAT_NUM_MODES,
127};
128
129char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
130 "local", /* O2HB_HEARTBEAT_LOCAL */
131 "global", /* O2HB_HEARTBEAT_GLOBAL */
132};
133
80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 134unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
135unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
136
137/*
138 * o2hb_dependent_users tracks the number of registered callbacks that depend
139 * on heartbeat. o2net and o2dlm are two entities that register this callback.
140 * However only o2dlm depends on the heartbeat. It does not want the heartbeat
141 * to stop while a dlm domain is still active.
142 */
143unsigned int o2hb_dependent_users;
144
145/*
146 * In global heartbeat mode, all regions are pinned if there are one or more
147 * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
148 * regions are unpinned if the region count exceeds the cut off or the number
149 * of dependent users falls to zero.
150 */
151#define O2HB_PIN_CUT_OFF 3
152
153/*
154 * In local heartbeat mode, we assume the dlm domain name to be the same as
155 * region uuid. This is true for domains created for the file system but not
156 * necessarily true for userdlm domains. This is a known limitation.
157 *
158 * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
159 * works for both file system and userdlm domains.
160 */
161static int o2hb_region_pin(const char *region_uuid);
162static void o2hb_region_unpin(const char *region_uuid);
81 163
82/* Only sets a new threshold if there are no active regions. 164/* Only sets a new threshold if there are no active regions.
83 * 165 *
@@ -94,6 +176,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
94 } 176 }
95} 177}
96 178
179static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
180{
181 int ret = -1;
182
183 if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
184 spin_lock(&o2hb_live_lock);
185 if (list_empty(&o2hb_all_regions)) {
186 o2hb_heartbeat_mode = hb_mode;
187 ret = 0;
188 }
189 spin_unlock(&o2hb_live_lock);
190 }
191
192 return ret;
193}
194
97struct o2hb_node_event { 195struct o2hb_node_event {
98 struct list_head hn_item; 196 struct list_head hn_item;
99 enum o2hb_callback_type hn_event_type; 197 enum o2hb_callback_type hn_event_type;
@@ -117,7 +215,9 @@ struct o2hb_region {
117 struct config_item hr_item; 215 struct config_item hr_item;
118 216
119 struct list_head hr_all_item; 217 struct list_head hr_all_item;
120 unsigned hr_unclean_stop:1; 218 unsigned hr_unclean_stop:1,
219 hr_item_pinned:1,
220 hr_item_dropped:1;
121 221
122 /* protected by the hr_callback_sem */ 222 /* protected by the hr_callback_sem */
123 struct task_struct *hr_task; 223 struct task_struct *hr_task;
@@ -135,6 +235,20 @@ struct o2hb_region {
135 struct block_device *hr_bdev; 235 struct block_device *hr_bdev;
136 struct o2hb_disk_slot *hr_slots; 236 struct o2hb_disk_slot *hr_slots;
137 237
238 /* live node map of this region */
239 unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
240 unsigned int hr_region_num;
241
242 struct dentry *hr_debug_dir;
243 struct dentry *hr_debug_livenodes;
244 struct dentry *hr_debug_regnum;
245 struct dentry *hr_debug_elapsed_time;
246 struct dentry *hr_debug_pinned;
247 struct o2hb_debug_buf *hr_db_livenodes;
248 struct o2hb_debug_buf *hr_db_regnum;
249 struct o2hb_debug_buf *hr_db_elapsed_time;
250 struct o2hb_debug_buf *hr_db_pinned;
251
138 /* let the person setting up hb wait for it to return until it 252 /* let the person setting up hb wait for it to return until it
139 * has reached a 'steady' state. This will be fixed when we have 253 * has reached a 'steady' state. This will be fixed when we have
140 * a more complete api that doesn't lead to this sort of fragility. */ 254 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +277,19 @@ struct o2hb_bio_wait_ctxt {
163 int wc_error; 277 int wc_error;
164}; 278};
165 279
280static int o2hb_pop_count(void *map, int count)
281{
282 int i = -1, pop = 0;
283
284 while ((i = find_next_bit(map, count, i + 1)) < count)
285 pop++;
286 return pop;
287}
288
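
o2hb_pop_count() above is an open-coded population count: each find_next_bit() hit bumps the counter, which is the same number bitmap_weight() would return. A quick userspace check of the idea, using plain shifts in place of the kernel bitmap helpers:

#include <stdio.h>

/* Count set bits in a word the same way the loop above does:
 * visit every bit position and count the ones that are set. */
static int pop_count(unsigned long map, int nbits)
{
	int pop = 0;
	for (int i = 0; i < nbits; i++)
		if (map & (1UL << i))
			pop++;
	return pop;
}

int main(void)
{
	printf("%d\n", pop_count(0xb, 32));  /* bits 0, 1, 3 set: prints 3 */
	return 0;
}
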
166static void o2hb_write_timeout(struct work_struct *work) 289static void o2hb_write_timeout(struct work_struct *work)
167{ 290{
291 int failed, quorum;
292 unsigned long flags;
168 struct o2hb_region *reg = 293 struct o2hb_region *reg =
169 container_of(work, struct o2hb_region, 294 container_of(work, struct o2hb_region,
170 hr_write_timeout_work.work); 295 hr_write_timeout_work.work);
@@ -172,6 +297,28 @@ static void o2hb_write_timeout(struct work_struct *work)
172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 297 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
173 "milliseconds\n", reg->hr_dev_name, 298 "milliseconds\n", reg->hr_dev_name,
174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 299 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
300
301 if (o2hb_global_heartbeat_active()) {
302 spin_lock_irqsave(&o2hb_live_lock, flags);
303 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
304 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
305 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
306 O2NM_MAX_REGIONS);
307 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
308 O2NM_MAX_REGIONS);
309 spin_unlock_irqrestore(&o2hb_live_lock, flags);
310
311 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
312 quorum, failed);
313
314 /*
315 * Fence if the number of failed regions >= half the number
316 * of quorum regions
317 */
318 if ((failed << 1) < quorum)
319 return;
320 }
321
175 o2quo_disk_timeout(); 322 o2quo_disk_timeout();
176} 323}
177 324
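
The early return above encodes the comment's condition as its negation: o2quo_disk_timeout() (which can fence the node) is only reached when failed << 1 is not less than quorum, i.e. when at least half of the quorum regions have hit write timeouts. A worked check of that comparison (should_fence() is an invented helper, not kernel code):

#include <stdio.h>

static int should_fence(int failed, int quorum)
{
	return !((failed << 1) < quorum);   /* same test as the hunk above */
}

int main(void)
{
	printf("%d\n", should_fence(1, 4)); /* 0: 2 < 4, keep running */
	printf("%d\n", should_fence(2, 4)); /* 1: 4 is not < 4, fence */
	return 0;
}
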
@@ -180,6 +327,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS); 328 O2HB_MAX_WRITE_TIMEOUT_MS);
182 329
330 if (o2hb_global_heartbeat_active()) {
331 spin_lock(&o2hb_live_lock);
332 clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
333 spin_unlock(&o2hb_live_lock);
334 }
183 cancel_delayed_work(&reg->hr_write_timeout_work); 335 cancel_delayed_work(&reg->hr_write_timeout_work);
184 reg->hr_last_timeout_start = jiffies; 336 reg->hr_last_timeout_start = jiffies;
185 schedule_delayed_work(&reg->hr_write_timeout_work, 337 schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -188,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
188 340
189static void o2hb_disarm_write_timeout(struct o2hb_region *reg) 341static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
190{ 342{
191 cancel_delayed_work(&reg->hr_write_timeout_work); 343 cancel_delayed_work_sync(&reg->hr_write_timeout_work);
192 flush_scheduled_work();
193} 344}
194 345
195static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) 346static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -513,6 +664,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
513{ 664{
514 assert_spin_locked(&o2hb_live_lock); 665 assert_spin_locked(&o2hb_live_lock);
515 666
667 BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
668
516 event->hn_event_type = type; 669 event->hn_event_type = type;
517 event->hn_node = node; 670 event->hn_node = node;
518 event->hn_node_num = node_num; 671 event->hn_node_num = node_num;
@@ -554,6 +707,43 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
554 o2nm_node_put(node); 707 o2nm_node_put(node);
555} 708}
556 709
710static void o2hb_set_quorum_device(struct o2hb_region *reg,
711 struct o2hb_disk_slot *slot)
712{
713 assert_spin_locked(&o2hb_live_lock);
714
715 if (!o2hb_global_heartbeat_active())
716 return;
717
718 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
719 return;
720
721 /*
722 * A region can be added to the quorum only when it sees all
723 * live nodes heartbeat on it. In other words, the region has been
724 * added to all nodes.
725 */
726 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
727 sizeof(o2hb_live_node_bitmap)))
728 return;
729
730 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
731 return;
732
733 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
734 config_item_name(&reg->hr_item));
735
736 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
737
738 /*
739 * If global heartbeat active, unpin all regions if the
740 * region count > CUT_OFF
741 */
742 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
743 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
744 o2hb_region_unpin(NULL);
745}
746
557static int o2hb_check_slot(struct o2hb_region *reg, 747static int o2hb_check_slot(struct o2hb_region *reg,
558 struct o2hb_disk_slot *slot) 748 struct o2hb_disk_slot *slot)
559{ 749{
@@ -565,14 +755,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
565 u64 cputime; 755 u64 cputime;
566 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 756 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
567 unsigned int slot_dead_ms; 757 unsigned int slot_dead_ms;
758 int tmp;
568 759
569 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 760 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
570 761
571 /* Is this correct? Do we assume that the node doesn't exist 762 /*
572 * if we're not configured for him? */ 763 * If a node is no longer configured but is still in the livemap, we
764 * may need to clear that bit from the livemap.
765 */
573 node = o2nm_get_node_by_num(slot->ds_node_num); 766 node = o2nm_get_node_by_num(slot->ds_node_num);
574 if (!node) 767 if (!node) {
575 return 0; 768 spin_lock(&o2hb_live_lock);
769 tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
770 spin_unlock(&o2hb_live_lock);
771 if (!tmp)
772 return 0;
773 }
576 774
577 if (!o2hb_verify_crc(reg, hb_block)) { 775 if (!o2hb_verify_crc(reg, hb_block)) {
578 /* all paths from here will drop o2hb_live_lock for 776 /* all paths from here will drop o2hb_live_lock for
@@ -639,8 +837,12 @@ fire_callbacks:
639 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", 837 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
640 slot->ds_node_num, (long long)slot->ds_last_generation); 838 slot->ds_node_num, (long long)slot->ds_last_generation);
641 839
840 set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
841
642 /* first on the list generates a callback */ 842 /* first on the list generates a callback */
643 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 843 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
844 mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
845 "bitmap\n", slot->ds_node_num);
644 set_bit(slot->ds_node_num, o2hb_live_node_bitmap); 846 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
645 847
646 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, 848 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +886,18 @@ fire_callbacks:
684 mlog(ML_HEARTBEAT, "Node %d left my region\n", 886 mlog(ML_HEARTBEAT, "Node %d left my region\n",
685 slot->ds_node_num); 887 slot->ds_node_num);
686 888
889 clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
890
687 /* last off the live_slot generates a callback */ 891 /* last off the live_slot generates a callback */
688 list_del_init(&slot->ds_live_item); 892 list_del_init(&slot->ds_live_item);
689 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 893 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
894 mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
895 "nodes bitmap\n", slot->ds_node_num);
690 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); 896 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
691 897
692 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 898 /* node can be null */
693 slot->ds_node_num); 899 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
900 node, slot->ds_node_num);
694 901
695 changed = 1; 902 changed = 1;
696 } 903 }
@@ -706,11 +913,14 @@ fire_callbacks:
706 slot->ds_equal_samples = 0; 913 slot->ds_equal_samples = 0;
707 } 914 }
708out: 915out:
916 o2hb_set_quorum_device(reg, slot);
917
709 spin_unlock(&o2hb_live_lock); 918 spin_unlock(&o2hb_live_lock);
710 919
711 o2hb_run_event_list(&event); 920 o2hb_run_event_list(&event);
712 921
713 o2nm_node_put(node); 922 if (node)
923 o2nm_node_put(node);
714 return changed; 924 return changed;
715} 925}
716 926
@@ -737,6 +947,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
737{ 947{
738 int i, ret, highest_node, change = 0; 948 int i, ret, highest_node, change = 0;
739 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 949 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
950 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
740 struct o2hb_bio_wait_ctxt write_wc; 951 struct o2hb_bio_wait_ctxt write_wc;
741 952
742 ret = o2nm_configured_node_map(configured_nodes, 953 ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +957,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
746 return ret; 957 return ret;
747 } 958 }
748 959
960 /*
961 * If a node is not configured but is in the livemap, we still need
962 * to read the slot so as to be able to remove it from the livemap.
963 */
964 o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
965 i = -1;
966 while ((i = find_next_bit(live_node_bitmap,
967 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
968 set_bit(i, configured_nodes);
969 }
970
749 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 971 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
750 if (highest_node >= O2NM_MAX_NODES) { 972 if (highest_node >= O2NM_MAX_NODES) {
751 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 973 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -860,6 +1082,9 @@ static int o2hb_thread(void *data)
860 1082
861 set_user_nice(current, -20); 1083 set_user_nice(current, -20);
862 1084
1085 /* Pin node */
1086 o2nm_depend_this_node();
1087
863 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1088 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
864 /* We track the time spent inside 1089 /* We track the time spent inside
865 * o2hb_do_disk_heartbeat so that we avoid more than 1090 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -909,6 +1134,9 @@ static int o2hb_thread(void *data)
909 mlog_errno(ret); 1134 mlog_errno(ret);
910 } 1135 }
911 1136
1137 /* Unpin node */
1138 o2nm_undepend_this_node();
1139
912 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1140 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
913 1141
914 return 0; 1142 return 0;
@@ -917,21 +1145,65 @@ static int o2hb_thread(void *data)
917#ifdef CONFIG_DEBUG_FS 1145#ifdef CONFIG_DEBUG_FS
918static int o2hb_debug_open(struct inode *inode, struct file *file) 1146static int o2hb_debug_open(struct inode *inode, struct file *file)
919{ 1147{
1148 struct o2hb_debug_buf *db = inode->i_private;
1149 struct o2hb_region *reg;
920 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1150 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
921 char *buf = NULL; 1151 char *buf = NULL;
922 int i = -1; 1152 int i = -1;
923 int out = 0; 1153 int out = 0;
924 1154
1155 /* max_nodes should be the largest bitmap we pass here */
1156 BUG_ON(sizeof(map) < db->db_size);
1157
925 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 1158 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
926 if (!buf) 1159 if (!buf)
927 goto bail; 1160 goto bail;
928 1161
929 o2hb_fill_node_map(map, sizeof(map)); 1162 switch (db->db_type) {
1163 case O2HB_DB_TYPE_LIVENODES:
1164 case O2HB_DB_TYPE_LIVEREGIONS:
1165 case O2HB_DB_TYPE_QUORUMREGIONS:
1166 case O2HB_DB_TYPE_FAILEDREGIONS:
1167 spin_lock(&o2hb_live_lock);
1168 memcpy(map, db->db_data, db->db_size);
1169 spin_unlock(&o2hb_live_lock);
1170 break;
930 1171
931 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) 1172 case O2HB_DB_TYPE_REGION_LIVENODES:
1173 spin_lock(&o2hb_live_lock);
1174 reg = (struct o2hb_region *)db->db_data;
1175 memcpy(map, reg->hr_live_node_bitmap, db->db_size);
1176 spin_unlock(&o2hb_live_lock);
1177 break;
1178
1179 case O2HB_DB_TYPE_REGION_NUMBER:
1180 reg = (struct o2hb_region *)db->db_data;
1181 out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
1182 reg->hr_region_num);
1183 goto done;
1184
1185 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1186 reg = (struct o2hb_region *)db->db_data;
1187 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1188 jiffies_to_msecs(jiffies -
1189 reg->hr_last_timeout_start));
1190 goto done;
1191
1192 case O2HB_DB_TYPE_REGION_PINNED:
1193 reg = (struct o2hb_region *)db->db_data;
1194 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1195 !!reg->hr_item_pinned);
1196 goto done;
1197
1198 default:
1199 goto done;
1200 }
1201
1202 while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
932 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); 1203 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
933 out += snprintf(buf + out, PAGE_SIZE - out, "\n"); 1204 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
934 1205
1206done:
935 i_size_write(inode, out); 1207 i_size_write(inode, out);
936 1208
937 file->private_data = buf; 1209 file->private_data = buf;
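
The reworked open above snapshots the requested bitmap under o2hb_live_lock (or formats a scalar directly), builds the whole text into one page-sized buffer, and parks it in file->private_data for later reads. A userspace sketch of just the snprintf() offset bookkeeping the formatting cases rely on (the bit values are made up):

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	static char buf[PAGE_SIZE];
	int out = 0;
	int bits[] = { 0, 3, 7 };       /* pretend these bits were set */

	/* Append each item, advancing the running offset like the
	 * debugfs open routine above does. */
	for (unsigned i = 0; i < sizeof(bits) / sizeof(bits[0]); i++)
		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", bits[i]);
	out += snprintf(buf + out, PAGE_SIZE - out, "\n");

	fputs(buf, stdout);             /* "0 3 7" then a newline */
	return 0;
}
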
@@ -978,10 +1250,104 @@ static const struct file_operations o2hb_debug_fops = {
978 1250
979void o2hb_exit(void) 1251void o2hb_exit(void)
980{ 1252{
981 if (o2hb_debug_livenodes) 1253 kfree(o2hb_db_livenodes);
982 debugfs_remove(o2hb_debug_livenodes); 1254 kfree(o2hb_db_liveregions);
983 if (o2hb_debug_dir) 1255 kfree(o2hb_db_quorumregions);
984 debugfs_remove(o2hb_debug_dir); 1256 kfree(o2hb_db_failedregions);
1257 debugfs_remove(o2hb_debug_failedregions);
1258 debugfs_remove(o2hb_debug_quorumregions);
1259 debugfs_remove(o2hb_debug_liveregions);
1260 debugfs_remove(o2hb_debug_livenodes);
1261 debugfs_remove(o2hb_debug_dir);
1262}
1263
1264static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
1265 struct o2hb_debug_buf **db, int db_len,
1266 int type, int size, int len, void *data)
1267{
1268 *db = kmalloc(db_len, GFP_KERNEL);
1269 if (!*db)
1270 return NULL;
1271
1272 (*db)->db_type = type;
1273 (*db)->db_size = size;
1274 (*db)->db_len = len;
1275 (*db)->db_data = data;
1276
1277 return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
1278 &o2hb_debug_fops);
1279}
1280
1281static int o2hb_debug_init(void)
1282{
1283 int ret = -ENOMEM;
1284
1285 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1286 if (!o2hb_debug_dir) {
1287 mlog_errno(ret);
1288 goto bail;
1289 }
1290
1291 o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1292 o2hb_debug_dir,
1293 &o2hb_db_livenodes,
1294 sizeof(*o2hb_db_livenodes),
1295 O2HB_DB_TYPE_LIVENODES,
1296 sizeof(o2hb_live_node_bitmap),
1297 O2NM_MAX_NODES,
1298 o2hb_live_node_bitmap);
1299 if (!o2hb_debug_livenodes) {
1300 mlog_errno(ret);
1301 goto bail;
1302 }
1303
1304 o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
1305 o2hb_debug_dir,
1306 &o2hb_db_liveregions,
1307 sizeof(*o2hb_db_liveregions),
1308 O2HB_DB_TYPE_LIVEREGIONS,
1309 sizeof(o2hb_live_region_bitmap),
1310 O2NM_MAX_REGIONS,
1311 o2hb_live_region_bitmap);
1312 if (!o2hb_debug_liveregions) {
1313 mlog_errno(ret);
1314 goto bail;
1315 }
1316
1317 o2hb_debug_quorumregions =
1318 o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
1319 o2hb_debug_dir,
1320 &o2hb_db_quorumregions,
1321 sizeof(*o2hb_db_quorumregions),
1322 O2HB_DB_TYPE_QUORUMREGIONS,
1323 sizeof(o2hb_quorum_region_bitmap),
1324 O2NM_MAX_REGIONS,
1325 o2hb_quorum_region_bitmap);
1326 if (!o2hb_debug_quorumregions) {
1327 mlog_errno(ret);
1328 goto bail;
1329 }
1330
1331 o2hb_debug_failedregions =
1332 o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
1333 o2hb_debug_dir,
1334 &o2hb_db_failedregions,
1335 sizeof(*o2hb_db_failedregions),
1336 O2HB_DB_TYPE_FAILEDREGIONS,
1337 sizeof(o2hb_failed_region_bitmap),
1338 O2NM_MAX_REGIONS,
1339 o2hb_failed_region_bitmap);
1340 if (!o2hb_debug_failedregions) {
1341 mlog_errno(ret);
1342 goto bail;
1343 }
1344
1345 ret = 0;
1346bail:
1347 if (ret)
1348 o2hb_exit();
1349
1350 return ret;
985} 1351}
986 1352
987int o2hb_init(void) 1353int o2hb_init(void)
@@ -997,24 +1363,14 @@ int o2hb_init(void)
997 INIT_LIST_HEAD(&o2hb_node_events); 1363 INIT_LIST_HEAD(&o2hb_node_events);
998 1364
999 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 1365 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
1366 memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
1367 memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
1368 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1369 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1000 1370
1001 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1371 o2hb_dependent_users = 0;
1002 if (!o2hb_debug_dir) {
1003 mlog_errno(-ENOMEM);
1004 return -ENOMEM;
1005 }
1006 1372
1007 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, 1373 return o2hb_debug_init();
1008 S_IFREG|S_IRUSR,
1009 o2hb_debug_dir, NULL,
1010 &o2hb_debug_fops);
1011 if (!o2hb_debug_livenodes) {
1012 mlog_errno(-ENOMEM);
1013 debugfs_remove(o2hb_debug_dir);
1014 return -ENOMEM;
1015 }
1016
1017 return 0;
1018} 1374}
1019 1375
1020/* if we're already in a callback then we're already serialized by the sem */ 1376/* if we're already in a callback then we're already serialized by the sem */
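
o2hb_init() now delegates all debugfs setup to o2hb_debug_init(), whose bail path simply calls o2hb_exit(); that is safe because debugfs_remove() and kfree() both tolerate NULL, so one teardown routine covers every partial-failure state. A userspace sketch of the same create-all-or-tear-down-everything shape (demo_init()/demo_exit() are invented names):

#include <stdlib.h>

static void *a, *b;

static void demo_exit(void)
{
	/* free(NULL) is a no-op, like debugfs_remove(NULL)/kfree(NULL),
	 * so this is safe no matter how far demo_init() got. */
	free(b);
	free(a);
}

static int demo_init(void)
{
	a = malloc(16);
	if (!a)
		goto bail;
	b = malloc(16);
	if (!b)
		goto bail;
	return 0;
bail:
	demo_exit();
	return -1;
}

int main(void)
{
	return demo_init();
}
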
@@ -1078,6 +1434,14 @@ static void o2hb_region_release(struct config_item *item)
1078 if (reg->hr_slots) 1434 if (reg->hr_slots)
1079 kfree(reg->hr_slots); 1435 kfree(reg->hr_slots);
1080 1436
1437 kfree(reg->hr_db_regnum);
1438 kfree(reg->hr_db_livenodes);
1439 debugfs_remove(reg->hr_debug_livenodes);
1440 debugfs_remove(reg->hr_debug_regnum);
1441 debugfs_remove(reg->hr_debug_elapsed_time);
1442 debugfs_remove(reg->hr_debug_pinned);
1443 debugfs_remove(reg->hr_debug_dir);
1444
1081 spin_lock(&o2hb_live_lock); 1445 spin_lock(&o2hb_live_lock);
1082 list_del(&reg->hr_all_item); 1446 list_del(&reg->hr_all_item);
1083 spin_unlock(&o2hb_live_lock); 1447 spin_unlock(&o2hb_live_lock);
@@ -1365,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1365 goto out; 1729 goto out;
1366 1730
1367 reg->hr_bdev = I_BDEV(filp->f_mapping->host); 1731 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1368 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); 1732 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1369 if (ret) { 1733 if (ret) {
1370 reg->hr_bdev = NULL; 1734 reg->hr_bdev = NULL;
1371 goto out; 1735 goto out;
@@ -1441,6 +1805,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1441 /* Ok, we were woken. Make sure it wasn't by drop_item() */ 1805 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1442 spin_lock(&o2hb_live_lock); 1806 spin_lock(&o2hb_live_lock);
1443 hb_task = reg->hr_task; 1807 hb_task = reg->hr_task;
1808 if (o2hb_global_heartbeat_active())
1809 set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
1444 spin_unlock(&o2hb_live_lock); 1810 spin_unlock(&o2hb_live_lock);
1445 1811
1446 if (hb_task) 1812 if (hb_task)
@@ -1448,6 +1814,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1448 else 1814 else
1449 ret = -EIO; 1815 ret = -EIO;
1450 1816
1817 if (hb_task && o2hb_global_heartbeat_active())
1818 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
1819 config_item_name(&reg->hr_item));
1820
1451out: 1821out:
1452 if (filp) 1822 if (filp)
1453 fput(filp); 1823 fput(filp);
@@ -1586,22 +1956,113 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1586 : NULL; 1956 : NULL;
1587} 1957}
1588 1958
1959static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1960{
1961 int ret = -ENOMEM;
1962
1963 reg->hr_debug_dir =
1964 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
1965 if (!reg->hr_debug_dir) {
1966 mlog_errno(ret);
1967 goto bail;
1968 }
1969
1970 reg->hr_debug_livenodes =
1971 o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1972 reg->hr_debug_dir,
1973 &(reg->hr_db_livenodes),
1974 sizeof(*(reg->hr_db_livenodes)),
1975 O2HB_DB_TYPE_REGION_LIVENODES,
1976 sizeof(reg->hr_live_node_bitmap),
1977 O2NM_MAX_NODES, reg);
1978 if (!reg->hr_debug_livenodes) {
1979 mlog_errno(ret);
1980 goto bail;
1981 }
1982
1983 reg->hr_debug_regnum =
1984 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
1985 reg->hr_debug_dir,
1986 &(reg->hr_db_regnum),
1987 sizeof(*(reg->hr_db_regnum)),
1988 O2HB_DB_TYPE_REGION_NUMBER,
1989 0, O2NM_MAX_NODES, reg);
1990 if (!reg->hr_debug_regnum) {
1991 mlog_errno(ret);
1992 goto bail;
1993 }
1994
1995 reg->hr_debug_elapsed_time =
1996 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
1997 reg->hr_debug_dir,
1998 &(reg->hr_db_elapsed_time),
1999 sizeof(*(reg->hr_db_elapsed_time)),
2000 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2001 0, 0, reg);
2002 if (!reg->hr_debug_elapsed_time) {
2003 mlog_errno(ret);
2004 goto bail;
2005 }
2006
2007 reg->hr_debug_pinned =
2008 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
2009 reg->hr_debug_dir,
2010 &(reg->hr_db_pinned),
2011 sizeof(*(reg->hr_db_pinned)),
2012 O2HB_DB_TYPE_REGION_PINNED,
2013 0, 0, reg);
2014 if (!reg->hr_debug_pinned) {
2015 mlog_errno(ret);
2016 goto bail;
2017 }
2018
2019 ret = 0;
2020bail:
2021 return ret;
2022}
2023
1589static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 2024static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1590 const char *name) 2025 const char *name)
1591{ 2026{
1592 struct o2hb_region *reg = NULL; 2027 struct o2hb_region *reg = NULL;
2028 int ret;
1593 2029
1594 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 2030 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1595 if (reg == NULL) 2031 if (reg == NULL)
1596 return ERR_PTR(-ENOMEM); 2032 return ERR_PTR(-ENOMEM);
1597 2033
1598 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 2034 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
2035 ret = -ENAMETOOLONG;
2036 goto free;
2037 }
1599 2038
1600 spin_lock(&o2hb_live_lock); 2039 spin_lock(&o2hb_live_lock);
2040 reg->hr_region_num = 0;
2041 if (o2hb_global_heartbeat_active()) {
2042 reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
2043 O2NM_MAX_REGIONS);
2044 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
2045 spin_unlock(&o2hb_live_lock);
2046 ret = -EFBIG;
2047 goto free;
2048 }
2049 set_bit(reg->hr_region_num, o2hb_region_bitmap);
2050 }
1601 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 2051 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1602 spin_unlock(&o2hb_live_lock); 2052 spin_unlock(&o2hb_live_lock);
1603 2053
2054 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
2055
2056 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
2057 if (ret) {
2058 config_item_put(&reg->hr_item);
2059 goto free;
2060 }
2061
1604 return &reg->hr_item; 2062 return &reg->hr_item;
2063free:
2064 kfree(reg);
2065 return ERR_PTR(ret);
1605} 2066}
1606 2067
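
In global mode the make_item() path above hands out hr_region_num under o2hb_live_lock using find_first_zero_bit(), failing with -EFBIG once all O2NM_MAX_REGIONS slots are taken. A toy userspace version of that allocator (the single-word bitmap and MAX_REGIONS value are simplifications):

#include <errno.h>
#include <stdio.h>

#define MAX_REGIONS 32                  /* stand-in for O2NM_MAX_REGIONS */
static unsigned long region_bitmap;     /* stand-in for o2hb_region_bitmap */

static int alloc_region_num(void)
{
	for (int i = 0; i < MAX_REGIONS; i++) {
		if (!(region_bitmap & (1UL << i))) {
			region_bitmap |= 1UL << i;  /* claim the slot */
			return i;
		}
	}
	return -EFBIG;                  /* all region numbers in use */
}

int main(void)
{
	printf("%d %d\n", alloc_region_num(), alloc_region_num()); /* 0 1 */
	return 0;
}
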
1607static void o2hb_heartbeat_group_drop_item(struct config_group *group, 2068static void o2hb_heartbeat_group_drop_item(struct config_group *group,
@@ -1609,11 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1609{ 2070{
1610 struct task_struct *hb_task; 2071 struct task_struct *hb_task;
1611 struct o2hb_region *reg = to_o2hb_region(item); 2072 struct o2hb_region *reg = to_o2hb_region(item);
2073 int quorum_region = 0;
1612 2074
1613 /* stop the thread when the user removes the region dir */ 2075 /* stop the thread when the user removes the region dir */
1614 spin_lock(&o2hb_live_lock); 2076 spin_lock(&o2hb_live_lock);
2077 if (o2hb_global_heartbeat_active()) {
2078 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2079 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2080 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2081 quorum_region = 1;
2082 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2083 }
1615 hb_task = reg->hr_task; 2084 hb_task = reg->hr_task;
1616 reg->hr_task = NULL; 2085 reg->hr_task = NULL;
2086 reg->hr_item_dropped = 1;
1617 spin_unlock(&o2hb_live_lock); 2087 spin_unlock(&o2hb_live_lock);
1618 2088
1619 if (hb_task) 2089 if (hb_task)
@@ -1628,7 +2098,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1628 wake_up(&o2hb_steady_queue); 2098 wake_up(&o2hb_steady_queue);
1629 } 2099 }
1630 2100
2101 if (o2hb_global_heartbeat_active())
2102 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2103 config_item_name(&reg->hr_item));
2104
1631 config_item_put(item); 2105 config_item_put(item);
2106
2107 if (!o2hb_global_heartbeat_active() || !quorum_region)
2108 return;
2109
2110 /*
2111 * If global heartbeat active and there are dependent users,
2112 * pin all regions if quorum region count <= CUT_OFF
2113 */
2114 spin_lock(&o2hb_live_lock);
2115
2116 if (!o2hb_dependent_users)
2117 goto unlock;
2118
2119 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2120 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2121 o2hb_region_pin(NULL);
2122
2123unlock:
2124 spin_unlock(&o2hb_live_lock);
1632} 2125}
1633 2126
1634struct o2hb_heartbeat_group_attribute { 2127struct o2hb_heartbeat_group_attribute {
@@ -1688,6 +2181,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
1688 return count; 2181 return count;
1689} 2182}
1690 2183
2184static
2185ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
2186 char *page)
2187{
2188 return sprintf(page, "%s\n",
2189 o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
2190}
2191
2192static
2193ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2194 const char *page, size_t count)
2195{
2196 unsigned int i;
2197 int ret;
2198 size_t len;
2199
2200 len = (page[count - 1] == '\n') ? count - 1 : count;
2201 if (!len)
2202 return -EINVAL;
2203
2204 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2205 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2206 continue;
2207
2208 ret = o2hb_global_hearbeat_mode_set(i);
2209 if (!ret)
2210 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2211 o2hb_heartbeat_mode_desc[i]);
2212 return count;
2213 }
2214
2215 return -EINVAL;
2216
2217}
2218
1691static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { 2219static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1692 .attr = { .ca_owner = THIS_MODULE, 2220 .attr = { .ca_owner = THIS_MODULE,
1693 .ca_name = "dead_threshold", 2221 .ca_name = "dead_threshold",
@@ -1696,8 +2224,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
1696 .store = o2hb_heartbeat_group_threshold_store, 2224 .store = o2hb_heartbeat_group_threshold_store,
1697}; 2225};
1698 2226
2227static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
2228 .attr = { .ca_owner = THIS_MODULE,
2229 .ca_name = "mode",
2230 .ca_mode = S_IRUGO | S_IWUSR },
2231 .show = o2hb_heartbeat_group_mode_show,
2232 .store = o2hb_heartbeat_group_mode_store,
2233};
2234
1699static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { 2235static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1700 &o2hb_heartbeat_group_attr_threshold.attr, 2236 &o2hb_heartbeat_group_attr_threshold.attr,
2237 &o2hb_heartbeat_group_attr_mode.attr,
1701 NULL, 2238 NULL,
1702}; 2239};
1703 2240
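
Usage note: the new "mode" attribute sits next to "dead_threshold" in the o2cb configfs tree, typically /sys/kernel/config/cluster/<cluster>/heartbeat/mode. Writes are matched case-insensitively against the strings in o2hb_heartbeat_mode_desc[] ("local", "global") via strnicmp() above, and o2hb_global_hearbeat_mode_set() only accepts the switch while the region list is empty, i.e. before any heartbeat region is configured.
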
@@ -1770,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1770} 2307}
1771EXPORT_SYMBOL_GPL(o2hb_setup_callback); 2308EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1772 2309
1773static struct o2hb_region *o2hb_find_region(const char *region_uuid) 2310/*
2311 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2312 * In global heartbeat mode, region_uuid passed is NULL.
2313 *
2314 * In local, we only pin the matching region. In global we pin all the active
2315 * regions.
2316 */
2317static int o2hb_region_pin(const char *region_uuid)
1774{ 2318{
1775 struct o2hb_region *p, *reg = NULL; 2319 int ret = 0, found = 0;
2320 struct o2hb_region *reg;
2321 char *uuid;
1776 2322
1777 assert_spin_locked(&o2hb_live_lock); 2323 assert_spin_locked(&o2hb_live_lock);
1778 2324
1779 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { 2325 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
1780 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { 2326 uuid = config_item_name(&reg->hr_item);
1781 reg = p; 2327
1782 break; 2328 /* local heartbeat */
2329 if (region_uuid) {
2330 if (strcmp(region_uuid, uuid))
2331 continue;
2332 found = 1;
2333 }
2334
2335 if (reg->hr_item_pinned || reg->hr_item_dropped)
2336 goto skip_pin;
2337
2338 /* Ignore ENOENT only for local hb (userdlm domain) */
2339 ret = o2nm_depend_item(&reg->hr_item);
2340 if (!ret) {
2341 mlog(ML_CLUSTER, "Pin region %s\n", uuid);
2342 reg->hr_item_pinned = 1;
2343 } else {
2344 if (ret == -ENOENT && found)
2345 ret = 0;
2346 else {
2347 mlog(ML_ERROR, "Pin region %s fails with %d\n",
2348 uuid, ret);
2349 break;
2350 }
1783 } 2351 }
2352skip_pin:
2353 if (found)
2354 break;
1784 } 2355 }
1785 2356
1786 return reg; 2357 return ret;
1787} 2358}
1788 2359
1789static int o2hb_region_get(const char *region_uuid) 2360/*
2361 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2362 * In global heartbeat mode, region_uuid passed is NULL.
2363 *
2364 * In local, we only unpin the matching region. In global we unpin all the
2365 * active regions.
2366 */
2367static void o2hb_region_unpin(const char *region_uuid)
1790{ 2368{
1791 int ret = 0;
1792 struct o2hb_region *reg; 2369 struct o2hb_region *reg;
2370 char *uuid;
2371 int found = 0;
1793 2372
1794 spin_lock(&o2hb_live_lock); 2373 assert_spin_locked(&o2hb_live_lock);
1795 2374
1796 reg = o2hb_find_region(region_uuid); 2375 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
1797 if (!reg) 2376 uuid = config_item_name(&reg->hr_item);
1798 ret = -ENOENT; 2377 if (region_uuid) {
1799 spin_unlock(&o2hb_live_lock); 2378 if (strcmp(region_uuid, uuid))
2379 continue;
2380 found = 1;
2381 }
1800 2382
1801 if (ret) 2383 if (reg->hr_item_pinned) {
1802 goto out; 2384 mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
2385 o2nm_undepend_item(&reg->hr_item);
2386 reg->hr_item_pinned = 0;
2387 }
2388 if (found)
2389 break;
2390 }
2391}
1803 2392
1804 ret = o2nm_depend_this_node(); 2393static int o2hb_region_inc_user(const char *region_uuid)
1805 if (ret) 2394{
1806 goto out; 2395 int ret = 0;
1807 2396
1808 ret = o2nm_depend_item(&reg->hr_item); 2397 spin_lock(&o2hb_live_lock);
1809 if (ret)
1810 o2nm_undepend_this_node();
1811 2398
1812out: 2399 /* local heartbeat */
2400 if (!o2hb_global_heartbeat_active()) {
2401 ret = o2hb_region_pin(region_uuid);
2402 goto unlock;
2403 }
2404
2405 /*
2406 * if global heartbeat active and this is the first dependent user,
2407 * pin all regions if quorum region count <= CUT_OFF
2408 */
2409 o2hb_dependent_users++;
2410 if (o2hb_dependent_users > 1)
2411 goto unlock;
2412
2413 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2414 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2415 ret = o2hb_region_pin(NULL);
2416
2417unlock:
2418 spin_unlock(&o2hb_live_lock);
1813 return ret; 2419 return ret;
1814} 2420}
1815 2421
1816static void o2hb_region_put(const char *region_uuid) 2422void o2hb_region_dec_user(const char *region_uuid)
1817{ 2423{
1818 struct o2hb_region *reg;
1819
1820 spin_lock(&o2hb_live_lock); 2424 spin_lock(&o2hb_live_lock);
1821 2425
1822 reg = o2hb_find_region(region_uuid); 2426 /* local heartbeat */
2427 if (!o2hb_global_heartbeat_active()) {
2428 o2hb_region_unpin(region_uuid);
2429 goto unlock;
2430 }
1823 2431
1824 spin_unlock(&o2hb_live_lock); 2432 /*
2433 * if global heartbeat active and there are no dependent users,
2434 * unpin all quorum regions
2435 */
2436 o2hb_dependent_users--;
2437 if (!o2hb_dependent_users)
2438 o2hb_region_unpin(NULL);
1825 2439
1826 if (reg) { 2440unlock:
1827 o2nm_undepend_item(&reg->hr_item); 2441 spin_unlock(&o2hb_live_lock);
1828 o2nm_undepend_this_node();
1829 }
1830} 2442}
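
Taken together, o2hb_region_inc_user()/o2hb_region_dec_user() form a pin reference count: only the 0 -> 1 transition pins (and, in global mode, only while the quorum region count is at or below O2HB_PIN_CUT_OFF), and only the 1 -> 0 transition unpins. A compilable sketch of that rule, with stand-in stubs:

#include <stdio.h>

static int dependent_users;

static void pin_all(void)   { puts("pin all regions"); }
static void unpin_all(void) { puts("unpin all regions"); }

/* first user pins, but only at or below the cut-off */
static void inc_user(int quorum_regions, int cut_off)
{
	if (++dependent_users > 1)
		return;			/* later users ride the existing pin */
	if (quorum_regions <= cut_off)
		pin_all();
}

/* last user out unpins */
static void dec_user(void)
{
	if (!--dependent_users)
		unpin_all();
}

int main(void)
{
	inc_user(3, 3);		/* pins   */
	inc_user(3, 3);		/* no-op  */
	dec_user();		/* no-op  */
	dec_user();		/* unpins */
	return 0;
}
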
1831 2443
1832int o2hb_register_callback(const char *region_uuid, 2444int o2hb_register_callback(const char *region_uuid,
@@ -1847,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
1847 } 2459 }
1848 2460
1849 if (region_uuid) { 2461 if (region_uuid) {
1850 ret = o2hb_region_get(region_uuid); 2462 ret = o2hb_region_inc_user(region_uuid);
1851 if (ret) 2463 if (ret) {
2464 mlog_errno(ret);
1852 goto out; 2465 goto out;
2466 }
1853 } 2467 }
1854 2468
1855 down_write(&o2hb_callback_sem); 2469 down_write(&o2hb_callback_sem);
@@ -1867,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
1867 up_write(&o2hb_callback_sem); 2481 up_write(&o2hb_callback_sem);
1868 ret = 0; 2482 ret = 0;
1869out: 2483out:
1870 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", 2484 mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
1871 ret, __builtin_return_address(0), hc); 2485 ret, __builtin_return_address(0), hc);
1872 return ret; 2486 return ret;
1873} 2487}
@@ -1878,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
1878{ 2492{
1879 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 2493 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1880 2494
1881 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 2495 mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
1882 __builtin_return_address(0), hc); 2496 __builtin_return_address(0), hc);
1883 2497
1884 /* XXX Can this happen _with_ a region reference? */ 2498 /* XXX Can this happen _with_ a region reference? */
@@ -1886,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
1886 return; 2500 return;
1887 2501
1888 if (region_uuid) 2502 if (region_uuid)
1889 o2hb_region_put(region_uuid); 2503 o2hb_region_dec_user(region_uuid);
1890 2504
1891 down_write(&o2hb_callback_sem); 2505 down_write(&o2hb_callback_sem);
1892 2506
@@ -1963,3 +2577,34 @@ void o2hb_stop_all_regions(void)
1963 spin_unlock(&o2hb_live_lock); 2577 spin_unlock(&o2hb_live_lock);
1964} 2578}
1965EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); 2579EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
2580
2581int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2582{
2583 struct o2hb_region *reg;
2584 int numregs = 0;
2585 char *p;
2586
2587 spin_lock(&o2hb_live_lock);
2588
2589 p = region_uuids;
2590 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2591 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2592 if (numregs < max_regions) {
2593 memcpy(p, config_item_name(&reg->hr_item),
2594 O2HB_MAX_REGION_NAME_LEN);
2595 p += O2HB_MAX_REGION_NAME_LEN;
2596 }
2597 numregs++;
2598 }
2599
2600 spin_unlock(&o2hb_live_lock);
2601
2602 return numregs;
2603}
2604EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
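
A caller must size region_uuids as max_regions * O2HB_MAX_REGION_NAME_LEN bytes; a return value above max_regions means more regions exist than were copied. A hypothetical caller, using the constants from heartbeat.h and ocfs2_nodemanager.h in this diff:

	char uuids[O2NM_MAX_REGIONS * O2HB_MAX_REGION_NAME_LEN];
	int numregs;

	numregs = o2hb_get_all_regions(uuids, O2NM_MAX_REGIONS);
	if (numregs > O2NM_MAX_REGIONS)
		/* truncated: only the first O2NM_MAX_REGIONS names copied */
		numregs = O2NM_MAX_REGIONS;
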
2605
2606int o2hb_global_heartbeat_active(void)
2607{
2608 return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
2609}
2610EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
31 31
32#define O2HB_REGION_TIMEOUT_MS 2000 32#define O2HB_REGION_TIMEOUT_MS 2000
33 33
34#define O2HB_MAX_REGION_NAME_LEN 32
35
34/* number of changes to be seen as live */ 36/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2 37#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 38/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
81int o2hb_check_node_heartbeating_from_callback(u8 node_num); 83int o2hb_check_node_heartbeating_from_callback(u8 node_num);
82int o2hb_check_local_node_heartbeating(void); 84int o2hb_check_local_node_heartbeating(void);
83void o2hb_stop_all_regions(void); 85void o2hb_stop_all_regions(void);
86int o2hb_get_all_regions(char *region_uuids, u8 numregions);
87int o2hb_global_heartbeat_active(void);
84 88
85#endif /* O2CLUSTER_HEARTBEAT_H */ 89#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392d..6c61771469af 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS), 115 define_mask(BASTS),
116 define_mask(RESERVATIONS),
117 define_mask(CLUSTER),
116 define_mask(ERROR), 118 define_mask(ERROR),
117 define_mask(NOTICE), 119 define_mask(NOTICE),
118 define_mask(KTHREAD), 120 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
120}; 121};
121 122
122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 123static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..34d6544357d9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
81#include <linux/sched.h> 81#include <linux/sched.h>
82 82
83/* bits that are frequently given and infrequently matched in the low word */ 83/* bits that are frequently given and infrequently matched in the low word */
84/* NOTE: If you add a flag, you need to also update mlog.c! */ 84/* NOTE: If you add a flag, you need to also update masklog.c! */
85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ 85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
86#define ML_EXIT 0x0000000000000002ULL /* func call exit */ 86#define ML_EXIT 0x0000000000000002ULL /* func call exit */
87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ 87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,12 +114,14 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ 117#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
118#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
119#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
120
118/* bits that are infrequently given and frequently matched in the high word */ 121/* bits that are infrequently given and frequently matched in the high word */
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 122#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 123#define ML_NOTICE 0x2000000000000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 124#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123 125
124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 126#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 127#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
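
The reshuffle packs the frequently-given masks contiguously in the low half and moves the frequently-matched ERROR/NOTICE/KTHREAD bits to the top of the 64-bit word (it also fixes ML_BASTS, which had been assigned a value in the range meant for the infrequent masks). The filtering itself is a plain AND against the enabled set; a userspace model:

#include <stdint.h>
#include <stdio.h>

#define ML_CLUSTER	0x0000000400000000ULL	/* frequently given   */
#define ML_ERROR	0x1000000000000000ULL	/* frequently matched */

static uint64_t mlog_and_mask = ML_ERROR;	/* like MLOG_INITIAL_AND_MASK */

static void mlog(uint64_t mask, const char *msg)
{
	if (mask & mlog_and_mask)
		fputs(msg, stdout);
}

int main(void)
{
	mlog(ML_ERROR, "printed\n");
	mlog(ML_CLUSTER, "suppressed\n");	/* bit not in the and-mask */
	return 0;
}
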
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b02..3a5835904b3d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
46#define O2NET_DEBUG_DIR "o2net" 46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats"
50
51#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1
49 53
50static struct dentry *o2net_dentry; 54static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry; 55static struct dentry *sc_dentry;
52static struct dentry *nst_dentry; 56static struct dentry *nst_dentry;
57static struct dentry *stats_dentry;
53 58
54static DEFINE_SPINLOCK(o2net_debug_lock); 59static DEFINE_SPINLOCK(o2net_debug_lock);
55 60
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
123static int nst_seq_show(struct seq_file *seq, void *v) 128static int nst_seq_show(struct seq_file *seq, void *v)
124{ 129{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private; 130 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
131 ktime_t now;
132 s64 sock, send, status;
126 133
127 spin_lock(&o2net_debug_lock); 134 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst); 135 nst = next_nst(dummy_nst);
136 if (!nst)
137 goto out;
129 138
130 if (nst != NULL) { 139 now = ktime_get();
131 /* get_task_comm isn't exported. oh well. */ 140 sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
132 seq_printf(seq, "%p:\n" 141 send = ktime_to_us(ktime_sub(now, nst->st_send_time));
133 " pid: %lu\n" 142 status = ktime_to_us(ktime_sub(now, nst->st_status_time));
134 " tgid: %lu\n" 143
135 " process name: %s\n" 144 /* get_task_comm isn't exported. oh well. */
136 " node: %u\n" 145 seq_printf(seq, "%p:\n"
137 " sc: %p\n" 146 " pid: %lu\n"
138 " message id: %d\n" 147 " tgid: %lu\n"
139 " message type: %u\n" 148 " process name: %s\n"
140 " message key: 0x%08x\n" 149 " node: %u\n"
141 " sock acquiry: %lu.%ld\n" 150 " sc: %p\n"
142 " send start: %lu.%ld\n" 151 " message id: %d\n"
143 " wait start: %lu.%ld\n", 152 " message type: %u\n"
144 nst, (unsigned long)nst->st_task->pid, 153 " message key: 0x%08x\n"
145 (unsigned long)nst->st_task->tgid, 154 " sock acquiry: %lld usecs ago\n"
146 nst->st_task->comm, nst->st_node, 155 " send start: %lld usecs ago\n"
147 nst->st_sc, nst->st_id, nst->st_msg_type, 156 " wait start: %lld usecs ago\n",
148 nst->st_msg_key, 157 nst, (unsigned long)task_pid_nr(nst->st_task),
149 nst->st_sock_time.tv_sec, 158 (unsigned long)nst->st_task->tgid,
150 (long)nst->st_sock_time.tv_usec, 159 nst->st_task->comm, nst->st_node,
151 nst->st_send_time.tv_sec, 160 nst->st_sc, nst->st_id, nst->st_msg_type,
152 (long)nst->st_send_time.tv_usec, 161 nst->st_msg_key,
153 nst->st_status_time.tv_sec, 162 (long long)sock,
154 (long)nst->st_status_time.tv_usec); 163 (long long)send,
155 } 164 (long long)status);
156 165
166out:
157 spin_unlock(&o2net_debug_lock); 167 spin_unlock(&o2net_debug_lock);
158 168
159 return 0; 169 return 0;
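
The conversion replaces three struct timeval pairs with single ktime_t stamps and reports ages as one microsecond delta each, avoiding both the old seconds/microseconds split and wall-clock jumps (ktime_get() is monotonic where do_gettimeofday() was not). The same pattern in portable userspace, with CLOCK_MONOTONIC standing in for ktime_get():

#include <stdio.h>
#include <time.h>

static long long now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;
}

int main(void)
{
	long long sock_time = now_us();	/* like o2net_set_nst_sock_time() */

	/* ... send work would happen here ... */
	printf("sock acquiry: %lld usecs ago\n", now_us() - sock_time);
	return 0;
}
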
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
228 spin_unlock(&o2net_debug_lock); 238 spin_unlock(&o2net_debug_lock);
229} 239}
230 240
241struct o2net_sock_debug {
242 int dbg_ctxt;
243 struct o2net_sock_container *dbg_sock;
244};
245
231static struct o2net_sock_container 246static struct o2net_sock_container
232 *next_sc(struct o2net_sock_container *sc_start) 247 *next_sc(struct o2net_sock_container *sc_start)
233{ 248{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
253 268
254static void *sc_seq_start(struct seq_file *seq, loff_t *pos) 269static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
255{ 270{
256 struct o2net_sock_container *sc, *dummy_sc = seq->private; 271 struct o2net_sock_debug *sd = seq->private;
272 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
257 273
258 spin_lock(&o2net_debug_lock); 274 spin_lock(&o2net_debug_lock);
259 sc = next_sc(dummy_sc); 275 sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
264 280
265static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 281static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{ 282{
267 struct o2net_sock_container *sc, *dummy_sc = seq->private; 283 struct o2net_sock_debug *sd = seq->private;
284 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
268 285
269 spin_lock(&o2net_debug_lock); 286 spin_lock(&o2net_debug_lock);
270 sc = next_sc(dummy_sc); 287 sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 293 return sc; /* unused, just needs to be null when done */
277} 294}
278 295
279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 296#ifdef CONFIG_OCFS2_FS_STATS
297# define sc_send_count(_s) ((_s)->sc_send_count)
298# define sc_recv_count(_s) ((_s)->sc_recv_count)
299# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
300# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
301# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
302# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
303#else
304# define sc_send_count(_s) (0U)
305# define sc_recv_count(_s) (0U)
306# define sc_tv_acquiry_total_ns(_s) (0LL)
307# define sc_tv_send_total_ns(_s) (0LL)
308# define sc_tv_status_total_ns(_s) (0LL)
309# define sc_tv_process_total_ns(_s) (0LL)
310#endif
311
312/* So that debugfs.ocfs2 can determine which format is being used */
313#define O2NET_STATS_STR_VERSION 1
314static void sc_show_sock_stats(struct seq_file *seq,
315 struct o2net_sock_container *sc)
316{
317 if (!sc)
318 return;
319
320 seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
321 sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
322 (long long)sc_tv_acquiry_total_ns(sc),
323 (long long)sc_tv_send_total_ns(sc),
324 (long long)sc_tv_status_total_ns(sc),
325 (unsigned long)sc_recv_count(sc),
326 (long long)sc_tv_process_total_ns(sc));
327}
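
Since debugfs.ocfs2 consumes this file, the line is versioned and strictly ordered: version, node, send_count, acquiry_ns, send_ns, status_ns, recv_count, process_ns. A hypothetical userspace consumer could read one line back with a format string mirroring the printf above:

	/* given one line read from the stats file: */
	int ver;
	unsigned int node;
	unsigned long sends, recvs;
	long long acquiry_ns, send_ns, status_ns, process_ns;

	sscanf(line, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld",
	       &ver, &node, &sends, &acquiry_ns, &send_ns,
	       &status_ns, &recvs, &process_ns);
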
328
329static void sc_show_sock_container(struct seq_file *seq,
330 struct o2net_sock_container *sc)
331{
332 struct inet_sock *inet = NULL;
333 __be32 saddr = 0, daddr = 0;
334 __be16 sport = 0, dport = 0;
335
336 if (!sc)
337 return;
338
339 if (sc->sc_sock) {
340 inet = inet_sk(sc->sc_sock->sk);
341 /* the stack's structs aren't sparse endian clean */
342 saddr = (__force __be32)inet->inet_saddr;
343 daddr = (__force __be32)inet->inet_daddr;
344 sport = (__force __be16)inet->inet_sport;
345 dport = (__force __be16)inet->inet_dport;
346 }
347
348 /* XXX sigh, inet-> doesn't have sparse annotation so any
349 * use of it here generates a warning with -Wbitwise */
350 seq_printf(seq, "%p:\n"
351 " krefs: %d\n"
352 " sock: %pI4:%u -> "
353 "%pI4:%u\n"
354 " remote node: %s\n"
355 " page off: %zu\n"
356 " handshake ok: %u\n"
357 " timer: %lld usecs\n"
358 " data ready: %lld usecs\n"
359 " advance start: %lld usecs\n"
360 " advance stop: %lld usecs\n"
361 " func start: %lld usecs\n"
362 " func stop: %lld usecs\n"
363 " func key: 0x%08x\n"
364 " func type: %u\n",
365 sc,
366 atomic_read(&sc->sc_kref.refcount),
367 &saddr, inet ? ntohs(sport) : 0,
368 &daddr, inet ? ntohs(dport) : 0,
369 sc->sc_node->nd_name,
370 sc->sc_page_off,
371 sc->sc_handshake_ok,
372 (long long)ktime_to_us(sc->sc_tv_timer),
373 (long long)ktime_to_us(sc->sc_tv_data_ready),
374 (long long)ktime_to_us(sc->sc_tv_advance_start),
375 (long long)ktime_to_us(sc->sc_tv_advance_stop),
376 (long long)ktime_to_us(sc->sc_tv_func_start),
377 (long long)ktime_to_us(sc->sc_tv_func_stop),
378 sc->sc_msg_key,
379 sc->sc_msg_type);
380}
280 381
281static int sc_seq_show(struct seq_file *seq, void *v) 382static int sc_seq_show(struct seq_file *seq, void *v)
282{ 383{
283 struct o2net_sock_container *sc, *dummy_sc = seq->private; 384 struct o2net_sock_debug *sd = seq->private;
385 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
284 386
285 spin_lock(&o2net_debug_lock); 387 spin_lock(&o2net_debug_lock);
286 sc = next_sc(dummy_sc); 388 sc = next_sc(dummy_sc);
287 389
288 if (sc != NULL) { 390 if (sc) {
289 struct inet_sock *inet = NULL; 391 if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
290 392 sc_show_sock_container(seq, sc);
291 __be32 saddr = 0, daddr = 0; 393 else
292 __be16 sport = 0, dport = 0; 394 sc_show_sock_stats(seq, sc);
293
294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->inet_dport;
301 }
302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any
304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n"
307 " sock: %pI4:%u -> "
308 "%pI4:%u\n"
309 " remote node: %s\n"
310 " page off: %zu\n"
311 " handshake ok: %u\n"
312 " timer: %lu.%ld\n"
313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%ld\n"
317 " func stop: %lu.%ld\n"
318 " func key: %u\n"
319 " func type: %u\n",
320 sc,
321 atomic_read(&sc->sc_kref.refcount),
322 &saddr, inet ? ntohs(sport) : 0,
323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name,
325 sc->sc_page_off,
326 sc->sc_handshake_ok,
327 TV_SEC_USEC(sc->sc_tv_timer),
328 TV_SEC_USEC(sc->sc_tv_data_ready),
329 TV_SEC_USEC(sc->sc_tv_advance_start),
330 TV_SEC_USEC(sc->sc_tv_advance_stop),
331 TV_SEC_USEC(sc->sc_tv_func_start),
332 TV_SEC_USEC(sc->sc_tv_func_stop),
333 sc->sc_msg_key,
334 sc->sc_msg_type);
335 } 395 }
336 396
337
338 spin_unlock(&o2net_debug_lock); 397 spin_unlock(&o2net_debug_lock);
339 398
340 return 0; 399 return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
351 .show = sc_seq_show, 410 .show = sc_seq_show,
352}; 411};
353 412
354static int sc_fop_open(struct inode *inode, struct file *file) 413static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
355{ 414{
356 struct o2net_sock_container *dummy_sc; 415 struct o2net_sock_container *dummy_sc;
357 struct seq_file *seq; 416 struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
369 goto out; 428 goto out;
370 429
371 seq = file->private_data; 430 seq = file->private_data;
372 seq->private = dummy_sc; 431 seq->private = sd;
432 sd->dbg_sock = dummy_sc;
373 o2net_debug_add_sc(dummy_sc); 433 o2net_debug_add_sc(dummy_sc);
374 434
375 dummy_sc = NULL; 435 dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
382static int sc_fop_release(struct inode *inode, struct file *file) 442static int sc_fop_release(struct inode *inode, struct file *file)
383{ 443{
384 struct seq_file *seq = file->private_data; 444 struct seq_file *seq = file->private_data;
385 struct o2net_sock_container *dummy_sc = seq->private; 445 struct o2net_sock_debug *sd = seq->private;
446 struct o2net_sock_container *dummy_sc = sd->dbg_sock;
386 447
387 o2net_debug_del_sc(dummy_sc); 448 o2net_debug_del_sc(dummy_sc);
388 return seq_release_private(inode, file); 449 return seq_release_private(inode, file);
389} 450}
390 451
452static int stats_fop_open(struct inode *inode, struct file *file)
453{
454 struct o2net_sock_debug *sd;
455
456 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
457 if (sd == NULL)
458 return -ENOMEM;
459
460 sd->dbg_ctxt = SHOW_SOCK_STATS;
461 sd->dbg_sock = NULL;
462
463 return sc_common_open(file, sd);
464}
465
466static const struct file_operations stats_seq_fops = {
467 .open = stats_fop_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = sc_fop_release,
471};
472
473static int sc_fop_open(struct inode *inode, struct file *file)
474{
475 struct o2net_sock_debug *sd;
476
477 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
478 if (sd == NULL)
479 return -ENOMEM;
480
481 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
482 sd->dbg_sock = NULL;
483
484 return sc_common_open(file, sd);
485}
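
Both open routines now allocate an o2net_sock_debug, tag it with the file's role, and hand it to sc_common_open(), so one set of seq operations serves two debugfs files. The shape of that per-open-context pattern in a reduced, compilable form (names are illustrative):

#include <stdio.h>

enum { SHOW_SOCK_CONTAINERS, SHOW_SOCK_STATS };

struct sock_debug { int dbg_ctxt; };	/* per-open context */

/* one show routine branches on the per-open tag */
static void show(FILE *out, struct sock_debug *sd)
{
	if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
		fputs("full container dump\n", out);
	else
		fputs("one stats line\n", out);
}

int main(void)
{
	struct sock_debug sc = { SHOW_SOCK_CONTAINERS };
	struct sock_debug st = { SHOW_SOCK_STATS };

	show(stdout, &sc);	/* the "sock_containers" file */
	show(stdout, &st);	/* the "stats" file */
	return 0;
}
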
486
391static const struct file_operations sc_seq_fops = { 487static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 488 .open = sc_fop_open,
393 .read = seq_read, 489 .read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
419 goto bail; 515 goto bail;
420 } 516 }
421 517
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
519 o2net_dentry, NULL,
520 &stats_seq_fops);
521 if (!stats_dentry) {
522 mlog_errno(-ENOMEM);
523 goto bail;
524 }
525
422 return 0; 526 return 0;
423bail: 527bail:
424 if (sc_dentry) 528 debugfs_remove(stats_dentry);
425 debugfs_remove(sc_dentry); 529 debugfs_remove(sc_dentry);
426 if (nst_dentry) 530 debugfs_remove(nst_dentry);
427 debugfs_remove(nst_dentry); 531 debugfs_remove(o2net_dentry);
428 if (o2net_dentry)
429 debugfs_remove(o2net_dentry);
430 return -ENOMEM; 532 return -ENOMEM;
431} 533}
432 534
433void o2net_debugfs_exit(void) 535void o2net_debugfs_exit(void)
434{ 536{
435 if (sc_dentry) 537 debugfs_remove(stats_dentry);
436 debugfs_remove(sc_dentry); 538 debugfs_remove(sc_dentry);
437 if (nst_dentry) 539 debugfs_remove(nst_dentry);
438 debugfs_remove(nst_dentry); 540 debugfs_remove(o2net_dentry);
439 if (o2net_dentry)
440 debugfs_remove(o2net_dentry);
441} 541}
442 542
443#endif /* CONFIG_DEBUG_FS */ 543#endif /* CONFIG_DEBUG_FS */
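
The if (dentry) guards disappear because debugfs_remove() treats a NULL (or error) dentry as a no-op, so both the error path and the exit path can tear down unconditionally:

	debugfs_remove(stats_dentry);	/* safe even if creation failed */
	debugfs_remove(sc_dentry);
	debugfs_remove(nst_dentry);
	debugfs_remove(o2net_dentry);
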
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
712 spin_lock_init(&node->nd_lock); 712 spin_lock_init(&node->nd_lock);
713 713
714 mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
715
714 return &node->nd_item; 716 return &node->nd_item;
715} 717}
716 718
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
744 } 746 }
745 write_unlock(&cluster->cl_nodes_lock); 747 write_unlock(&cluster->cl_nodes_lock);
746 748
749 mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
750 config_item_name(&node->nd_item));
751
747 config_item_put(item); 752 config_item_put(item);
748} 753}
749 754
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
36/* host name, group name, cluster name all 64 bytes */ 36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN 37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38 38
39/*
40 * Maximum number of global heartbeat regions allowed.
41 * **CAUTION** Changing this number will break dlm compatibility.
42 */
43#define O2NM_MAX_REGIONS 32
44
39#endif /* _OCFS2_NODEMANAGER_H */ 45#endif /* _OCFS2_NODEMANAGER_H */
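
The compatibility warning is concrete: O2NM_MAX_REGIONS is baked into the on-wire size of the dlm_query_region message added later in this diff, so nodes built with different values would disagree on the message layout:

	/* from dlmcommon.h below: 32 names x 32 bytes = 1024 wire bytes */
	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
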
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e16696216..a87366750f23 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
325 325
326void o2quo_exit(void) 326void o2quo_exit(void)
327{ 327{
328 flush_scheduled_work(); 328 struct o2quo_state *qs = &o2quo_state;
329
330 flush_work_sync(&qs->qs_work);
329} 331}
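
Flushing only the work item this code owns, rather than the whole shared kernel workqueue, removes a potential deadlock against unrelated queued work and follows the general retirement of flush_scheduled_work():

	flush_work_sync(&qs->qs_work);	/* wait for o2quo's own work only */
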
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index cbe2f057cc28..3b11cb1e38fc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 nst->st_sock_time = ktime_get();
159} 159}
160 160
161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 nst->st_send_time = ktime_get();
164} 164}
165 165
166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 nst->st_status_time = ktime_get();
169} 169}
170 170
171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
178 u32 msg_id)
178{ 179{
179 nst->st_id = msg_id; 180 nst->st_id = msg_id;
180} 181}
181 182
182#else /* CONFIG_DEBUG_FS */ 183static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{ 184{
185 sc->sc_tv_timer = ktime_get();
187} 186}
188 187
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 188static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
190{ 189{
190 sc->sc_tv_data_ready = ktime_get();
191} 191}
192 192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 193static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
194{ 194{
195 sc->sc_tv_advance_start = ktime_get();
195} 196}
196 197
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 198static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
198{ 199{
200 sc->sc_tv_advance_stop = ktime_get();
199} 201}
200 202
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 203static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
202 struct o2net_sock_container *sc)
203{ 204{
205 sc->sc_tv_func_start = ktime_get();
204} 206}
205 207
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 208static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
207 u32 msg_id)
208{ 209{
210 sc->sc_tv_func_stop = ktime_get();
209} 211}
210 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a)
220# define o2net_set_nst_send_time(a)
221# define o2net_set_nst_status_time(a)
222# define o2net_set_nst_sock_container(a, b)
223# define o2net_set_nst_msg_id(a, b)
224# define o2net_set_sock_timer(a)
225# define o2net_set_data_ready_time(a)
226# define o2net_set_advance_start_time(a)
227# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
211#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
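
With CONFIG_DEBUG_FS off, the timestamp setters now compile to nothing via empty macros instead of empty inline functions, so no arguments are evaluated and the ktime_t fields need not exist at all. The same compile-out pattern in a standalone form:

#include <stdio.h>

#define MY_DEBUG 0

#if MY_DEBUG
static long long stamp;
# define set_stamp(v)	(stamp = (v))
#else
# define set_stamp(v)	/* expands to nothing */
#endif

int main(void)
{
	set_stamp(42);	/* no code emitted when MY_DEBUG is 0 */
	puts("ok");
	return 0;
}
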
212 232
233#ifdef CONFIG_OCFS2_FS_STATS
234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc)
236{
237 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
238 ktime_sub(ktime_get(),
239 nst->st_status_time));
240 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
241 ktime_sub(nst->st_status_time,
242 nst->st_send_time));
243 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
244 ktime_sub(nst->st_send_time,
245 nst->st_sock_time));
246 sc->sc_send_count++;
247}
248
249static void o2net_update_recv_stats(struct o2net_sock_container *sc)
250{
251 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
252 o2net_get_func_run_time(sc));
253 sc->sc_recv_count++;
254}
255
256#else
257
258# define o2net_update_send_stats(a, b)
259
260# define o2net_update_recv_stats(sc)
261
262#endif /* CONFIG_OCFS2_FS_STATS */
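
Each completed send is decomposed into three intervals from the nst timestamps and accumulated per socket: acquiry (socket lookup to send start), send (send start to status wait), and status (status wait to completion). In plain arithmetic:

	/* timestamps: sock_time <= send_time <= status_time <= now */
	acquiry_total += send_time   - sock_time;
	send_total    += status_time - send_time;
	status_total  += now         - status_time;
	send_count++;
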
263
213static inline int o2net_reconnect_delay(void) 264static inline int o2net_reconnect_delay(void)
214{ 265{
215 return o2nm_single_cluster->cl_reconnect_delay_ms; 266 return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
355 sc->sc_sock = NULL; 406 sc->sc_sock = NULL;
356 } 407 }
357 408
409 o2nm_undepend_item(&sc->sc_node->nd_item);
358 o2nm_node_put(sc->sc_node); 410 o2nm_node_put(sc->sc_node);
359 sc->sc_node = NULL; 411 sc->sc_node = NULL;
360 412
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
376{ 428{
377 struct o2net_sock_container *sc, *ret = NULL; 429 struct o2net_sock_container *sc, *ret = NULL;
378 struct page *page = NULL; 430 struct page *page = NULL;
431 int status = 0;
379 432
380 page = alloc_page(GFP_NOFS); 433 page = alloc_page(GFP_NOFS);
381 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
386 o2nm_node_get(node); 439 o2nm_node_get(node);
387 sc->sc_node = node; 440 sc->sc_node = node;
388 441
442 /* pin the node item of the remote node */
443 status = o2nm_depend_item(&node->nd_item);
444 if (status) {
445 mlog_errno(status);
446 o2nm_node_put(node);
447 goto out;
448 }
389 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
390 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
391 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
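
Pinning the remote node's configfs item for the socket container's lifetime prevents the node from being unconfigured while a connection still references it; the matching unpin sits in sc_kref_release() above, a natural release point because it runs exactly once, when the last reference drops:

	status = o2nm_depend_item(&node->nd_item);	/* pin at sc_alloc() */
	/* ... socket lifetime ... */
	o2nm_undepend_item(&sc->sc_node->nd_item);	/* unpin at release  */
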
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
546 if (sk->sk_user_data) { 606 if (sk->sk_user_data) {
547 struct o2net_sock_container *sc = sk->sk_user_data; 607 struct o2net_sock_container *sc = sk->sk_user_data;
548 sclog(sc, "data_ready hit\n"); 608 sclog(sc, "data_ready hit\n");
549 do_gettimeofday(&sc->sc_tv_data_ready); 609 o2net_set_data_ready_time(sc);
550 o2net_sc_queue_work(sc, &sc->sc_rx_work); 610 o2net_sc_queue_work(sc, &sc->sc_rx_work);
551 ready = sc->sc_data_ready; 611 ready = sc->sc_data_ready;
552 } else { 612 } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1070 o2net_set_nst_status_time(&nst); 1130 o2net_set_nst_status_time(&nst);
1071 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1131 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
1072 1132
1133 o2net_update_send_stats(&nst, sc);
1134
1073 /* Note that we avoid overwriting the callers status return 1135 /* Note that we avoid overwriting the callers status return
1074 * variable if a system error was reported on the other 1136 * variable if a system error was reported on the other
1075 * side. Callers beware. */ 1137 * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1183 if (syserr != O2NET_ERR_NONE) 1245 if (syserr != O2NET_ERR_NONE)
1184 goto out_respond; 1246 goto out_respond;
1185 1247
1186 do_gettimeofday(&sc->sc_tv_func_start); 1248 o2net_set_func_start_time(sc);
1187 sc->sc_msg_key = be32_to_cpu(hdr->key); 1249 sc->sc_msg_key = be32_to_cpu(hdr->key);
1188 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1250 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1189 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1251 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1190 be16_to_cpu(hdr->data_len), 1252 be16_to_cpu(hdr->data_len),
1191 nmh->nh_func_data, &ret_data); 1253 nmh->nh_func_data, &ret_data);
1192 do_gettimeofday(&sc->sc_tv_func_stop); 1254 o2net_set_func_stop_time(sc);
1255
1256 o2net_update_recv_stats(sc);
1193 1257
1194out_respond: 1258out_respond:
1195 /* this destroys the hdr, so don't use it after this */ 1259 /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1300 size_t datalen; 1364 size_t datalen;
1301 1365
1302 sclog(sc, "receiving\n"); 1366 sclog(sc, "receiving\n");
1303 do_gettimeofday(&sc->sc_tv_advance_start); 1367 o2net_set_advance_start_time(sc);
1304 1368
1305 if (unlikely(sc->sc_handshake_ok == 0)) { 1369 if (unlikely(sc->sc_handshake_ok == 0)) {
1306 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1370 if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1375 1439
1376out: 1440out:
1377 sclog(sc, "ret = %d\n", ret); 1441 sclog(sc, "ret = %d\n", ret);
1378 do_gettimeofday(&sc->sc_tv_advance_stop); 1442 o2net_set_advance_stop_time(sc);
1379 return ret; 1443 return ret;
1380} 1444}
1381 1445
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
1475{ 1539{
1476 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1477 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1478 struct timeval now;
1479 1542
1480 do_gettimeofday(&now); 1543#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get();
1545#endif
1481 1546
1482 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1483 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1484 o2net_idle_timeout() / 1000, 1549 o2net_idle_timeout() / 1000,
1485 o2net_idle_timeout() % 1000); 1550 o2net_idle_timeout() % 1000);
1486 mlog(ML_NOTICE, "here are some times that might help debug the " 1551
1487 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1552#ifdef CONFIG_DEBUG_FS
1488 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1489 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1490 now.tv_sec, (long) now.tv_usec, 1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1491 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1492 sc->sc_tv_advance_start.tv_sec, 1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1493 (long) sc->sc_tv_advance_start.tv_usec, 1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1494 sc->sc_tv_advance_stop.tv_sec, 1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1495 (long) sc->sc_tv_advance_stop.tv_usec,
1496 sc->sc_msg_key, sc->sc_msg_type, 1560 sc->sc_msg_key, sc->sc_msg_type,
1497 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1498 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1499 1564
1500 /* 1565 /*
1501 * Initialize the nn_timeout so that the next connection attempt 1566 * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1511 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1576 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1512 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1577 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1513 msecs_to_jiffies(o2net_keepalive_delay())); 1578 msecs_to_jiffies(o2net_keepalive_delay()));
1514 do_gettimeofday(&sc->sc_tv_timer); 1579 o2net_set_sock_timer(sc);
1515 mod_timer(&sc->sc_idle_timeout, 1580 mod_timer(&sc->sc_idle_timeout,
1516 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1581 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1517} 1582}
@@ -1696,6 +1761,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1696{ 1761{
1697 o2quo_hb_down(node_num); 1762 o2quo_hb_down(node_num);
1698 1763
1764 if (!node)
1765 return;
1766
1699 if (node_num != o2nm_this_node()) 1767 if (node_num != o2nm_this_node())
1700 o2net_disconnect_node(node); 1768 o2net_disconnect_node(node);
1701 1769
@@ -1709,6 +1777,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1709 1777
1710 o2quo_hb_up(node_num); 1778 o2quo_hb_up(node_num);
1711 1779
1780 BUG_ON(!node);
1781
1712 /* ensure an immediate connect attempt */ 1782 /* ensure an immediate connect attempt */
1713 nn->nn_last_connect_attempt = jiffies - 1783 nn->nn_last_connect_attempt = jiffies -
1714 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1784 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
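
The asymmetry between the two callbacks appears deliberate: a node-up event implies the node is still configured (hence the BUG_ON), while a node-down event can presumably arrive after the node has already been removed from the local configuration, so it is tolerated with an early return:

	if (!node)	/* down event for a node no longer configured */
		return;
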
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530c..4cbcb65784a3 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
129 129
130struct o2net_sock_container { 130struct o2net_sock_container {
131 struct kref sc_kref; 131 struct kref sc_kref;
132 /* the next two are vaild for the life time of the sc */ 132 /* the next two are valid for the life time of the sc */
133 struct socket *sc_sock; 133 struct socket *sc_sock;
134 struct o2nm_node *sc_node; 134 struct o2nm_node *sc_node;
135 135
@@ -166,18 +166,27 @@ struct o2net_sock_container {
166 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
167 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
168 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
169#ifdef CONFIG_DEBUG_FS 169
170 struct list_head sc_net_debug_item;
171#endif
172 struct timeval sc_tv_timer;
173 struct timeval sc_tv_data_ready;
174 struct timeval sc_tv_advance_start;
175 struct timeval sc_tv_advance_stop;
176 struct timeval sc_tv_func_start;
177 struct timeval sc_tv_func_stop;
178 u32 sc_msg_key; 170 u32 sc_msg_key;
179 u16 sc_msg_type; 171 u16 sc_msg_type;
180 172
173#ifdef CONFIG_DEBUG_FS
174 struct list_head sc_net_debug_item;
175 ktime_t sc_tv_timer;
176 ktime_t sc_tv_data_ready;
177 ktime_t sc_tv_advance_start;
178 ktime_t sc_tv_advance_stop;
179 ktime_t sc_tv_func_start;
180 ktime_t sc_tv_func_stop;
181#endif
182#ifdef CONFIG_OCFS2_FS_STATS
183 ktime_t sc_tv_acquiry_total;
184 ktime_t sc_tv_send_total;
185 ktime_t sc_tv_status_total;
186 u32 sc_send_count;
187 u32 sc_recv_count;
188 ktime_t sc_tv_process_total;
189#endif
181 struct mutex sc_send_lock; 190 struct mutex sc_send_lock;
182}; 191};
183 192
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
220 u32 st_msg_type; 229 u32 st_msg_type;
221 u32 st_msg_key; 230 u32 st_msg_key;
222 u8 st_node; 231 u8 st_node;
223 struct timeval st_sock_time; 232 ktime_t st_sock_time;
224 struct timeval st_send_time; 233 ktime_t st_send_time;
225 struct timeval st_status_time; 234 ktime_t st_status_time;
226}; 235};
227#else 236#else
228struct o2net_send_tracking { 237struct o2net_send_tracking {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..6d80ecc7834f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,22 +40,45 @@
40#include "inode.h" 40#include "inode.h"
41#include "super.h" 41#include "super.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{
45 unsigned long gen =
46 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
47 BUG_ON(dentry->d_inode);
48 dentry->d_fsdata = (void *)gen;
49}
50
43 51
44static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
45 struct nameidata *nd) 53 struct nameidata *nd)
46{ 54{
47 struct inode *inode = dentry->d_inode; 55 struct inode *inode;
48 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
49 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 57 struct ocfs2_super *osb;
58
59 if (nd->flags & LOOKUP_RCU)
60 return -ECHILD;
61
62 inode = dentry->d_inode;
63 osb = OCFS2_SB(dentry->d_sb);
50 64
51 mlog_entry("(0x%p, '%.*s')\n", dentry, 65 mlog_entry("(0x%p, '%.*s')\n", dentry,
52 dentry->d_name.len, dentry->d_name.name); 66 dentry->d_name.len, dentry->d_name.name);
53 67
54 /* Never trust a negative dentry - force a new lookup. */ 68 /* For a negative dentry -
69 * check the generation number of the parent and compare with the
70 * one stored in the dentry.
71 */
55 if (inode == NULL) { 72 if (inode == NULL) {
56 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, 73 unsigned long gen = (unsigned long) dentry->d_fsdata;
57 dentry->d_name.name); 74 unsigned long pgen =
58 goto bail; 75 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
76 mlog(0, "negative dentry: %.*s parent gen: %lu "
77 "dentry gen: %lu\n",
78 dentry->d_name.len, dentry->d_name.name, pgen, gen);
79 if (gen != pgen)
80 goto bail;
81 goto valid;
59 } 82 }
60 83
61 BUG_ON(!osb); 84 BUG_ON(!osb);
@@ -96,6 +119,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
96 goto bail; 119 goto bail;
97 } 120 }
98 121
122valid:
99 ret = 1; 123 ret = 1;
100 124
101bail: 125bail:
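
The new scheme stamps each negative dentry with the parent directory's lock generation at attach time and, at revalidate, trusts the negative entry only while that generation is unchanged, i.e. while the directory's cluster lock has not cycled underneath it. A compilable model of the check (simplified types, not the VFS API):

#include <stdio.h>

struct dir    { unsigned long lock_gen; };
struct dentry { struct dir *parent; unsigned long gen; };

/* like ocfs2_dentry_attach_gen() */
static void attach_gen(struct dentry *d)
{
	d->gen = d->parent->lock_gen;
}

/* 1 = negative entry still valid, 0 = force a new lookup */
static int revalidate_negative(struct dentry *d)
{
	return d->gen == d->parent->lock_gen;
}

int main(void)
{
	struct dir dir = { 1 };
	struct dentry d = { &dir, 0 };

	attach_gen(&d);
	printf("%d\n", revalidate_negative(&d));	/* 1: still valid */
	dir.lock_gen++;		/* the directory lock cycled */
	printf("%d\n", revalidate_negative(&d));	/* 0: re-lookup   */
	return 0;
}
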
@@ -151,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
151 struct list_head *p; 175 struct list_head *p;
152 struct dentry *dentry = NULL; 176 struct dentry *dentry = NULL;
153 177
154 spin_lock(&dcache_lock); 178 spin_lock(&inode->i_lock);
155
156 list_for_each(p, &inode->i_dentry) { 179 list_for_each(p, &inode->i_dentry) {
157 dentry = list_entry(p, struct dentry, d_alias); 180 dentry = list_entry(p, struct dentry, d_alias);
158 181
182 spin_lock(&dentry->d_lock);
159 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { 183 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
160 mlog(0, "dentry found: %.*s\n", 184 mlog(0, "dentry found: %.*s\n",
161 dentry->d_name.len, dentry->d_name.name); 185 dentry->d_name.len, dentry->d_name.name);
162 186
163 dget_locked(dentry); 187 dget_dlock(dentry);
188 spin_unlock(&dentry->d_lock);
164 break; 189 break;
165 } 190 }
191 spin_unlock(&dentry->d_lock);
166 192
167 dentry = NULL; 193 dentry = NULL;
168 } 194 }
169 195
170 spin_unlock(&dcache_lock); 196 spin_unlock(&inode->i_lock);
171 197
172 return dentry; 198 return dentry;
173} 199}
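
This hunk tracks the VFS-wide removal of the global dcache_lock: the inode's i_dentry alias list is now walked under inode->i_lock, and each candidate is examined and referenced under its own d_lock (dget_dlock() being the variant of dget() for callers already holding d_lock). The resulting nesting, in outline:

	spin_lock(&inode->i_lock);		/* protects inode->i_dentry */
	/* for each alias: */
	spin_lock(&dentry->d_lock);		/* protects this dentry     */
	/* on a match: dget_dlock(dentry) before dropping d_lock */
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);
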
@@ -227,6 +253,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
227 if (!inode) 253 if (!inode)
228 return 0; 254 return 0;
229 255
256 if (!dentry->d_inode && dentry->d_fsdata) {
257 /* Converting a negative dentry to positive
258 Clear dentry->d_fsdata */
259 dentry->d_fsdata = dl = NULL;
260 }
261
230 if (dl) { 262 if (dl) {
231 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, 263 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
232 " \"%.*s\": old parent: %llu, new: %llu\n", 264 " \"%.*s\": old parent: %llu, new: %llu\n",
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
64 struct inode *old_dir, struct inode *new_dir); 64 struct inode *old_dir, struct inode *new_dir);
65 65
66extern spinlock_t dentry_attach_lock; 66extern spinlock_t dentry_attach_lock;
67void ocfs2_dentry_attach_gen(struct dentry *dentry);
67 68
68#endif /* OCFS2_DCACHE_H */ 69#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7ab..d417b3f9b0c7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2461 2461
2462 di->i_dx_root = cpu_to_le64(dr_blkno); 2462 di->i_dx_root = cpu_to_le64(dr_blkno);
2463 2463
2464 spin_lock(&OCFS2_I(dir)->ip_lock);
2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2465 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2466 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2467 spin_unlock(&OCFS2_I(dir)->ip_lock);
2466 2468
2467 ocfs2_journal_dirty(handle, di_bh); 2469 ocfs2_journal_dirty(handle, di_bh);
2468 2470
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4466 goto out_commit; 4468 goto out_commit;
4467 } 4469 }
4468 4470
4471 spin_lock(&OCFS2_I(dir)->ip_lock);
4469 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; 4472 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4470 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4473 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4474 spin_unlock(&OCFS2_I(dir)->ip_lock);
4471 di->i_dx_root = cpu_to_le64(0ULL); 4475 di->i_dx_root = cpu_to_le64(0ULL);
4472 4476
4473 ocfs2_journal_dirty(handle, di_bh); 4477 ocfs2_journal_dirty(handle, di_bh);
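
ip_dyn_features is a bitmask updated from several code paths, so both the set and the clear of OCFS2_INDEXED_DIR_FL now happen under ip_lock, making the read-modify-write of the in-memory flags and the copy-out to the on-disk inode a single atomic step:

	spin_lock(&OCFS2_I(dir)->ip_lock);
	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;	/* or &= ~ */
	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
	spin_unlock(&OCFS2_I(dir)->ip_lock);
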
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f44999156839..3a3ed4bb794b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
90 90
91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 struct dlm_lock_resource *res;
94 94
95 BUG_ON(!dlm); 95 BUG_ON(!dlm);
96 BUG_ON(!lock); 96 BUG_ON(!lock);
97 97
98 res = lock->lockres;
99
98 assert_spin_locked(&dlm->ast_lock); 100 assert_spin_locked(&dlm->ast_lock);
101
99 if (!list_empty(&lock->ast_list)) { 102 if (!list_empty(&lock->ast_list)) {
100 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", 103 mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
104 "AST list not empty, pending %d, newlevel %d\n",
105 dlm->name, res->lockname.len, res->lockname.name,
106 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
107 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
101 lock->ast_pending, lock->ml.type); 108 lock->ast_pending, lock->ml.type);
102 BUG(); 109 BUG();
103 } 110 }
104 if (lock->ast_pending) 111 if (lock->ast_pending)
105 mlog(0, "lock has an ast getting flushed right now\n"); 112 mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
113 dlm->name, res->lockname.len, res->lockname.name,
114 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
115 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
106 116
107 /* putting lock on list, add a ref */ 117 /* putting lock on list, add a ref */
108 dlm_lock_get(lock); 118 dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
110 120
111 /* check to see if this ast obsoletes the bast */ 121 /* check to see if this ast obsoletes the bast */
112 if (dlm_should_cancel_bast(dlm, lock)) { 122 if (dlm_should_cancel_bast(dlm, lock)) {
113 struct dlm_lock_resource *res = lock->lockres; 123 mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
114 mlog(0, "%s: cancelling bast for %.*s\n", 124 dlm->name, res->lockname.len, res->lockname.name,
115 dlm->name, res->lockname.len, res->lockname.name); 125 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
126 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
116 lock->bast_pending = 0; 127 lock->bast_pending = 0;
117 list_del_init(&lock->bast_list); 128 list_del_init(&lock->bast_list);
118 lock->ml.highest_blocked = LKM_IVMODE; 129 lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
134 145
135void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 146void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
136{ 147{
137 mlog_entry_void();
138
139 BUG_ON(!dlm); 148 BUG_ON(!dlm);
140 BUG_ON(!lock); 149 BUG_ON(!lock);
141 150
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
147 156
148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 157void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 158{
150 mlog_entry_void(); 159 struct dlm_lock_resource *res;
151 160
152 BUG_ON(!dlm); 161 BUG_ON(!dlm);
153 BUG_ON(!lock); 162 BUG_ON(!lock);
163
154 assert_spin_locked(&dlm->ast_lock); 164 assert_spin_locked(&dlm->ast_lock);
155 165
166 res = lock->lockres;
167
156 BUG_ON(!list_empty(&lock->bast_list)); 168 BUG_ON(!list_empty(&lock->bast_list));
157 if (lock->bast_pending) 169 if (lock->bast_pending)
158 mlog(0, "lock has a bast getting flushed right now\n"); 170 mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
171 dlm->name, res->lockname.len, res->lockname.name,
172 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
173 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
159 174
160 /* putting lock on list, add a ref */ 175 /* putting lock on list, add a ref */
161 dlm_lock_get(lock); 176 dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
167 182
168void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 183void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
169{ 184{
170 mlog_entry_void();
171
172 BUG_ON(!dlm); 185 BUG_ON(!dlm);
173 BUG_ON(!lock); 186 BUG_ON(!lock);
174 187
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
213 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
214 struct dlm_lockstatus *lksb; 227 struct dlm_lockstatus *lksb;
215 228
216 mlog_entry_void(); 229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
217 233
218 lksb = lock->lksb; 234 lksb = lock->lksb;
219 fn = lock->ast; 235 fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lockstatus *lksb; 247 struct dlm_lockstatus *lksb;
232 int lksbflags; 248 int lksbflags;
233 249
234 mlog_entry_void(); 250 mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
251 res->lockname.len, res->lockname.name,
252 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
253 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
235 254
236 lksb = lock->lksb; 255 lksb = lock->lksb;
237 BUG_ON(lock->ml.node == dlm->node_num); 256 BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
250{ 269{
251 dlm_bastlockfunc_t *fn = lock->bast; 270 dlm_bastlockfunc_t *fn = lock->bast;
252 271
253 mlog_entry_void();
254 BUG_ON(lock->ml.node != dlm->node_num); 272 BUG_ON(lock->ml.node != dlm->node_num);
255 273
274 mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
275 dlm->name, res->lockname.len, res->lockname.name,
276 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
277 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
278 blocked_type);
279
256 (*fn)(lock->astdata, blocked_type); 280 (*fn)(lock->astdata, blocked_type);
257} 281}
258 282
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
332 /* cannot get a proxy ast message if this node owns it */ 356 /* cannot get a proxy ast message if this node owns it */
333 BUG_ON(res->owner == dlm->node_num); 357 BUG_ON(res->owner == dlm->node_num);
334 358
335 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); 359 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
360 res->lockname.name);
336 361
337 spin_lock(&res->spinlock); 362 spin_lock(&res->spinlock);
338 if (res->state & DLM_LOCK_RES_RECOVERING) { 363 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
382 if (past->type == DLM_AST) { 407 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 408 /* do not alter lock refcount. switching lists. */
384 list_move_tail(&lock->list, &res->granted); 409 list_move_tail(&lock->list, &res->granted);
385 mlog(0, "ast: Adding to granted list... type=%d, " 410 mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
386 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 411 dlm->name, res->lockname.len, res->lockname.name,
412 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
413 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
414 lock->ml.type, lock->ml.convert_type);
415
387 if (lock->ml.convert_type != LKM_IVMODE) { 416 if (lock->ml.convert_type != LKM_IVMODE) {
388 lock->ml.type = lock->ml.convert_type; 417 lock->ml.type = lock->ml.convert_type;
389 lock->ml.convert_type = LKM_IVMODE; 418 lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
426 size_t veclen = 1; 455 size_t veclen = 1;
427 int status; 456 int status;
428 457
429 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", 458 mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
430 res->lockname.len, res->lockname.name, lock->ml.node, 459 res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
431 msg_type, blocked_type); 460 blocked_type);
432 461
433 memset(&past, 0, sizeof(struct dlm_proxy_ast)); 462 memset(&past, 0, sizeof(struct dlm_proxy_ast));
434 past.node_idx = dlm->node_num; 463 past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
441 vec[0].iov_len = sizeof(struct dlm_proxy_ast); 470 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
442 vec[0].iov_base = &past; 471 vec[0].iov_base = &past;
443 if (flags & DLM_LKSB_GET_LVB) { 472 if (flags & DLM_LKSB_GET_LVB) {
444 mlog(0, "returning requested LVB data\n");
445 be32_add_cpu(&past.flags, LKM_GET_LVB); 473 be32_add_cpu(&past.flags, LKM_GET_LVB);
446 vec[1].iov_len = DLM_LVB_LEN; 474 vec[1].iov_len = DLM_LVB_LEN;
447 vec[1].iov_base = lock->lksb->lvb; 475 vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 479 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 480 lock->ml.node, &status);
453 if (ret < 0) 481 if (ret < 0)
454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 482 mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, 483 dlm->name, res->lockname.len, res->lockname.name, ret,
456 lock->ml.node); 484 lock->ml.node);
457 else { 485 else {
458 if (status == DLM_RECOVERING) { 486 if (status == DLM_RECOVERING) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 765298908f1d..4bdf7baee344 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK = 0,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER = 1,
55 DLM_MLE_MIGRATION, 55 DLM_MLE_MIGRATION = 2,
56 DLM_MLE_NUM_TYPES 56 DLM_MLE_NUM_TYPES = 3,
57}; 57};
58 58
59struct dlm_master_list_entry { 59struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
82 82
83enum dlm_ast_type { 83enum dlm_ast_type {
84 DLM_AST = 0, 84 DLM_AST = 0,
85 DLM_BAST, 85 DLM_BAST = 1,
86 DLM_ASTUNLOCK 86 DLM_ASTUNLOCK = 2,
87}; 87};
88 88
89 89
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
119 119
120enum dlm_ctxt_state { 120enum dlm_ctxt_state {
121 DLM_CTXT_NEW = 0, 121 DLM_CTXT_NEW = 0,
122 DLM_CTXT_JOINED, 122 DLM_CTXT_JOINED = 1,
123 DLM_CTXT_IN_SHUTDOWN, 123 DLM_CTXT_IN_SHUTDOWN = 2,
124 DLM_CTXT_LEAVING, 124 DLM_CTXT_LEAVING = 3,
125}; 125};
126 126
127struct dlm_ctxt 127struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
388 388
389enum dlm_lockres_list { 389enum dlm_lockres_list {
390 DLM_GRANTED_LIST = 0, 390 DLM_GRANTED_LIST = 0,
391 DLM_CONVERTING_LIST, 391 DLM_CONVERTING_LIST = 1,
392 DLM_BLOCKED_LIST 392 DLM_BLOCKED_LIST = 2,
393}; 393};
394 394
395static inline int dlm_lvb_is_empty(char *lvb) 395static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,25 +427,27 @@ struct dlm_node_iter
427 427
428 428
429enum { 429enum {
430 DLM_MASTER_REQUEST_MSG = 500, 430 DLM_MASTER_REQUEST_MSG = 500,
431 DLM_UNUSED_MSG1, /* 501 */ 431 DLM_UNUSED_MSG1 = 501,
432 DLM_ASSERT_MASTER_MSG, /* 502 */ 432 DLM_ASSERT_MASTER_MSG = 502,
433 DLM_CREATE_LOCK_MSG, /* 503 */ 433 DLM_CREATE_LOCK_MSG = 503,
434 DLM_CONVERT_LOCK_MSG, /* 504 */ 434 DLM_CONVERT_LOCK_MSG = 504,
435 DLM_PROXY_AST_MSG, /* 505 */ 435 DLM_PROXY_AST_MSG = 505,
436 DLM_UNLOCK_LOCK_MSG, /* 506 */ 436 DLM_UNLOCK_LOCK_MSG = 506,
437 DLM_DEREF_LOCKRES_MSG, /* 507 */ 437 DLM_DEREF_LOCKRES_MSG = 507,
438 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 438 DLM_MIGRATE_REQUEST_MSG = 508,
439 DLM_MIG_LOCKRES_MSG, /* 509 */ 439 DLM_MIG_LOCKRES_MSG = 509,
440 DLM_QUERY_JOIN_MSG, /* 510 */ 440 DLM_QUERY_JOIN_MSG = 510,
441 DLM_ASSERT_JOINED_MSG, /* 511 */ 441 DLM_ASSERT_JOINED_MSG = 511,
442 DLM_CANCEL_JOIN_MSG, /* 512 */ 442 DLM_CANCEL_JOIN_MSG = 512,
443 DLM_EXIT_DOMAIN_MSG, /* 513 */ 443 DLM_EXIT_DOMAIN_MSG = 513,
444 DLM_MASTER_REQUERY_MSG, /* 514 */ 444 DLM_MASTER_REQUERY_MSG = 514,
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG = 515,
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG = 516,
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG = 517,
448 DLM_FINALIZE_RECO_MSG /* 518 */ 448 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520,
449}; 451};
450 452
451struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -458,19 +460,19 @@ struct dlm_reco_node_data
458enum { 460enum {
459 DLM_RECO_NODE_DATA_DEAD = -1, 461 DLM_RECO_NODE_DATA_DEAD = -1,
460 DLM_RECO_NODE_DATA_INIT = 0, 462 DLM_RECO_NODE_DATA_INIT = 0,
461 DLM_RECO_NODE_DATA_REQUESTING, 463 DLM_RECO_NODE_DATA_REQUESTING = 1,
462 DLM_RECO_NODE_DATA_REQUESTED, 464 DLM_RECO_NODE_DATA_REQUESTED = 2,
463 DLM_RECO_NODE_DATA_RECEIVING, 465 DLM_RECO_NODE_DATA_RECEIVING = 3,
464 DLM_RECO_NODE_DATA_DONE, 466 DLM_RECO_NODE_DATA_DONE = 4,
465 DLM_RECO_NODE_DATA_FINALIZE_SENT, 467 DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
466}; 468};
467 469
468 470
469enum { 471enum {
470 DLM_MASTER_RESP_NO = 0, 472 DLM_MASTER_RESP_NO = 0,
471 DLM_MASTER_RESP_YES, 473 DLM_MASTER_RESP_YES = 1,
472 DLM_MASTER_RESP_MAYBE, 474 DLM_MASTER_RESP_MAYBE = 2,
473 DLM_MASTER_RESP_ERROR 475 DLM_MASTER_RESP_ERROR = 3,
474}; 476};
475 477
476 478
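The churn in these enums is deliberate: every enumerator that crosses the wire now carries an explicit value, so inserting a new constant can never silently renumber the ones after it. A short demonstration of the hazard with implicit numbering (illustrative enums, not the DLM's):

	#include <assert.h>

	enum v1 { MSG_A = 500, MSG_B, MSG_C };			/* B=501, C=502 */
	enum v2 { MSG_A2 = 500, MSG_NEW, MSG_B2, MSG_C2 };	/* B2 is now 502 */

	int main(void)
	{
		assert(MSG_B == 501 && MSG_B2 == 502);	/* same logical message,
							   different wire value */
		return 0;
	}

With explicit values, a mixed cluster of old and new nodes keeps agreeing on what each message number means.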
@@ -647,9 +649,9 @@ struct dlm_proxy_ast
647#define DLM_MOD_KEY (0x666c6172) 649#define DLM_MOD_KEY (0x666c6172)
648enum dlm_query_join_response_code { 650enum dlm_query_join_response_code {
649 JOIN_DISALLOW = 0, 651 JOIN_DISALLOW = 0,
650 JOIN_OK, 652 JOIN_OK = 1,
651 JOIN_OK_NO_MAP, 653 JOIN_OK_NO_MAP = 2,
652 JOIN_PROTOCOL_MISMATCH, 654 JOIN_PROTOCOL_MISMATCH = 3,
653}; 655};
654 656
655struct dlm_query_join_packet { 657struct dlm_query_join_packet {
@@ -727,6 +729,31 @@ struct dlm_cancel_join
727 u8 domain[O2NM_MAX_NAME_LEN]; 729 u8 domain[O2NM_MAX_NAME_LEN];
728}; 730};
729 731
732struct dlm_query_region {
733 u8 qr_node;
734 u8 qr_numregions;
735 u8 qr_namelen;
736 u8 pad1;
737 u8 qr_domain[O2NM_MAX_NAME_LEN];
738 u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
739};
740
741struct dlm_node_info {
742 u8 ni_nodenum;
743 u8 pad1;
744 u16 ni_ipv4_port;
745 u32 ni_ipv4_address;
746};
747
748struct dlm_query_nodeinfo {
749 u8 qn_nodenum;
750 u8 qn_numnodes;
751 u8 qn_namelen;
752 u8 pad1;
753 u8 qn_domain[O2NM_MAX_NAME_LEN];
754 struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
755};
756
730struct dlm_exit_domain 757struct dlm_exit_domain
731{ 758{
732 u8 node_idx; 759 u8 node_idx;
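Both new query structures are shipped verbatim as message payloads, so their byte layout, including the pad1 bytes that keep the following members naturally aligned, is effectively part of the protocol. A quick layout check with placeholder values for the cluster limits (the real O2NM_*/O2HB_* constants live in the o2nm and o2hb headers):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	#define O2NM_MAX_NAME_LEN	64	/* placeholder value */
	#define O2NM_MAX_NODES		255	/* placeholder value */

	struct dlm_node_info {
		uint8_t  ni_nodenum;
		uint8_t  pad1;
		uint16_t ni_ipv4_port;
		uint32_t ni_ipv4_address;
	};

	struct dlm_query_nodeinfo {
		uint8_t  qn_nodenum;
		uint8_t  qn_numnodes;
		uint8_t  qn_namelen;
		uint8_t  pad1;
		uint8_t  qn_domain[O2NM_MAX_NAME_LEN];
		struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
	};

	int main(void)
	{
		printf("dlm_node_info: %zu bytes, port at %zu, address at %zu\n",
		       sizeof(struct dlm_node_info),
		       offsetof(struct dlm_node_info, ni_ipv4_port),
		       offsetof(struct dlm_node_info, ni_ipv4_address));
		printf("dlm_query_nodeinfo: %zu bytes\n",
		       sizeof(struct dlm_query_nodeinfo));
		return 0;
	}

The port and address fields are carried in network byte order, which is why the handlers in dlmdomain.c print them through ntohs().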
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 901ca52bf86b..04a32be0aeb9 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
370 kref_get(&dc->debug_refcnt); 370 kref_get(&dc->debug_refcnt);
371} 371}
372 372
373static struct debug_buffer *debug_buffer_allocate(void) 373static int debug_release(struct inode *inode, struct file *file)
374{ 374{
375 struct debug_buffer *db = NULL; 375 free_page((unsigned long)file->private_data);
376 376 return 0;
377 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
378 if (!db)
379 goto bail;
380
381 db->len = PAGE_SIZE;
382 db->buf = kmalloc(db->len, GFP_KERNEL);
383 if (!db->buf)
384 goto bail;
385
386 return db;
387bail:
388 kfree(db);
389 return NULL;
390}
391
392static ssize_t debug_buffer_read(struct file *file, char __user *buf,
393 size_t nbytes, loff_t *ppos)
394{
395 struct debug_buffer *db = file->private_data;
396
397 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
398}
399
400static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
401{
402 struct debug_buffer *db = file->private_data;
403 loff_t new = -1;
404
405 switch (whence) {
406 case 0:
407 new = off;
408 break;
409 case 1:
410 new = file->f_pos + off;
411 break;
412 }
413
414 if (new < 0 || new > db->len)
415 return -EINVAL;
416
417 return (file->f_pos = new);
418} 377}
419 378
420static int debug_buffer_release(struct inode *inode, struct file *file) 379static ssize_t debug_read(struct file *file, char __user *buf,
380 size_t nbytes, loff_t *ppos)
421{ 381{
422 struct debug_buffer *db = file->private_data; 382 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
423 383 i_size_read(file->f_mapping->host));
424 if (db)
425 kfree(db->buf);
426 kfree(db);
427
428 return 0;
429} 384}
430/* end - util funcs */ 385/* end - util funcs */
431 386
432/* begin - purge list funcs */ 387/* begin - purge list funcs */
433static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 388static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
434{ 389{
435 struct dlm_lock_resource *res; 390 struct dlm_lock_resource *res;
436 int out = 0; 391 int out = 0;
437 unsigned long total = 0; 392 unsigned long total = 0;
438 393
439 out += snprintf(db->buf + out, db->len - out, 394 out += snprintf(buf + out, len - out,
440 "Dumping Purgelist for Domain: %s\n", dlm->name); 395 "Dumping Purgelist for Domain: %s\n", dlm->name);
441 396
442 spin_lock(&dlm->spinlock); 397 spin_lock(&dlm->spinlock);
443 list_for_each_entry(res, &dlm->purge_list, purge) { 398 list_for_each_entry(res, &dlm->purge_list, purge) {
444 ++total; 399 ++total;
445 if (db->len - out < 100) 400 if (len - out < 100)
446 continue; 401 continue;
447 spin_lock(&res->spinlock); 402 spin_lock(&res->spinlock);
448 out += stringify_lockname(res->lockname.name, 403 out += stringify_lockname(res->lockname.name,
449 res->lockname.len, 404 res->lockname.len,
450 db->buf + out, db->len - out); 405 buf + out, len - out);
451 out += snprintf(db->buf + out, db->len - out, "\t%ld\n", 406 out += snprintf(buf + out, len - out, "\t%ld\n",
452 (jiffies - res->last_used)/HZ); 407 (jiffies - res->last_used)/HZ);
453 spin_unlock(&res->spinlock); 408 spin_unlock(&res->spinlock);
454 } 409 }
455 spin_unlock(&dlm->spinlock); 410 spin_unlock(&dlm->spinlock);
456 411
457 out += snprintf(db->buf + out, db->len - out, 412 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
458 "Total on list: %ld\n", total);
459 413
460 return out; 414 return out;
461} 415}
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
463static int debug_purgelist_open(struct inode *inode, struct file *file) 417static int debug_purgelist_open(struct inode *inode, struct file *file)
464{ 418{
465 struct dlm_ctxt *dlm = inode->i_private; 419 struct dlm_ctxt *dlm = inode->i_private;
466 struct debug_buffer *db; 420 char *buf = NULL;
467 421
468 db = debug_buffer_allocate(); 422 buf = (char *) get_zeroed_page(GFP_NOFS);
469 if (!db) 423 if (!buf)
470 goto bail; 424 goto bail;
471 425
472 db->len = debug_purgelist_print(dlm, db); 426 i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
473 427
474 file->private_data = db; 428 file->private_data = buf;
475 429
476 return 0; 430 return 0;
477bail: 431bail:
@@ -480,22 +434,22 @@ bail:
480 434
481static const struct file_operations debug_purgelist_fops = { 435static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 436 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 437 .release = debug_release,
484 .read = debug_buffer_read, 438 .read = debug_read,
485 .llseek = debug_buffer_llseek, 439 .llseek = generic_file_llseek,
486}; 440};
487/* end - purge list funcs */ 441/* end - purge list funcs */
488 442
489/* begin - debug mle funcs */ 443/* begin - debug mle funcs */
490static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 444static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
491{ 445{
492 struct dlm_master_list_entry *mle; 446 struct dlm_master_list_entry *mle;
493 struct hlist_head *bucket; 447 struct hlist_head *bucket;
494 struct hlist_node *list; 448 struct hlist_node *list;
495 int i, out = 0; 449 int i, out = 0;
496 unsigned long total = 0, longest = 0, bktcnt; 450 unsigned long total = 0, longest = 0, bucket_count = 0;
497 451
498 out += snprintf(db->buf + out, db->len - out, 452 out += snprintf(buf + out, len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 453 "Dumping MLEs for Domain: %s\n", dlm->name);
500 454
501 spin_lock(&dlm->master_lock); 455 spin_lock(&dlm->master_lock);
@@ -505,17 +459,17 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
505 mle = hlist_entry(list, struct dlm_master_list_entry, 459 mle = hlist_entry(list, struct dlm_master_list_entry,
506 master_hash_node); 460 master_hash_node);
507 ++total; 461 ++total;
508 ++bktcnt; 462 ++bucket_count;
509 if (db->len - out < 200) 463 if (len - out < 200)
510 continue; 464 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 465 out += dump_mle(mle, buf + out, len - out);
512 } 466 }
513 longest = max(longest, bktcnt); 467 longest = max(longest, bucket_count);
514 bktcnt = 0; 468 bucket_count = 0;
515 } 469 }
516 spin_unlock(&dlm->master_lock); 470 spin_unlock(&dlm->master_lock);
517 471
518 out += snprintf(db->buf + out, db->len - out, 472 out += snprintf(buf + out, len - out,
519 "Total: %ld, Longest: %ld\n", total, longest); 473 "Total: %ld, Longest: %ld\n", total, longest);
520 return out; 474 return out;
521} 475}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
523static int debug_mle_open(struct inode *inode, struct file *file) 477static int debug_mle_open(struct inode *inode, struct file *file)
524{ 478{
525 struct dlm_ctxt *dlm = inode->i_private; 479 struct dlm_ctxt *dlm = inode->i_private;
526 struct debug_buffer *db; 480 char *buf = NULL;
527 481
528 db = debug_buffer_allocate(); 482 buf = (char *) get_zeroed_page(GFP_NOFS);
529 if (!db) 483 if (!buf)
530 goto bail; 484 goto bail;
531 485
532 db->len = debug_mle_print(dlm, db); 486 i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
533 487
534 file->private_data = db; 488 file->private_data = buf;
535 489
536 return 0; 490 return 0;
537bail: 491bail:
@@ -540,9 +494,9 @@ bail:
540 494
541static const struct file_operations debug_mle_fops = { 495static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 496 .open = debug_mle_open,
543 .release = debug_buffer_release, 497 .release = debug_release,
544 .read = debug_buffer_read, 498 .read = debug_read,
545 .llseek = debug_buffer_llseek, 499 .llseek = generic_file_llseek,
546}; 500};
547 501
548/* end - debug mle funcs */ 502/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
757/* end - debug lockres funcs */ 711/* end - debug lockres funcs */
758 712
759/* begin - debug state funcs */ 713/* begin - debug state funcs */
760static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 714static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
761{ 715{
762 int out = 0; 716 int out = 0;
763 struct dlm_reco_node_data *node; 717 struct dlm_reco_node_data *node;
@@ -781,33 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
781 } 735 }
782 736
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 737 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 738 out += snprintf(buf + out, len - out,
785 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); 739 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
740 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
741 dlm->dlm_locking_proto.pv_minor);
786 742
787 /* Thread Pid: xxx Node: xxx State: xxxxx */ 743 /* Thread Pid: xxx Node: xxx State: xxxxx */
788 out += snprintf(db->buf + out, db->len - out, 744 out += snprintf(buf + out, len - out,
789 "Thread Pid: %d Node: %d State: %s\n", 745 "Thread Pid: %d Node: %d State: %s\n",
790 dlm->dlm_thread_task->pid, dlm->node_num, state); 746 task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
791 747
792 /* Number of Joins: xxx Joining Node: xxx */ 748 /* Number of Joins: xxx Joining Node: xxx */
793 out += snprintf(db->buf + out, db->len - out, 749 out += snprintf(buf + out, len - out,
794 "Number of Joins: %d Joining Node: %d\n", 750 "Number of Joins: %d Joining Node: %d\n",
795 dlm->num_joins, dlm->joining_node); 751 dlm->num_joins, dlm->joining_node);
796 752
797 /* Domain Map: xx xx xx */ 753 /* Domain Map: xx xx xx */
798 out += snprintf(db->buf + out, db->len - out, "Domain Map: "); 754 out += snprintf(buf + out, len - out, "Domain Map: ");
799 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, 755 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
800 db->buf + out, db->len - out); 756 buf + out, len - out);
801 out += snprintf(db->buf + out, db->len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
802 758
803 /* Live Map: xx xx xx */ 759 /* Live Map: xx xx xx */
804 out += snprintf(db->buf + out, db->len - out, "Live Map: "); 760 out += snprintf(buf + out, len - out, "Live Map: ");
805 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
806 db->buf + out, db->len - out); 762 buf + out, len - out);
807 out += snprintf(db->buf + out, db->len - out, "\n"); 763 out += snprintf(buf + out, len - out, "\n");
808 764
809 /* Lock Resources: xxx (xxx) */ 765 /* Lock Resources: xxx (xxx) */
810 out += snprintf(db->buf + out, db->len - out, 766 out += snprintf(buf + out, len - out,
811 "Lock Resources: %d (%d)\n", 767 "Lock Resources: %d (%d)\n",
812 atomic_read(&dlm->res_cur_count), 768 atomic_read(&dlm->res_cur_count),
813 atomic_read(&dlm->res_tot_count)); 769 atomic_read(&dlm->res_tot_count));
@@ -819,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
819 cur_mles += atomic_read(&dlm->mle_cur_count[i]); 775 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
820 776
821 /* MLEs: xxx (xxx) */ 777 /* MLEs: xxx (xxx) */
822 out += snprintf(db->buf + out, db->len - out, 778 out += snprintf(buf + out, len - out,
823 "MLEs: %d (%d)\n", cur_mles, tot_mles); 779 "MLEs: %d (%d)\n", cur_mles, tot_mles);
824 780
825 /* Blocking: xxx (xxx) */ 781 /* Blocking: xxx (xxx) */
826 out += snprintf(db->buf + out, db->len - out, 782 out += snprintf(buf + out, len - out,
827 " Blocking: %d (%d)\n", 783 " Blocking: %d (%d)\n",
828 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), 784 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
829 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); 785 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
830 786
831 /* Mastery: xxx (xxx) */ 787 /* Mastery: xxx (xxx) */
832 out += snprintf(db->buf + out, db->len - out, 788 out += snprintf(buf + out, len - out,
833 " Mastery: %d (%d)\n", 789 " Mastery: %d (%d)\n",
834 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), 790 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
835 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); 791 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
836 792
837 /* Migration: xxx (xxx) */ 793 /* Migration: xxx (xxx) */
838 out += snprintf(db->buf + out, db->len - out, 794 out += snprintf(buf + out, len - out,
839 " Migration: %d (%d)\n", 795 " Migration: %d (%d)\n",
840 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 796 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
841 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); 797 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
842 798
843 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 799 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
844 out += snprintf(db->buf + out, db->len - out, 800 out += snprintf(buf + out, len - out,
845 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 801 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
846 "PendingBASTs=%s\n", 802 "PendingBASTs=%s\n",
847 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 803 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -850,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
850 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); 806 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
851 807
852 /* Purge Count: xxx Refs: xxx */ 808 /* Purge Count: xxx Refs: xxx */
853 out += snprintf(db->buf + out, db->len - out, 809 out += snprintf(buf + out, len - out,
854 "Purge Count: %d Refs: %d\n", dlm->purge_count, 810 "Purge Count: %d Refs: %d\n", dlm->purge_count,
855 atomic_read(&dlm->dlm_refs.refcount)); 811 atomic_read(&dlm->dlm_refs.refcount));
856 812
857 /* Dead Node: xxx */ 813 /* Dead Node: xxx */
858 out += snprintf(db->buf + out, db->len - out, 814 out += snprintf(buf + out, len - out,
859 "Dead Node: %d\n", dlm->reco.dead_node); 815 "Dead Node: %d\n", dlm->reco.dead_node);
860 816
861 /* What about DLM_RECO_STATE_FINALIZE? */ 817 /* What about DLM_RECO_STATE_FINALIZE? */
@@ -865,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
865 state = "INACTIVE"; 821 state = "INACTIVE";
866 822
867 /* Recovery Pid: xxxx Master: xxx State: xxxx */ 823 /* Recovery Pid: xxxx Master: xxx State: xxxx */
868 out += snprintf(db->buf + out, db->len - out, 824 out += snprintf(buf + out, len - out,
869 "Recovery Pid: %d Master: %d State: %s\n", 825 "Recovery Pid: %d Master: %d State: %s\n",
870 dlm->dlm_reco_thread_task->pid, 826 task_pid_nr(dlm->dlm_reco_thread_task),
871 dlm->reco.new_master, state); 827 dlm->reco.new_master, state);
872 828
873 /* Recovery Map: xx xx */ 829 /* Recovery Map: xx xx */
874 out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); 830 out += snprintf(buf + out, len - out, "Recovery Map: ");
875 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, 831 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
876 db->buf + out, db->len - out); 832 buf + out, len - out);
877 out += snprintf(db->buf + out, db->len - out, "\n"); 833 out += snprintf(buf + out, len - out, "\n");
878 834
879 /* Recovery Node State: */ 835 /* Recovery Node State: */
880 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); 836 out += snprintf(buf + out, len - out, "Recovery Node State:\n");
881 list_for_each_entry(node, &dlm->reco.node_data, list) { 837 list_for_each_entry(node, &dlm->reco.node_data, list) {
882 switch (node->state) { 838 switch (node->state) {
883 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
@@ -905,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
905 state = "BAD"; 861 state = "BAD";
906 break; 862 break;
907 } 863 }
908 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", 864 out += snprintf(buf + out, len - out, "\t%u - %s\n",
909 node->node_num, state); 865 node->node_num, state);
910 } 866 }
911 867
@@ -917,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
917static int debug_state_open(struct inode *inode, struct file *file) 873static int debug_state_open(struct inode *inode, struct file *file)
918{ 874{
919 struct dlm_ctxt *dlm = inode->i_private; 875 struct dlm_ctxt *dlm = inode->i_private;
920 struct debug_buffer *db = NULL; 876 char *buf = NULL;
921 877
922 db = debug_buffer_allocate(); 878 buf = (char *) get_zeroed_page(GFP_NOFS);
923 if (!db) 879 if (!buf)
924 goto bail; 880 goto bail;
925 881
926 db->len = debug_state_print(dlm, db); 882 i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
927 883
928 file->private_data = db; 884 file->private_data = buf;
929 885
930 return 0; 886 return 0;
931bail: 887bail:
@@ -934,9 +890,9 @@ bail:
934 890
935static const struct file_operations debug_state_fops = { 891static const struct file_operations debug_state_fops = {
936 .open = debug_state_open, 892 .open = debug_state_open,
937 .release = debug_buffer_release, 893 .release = debug_release,
938 .read = debug_buffer_read, 894 .read = debug_read,
939 .llseek = debug_buffer_llseek, 895 .llseek = generic_file_llseek,
940}; 896};
941/* end - debug state funcs */ 897/* end - debug state funcs */
942 898
@@ -1000,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
1000 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 956 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
1001 957
1002 if (dc) { 958 if (dc) {
1003 if (dc->debug_purgelist_dentry) 959 debugfs_remove(dc->debug_purgelist_dentry);
1004 debugfs_remove(dc->debug_purgelist_dentry); 960 debugfs_remove(dc->debug_mle_dentry);
1005 if (dc->debug_mle_dentry) 961 debugfs_remove(dc->debug_lockres_dentry);
1006 debugfs_remove(dc->debug_mle_dentry); 962 debugfs_remove(dc->debug_state_dentry);
1007 if (dc->debug_lockres_dentry)
1008 debugfs_remove(dc->debug_lockres_dentry);
1009 if (dc->debug_state_dentry)
1010 debugfs_remove(dc->debug_state_dentry);
1011 dlm_debug_put(dc); 963 dlm_debug_put(dc);
1012 } 964 }
1013} 965}
@@ -1038,8 +990,7 @@ bail:
1038 990
1039void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 991void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1040{ 992{
1041 if (dlm->dlm_debugfs_subroot) 993 debugfs_remove(dlm->dlm_debugfs_subroot);
1042 debugfs_remove(dlm->dlm_debugfs_subroot);
1043} 994}
1044 995
1045/* debugfs root */ 996/* debugfs root */
@@ -1055,7 +1006,6 @@ int dlm_create_debugfs_root(void)
1055 1006
1056void dlm_destroy_debugfs_root(void) 1007void dlm_destroy_debugfs_root(void)
1057{ 1008{
1058 if (dlm_debugfs_root) 1009 debugfs_remove(dlm_debugfs_root);
1059 debugfs_remove(dlm_debugfs_root);
1060} 1010}
1061#endif /* CONFIG_DEBUG_FS */ 1011#endif /* CONFIG_DEBUG_FS */
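The net effect of this refactor is a simpler buffering scheme: open() fills one zeroed page and records the number of valid bytes in the inode size, read() becomes a simple_read_from_buffer() bounded by i_size_read(), llseek falls back to generic_file_llseek() (which consults the same inode size), and release() just frees the page. A minimal sketch of that pattern for a hypothetical debugfs file, assuming a kernel-module context (the myfs_* names are invented):

	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/gfp.h>
	#include <linux/kernel.h>

	static int myfs_debug_open(struct inode *inode, struct file *file)
	{
		char *buf = (char *)get_zeroed_page(GFP_NOFS);

		if (!buf)
			return -ENOMEM;
		/* Fill the page; the length written becomes the file size. */
		i_size_write(inode, scnprintf(buf, PAGE_SIZE - 1, "hello\n"));
		file->private_data = buf;
		return 0;
	}

	static ssize_t myfs_debug_read(struct file *file, char __user *ubuf,
				       size_t nbytes, loff_t *ppos)
	{
		return simple_read_from_buffer(ubuf, nbytes, ppos,
					       file->private_data,
					       i_size_read(file->f_mapping->host));
	}

	static int myfs_debug_release(struct inode *inode, struct file *file)
	{
		free_page((unsigned long)file->private_data);
		return 0;
	}

	static const struct file_operations myfs_debug_fops = {
		.open		= myfs_debug_open,
		.read		= myfs_debug_read,
		.release	= myfs_debug_release,
		.llseek		= generic_file_llseek,
	};

Storing the length in the inode rather than a private struct is what lets the stock llseek and a single page allocation replace debug_buffer entirely; output beyond one page is truncated, which the print helpers tolerate by checking the remaining length before each snprintf().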
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c7..1f27c4812d1a 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
37 struct dentry *debug_purgelist_dentry; 37 struct dentry *debug_purgelist_dentry;
38}; 38};
39 39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres { 40struct debug_lockres {
46 int dl_len; 41 int dl_len;
47 char *dl_buf; 42 char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 11a5c87fd7f7..7e38a072d720 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
128 * will have a negotiated version with the same major number and a minor 128 * will have a negotiated version with the same major number and a minor
129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should 129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
130 * be used to determine what a running domain is actually using. 130 * be used to determine what a running domain is actually using.
131 *
132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
131 */ 135 */
132static const struct dlm_protocol_version dlm_protocol = { 136static const struct dlm_protocol_version dlm_protocol = {
133 .pv_major = 1, 137 .pv_major = 1,
134 .pv_minor = 0, 138 .pv_minor = 1,
135}; 139};
136 140
137#define DLM_DOMAIN_BACKOFF_MS 200 141#define DLM_DOMAIN_BACKOFF_MS 200
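The comment spells out the negotiation rule this version bump relies on: a join only succeeds on an exact major match, and the domain then runs at the smaller of the two minors, so a 1.1 node can still join a 1.0 domain (it simply keeps the new messages to itself). A sketch of a compare routine implementing the described rule (dlm_protocol_compare() itself is defined elsewhere in this file; this is only the behaviour the comment promises):

	#include <stdint.h>

	struct dlm_protocol_version {
		uint8_t pv_major;
		uint8_t pv_minor;
	};

	/* Returns 0 and lowers *request to the negotiated minor,
	 * or nonzero on a major-number mismatch. */
	static int protocol_compare(struct dlm_protocol_version *existing,
				    struct dlm_protocol_version *request)
	{
		if (existing->pv_major != request->pv_major)
			return 1;
		if (request->pv_minor > existing->pv_minor)
			request->pv_minor = existing->pv_minor;
		return 0;
	}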
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
142 void **ret_data); 146 void **ret_data);
143static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 147static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
144 void **ret_data); 148 void **ret_data);
149static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
150 void *data, void **ret_data);
145static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 151static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
146 void **ret_data); 152 void **ret_data);
147static int dlm_protocol_compare(struct dlm_protocol_version *existing, 153static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -454,8 +460,6 @@ redo_bucket:
454 } 460 }
455 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
456 num += n; 462 num += n;
457 mlog(0, "%s: touched %d lockreses in bucket %d "
458 "(tot=%d)\n", dlm->name, n, i, num);
459 } 463 }
460 spin_unlock(&dlm->spinlock); 464 spin_unlock(&dlm->spinlock);
461 wake_up(&dlm->dlm_thread_wq); 465 wake_up(&dlm->dlm_thread_wq);
@@ -921,6 +925,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
921 return 0; 925 return 0;
922} 926}
923 927
928static int dlm_match_regions(struct dlm_ctxt *dlm,
929 struct dlm_query_region *qr)
930{
931 char *local = NULL, *remote = qr->qr_regions;
932 char *l, *r;
933 int localnr, i, j, foundit;
934 int status = 0;
935
936 if (!o2hb_global_heartbeat_active()) {
937 if (qr->qr_numregions) {
938 mlog(ML_ERROR, "Domain %s: Joining node %d has global "
939 "heartbeat enabled but local node %d does not\n",
940 qr->qr_domain, qr->qr_node, dlm->node_num);
941 status = -EINVAL;
942 }
943 goto bail;
944 }
945
946 if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
947 mlog(ML_ERROR, "Domain %s: Local node %d has global "
948 "heartbeat enabled but joining node %d does not\n",
949 qr->qr_domain, dlm->node_num, qr->qr_node);
950 status = -EINVAL;
951 goto bail;
952 }
953
954 r = remote;
955 for (i = 0; i < qr->qr_numregions; ++i) {
956 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
957 r += O2HB_MAX_REGION_NAME_LEN;
958 }
959
960 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
961 if (!local) {
962 status = -ENOMEM;
963 goto bail;
964 }
965
966 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
967
968 /* compare local regions with remote */
969 l = local;
970 for (i = 0; i < localnr; ++i) {
971 foundit = 0;
972 r = remote;
 973 for (j = 0; j < qr->qr_numregions; ++j) {
974 if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
975 foundit = 1;
976 break;
977 }
978 r += O2HB_MAX_REGION_NAME_LEN;
979 }
980 if (!foundit) {
981 status = -EINVAL;
982 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
983 "in local node %d but not in joining node %d\n",
984 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
985 dlm->node_num, qr->qr_node);
986 goto bail;
987 }
988 l += O2HB_MAX_REGION_NAME_LEN;
989 }
990
991 /* compare remote with local regions */
992 r = remote;
993 for (i = 0; i < qr->qr_numregions; ++i) {
994 foundit = 0;
995 l = local;
996 for (j = 0; j < localnr; ++j) {
997 if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
998 foundit = 1;
999 break;
1000 }
1001 l += O2HB_MAX_REGION_NAME_LEN;
1002 }
1003 if (!foundit) {
1004 status = -EINVAL;
1005 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
1006 "in joining node %d but not in local node %d\n",
1007 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
1008 qr->qr_node, dlm->node_num);
1009 goto bail;
1010 }
1011 r += O2HB_MAX_REGION_NAME_LEN;
1012 }
1013
1014bail:
1015 kfree(local);
1016
1017 return status;
1018}
1019
1020static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
1021{
1022 struct dlm_query_region *qr = NULL;
1023 int status, ret = 0, i;
1024 char *p;
1025
1026 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1027 goto bail;
1028
1029 qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
1030 if (!qr) {
1031 ret = -ENOMEM;
1032 mlog_errno(ret);
1033 goto bail;
1034 }
1035
1036 qr->qr_node = dlm->node_num;
1037 qr->qr_namelen = strlen(dlm->name);
1038 memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
1039 /* if local hb, the numregions will be zero */
1040 if (o2hb_global_heartbeat_active())
1041 qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
1042 O2NM_MAX_REGIONS);
1043
1044 p = qr->qr_regions;
1045 for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
1046 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
1047
1048 i = -1;
1049 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1050 i + 1)) < O2NM_MAX_NODES) {
1051 if (i == dlm->node_num)
1052 continue;
1053
1054 mlog(0, "Sending regions to node %d\n", i);
1055
1056 ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
1057 sizeof(struct dlm_query_region),
1058 i, &status);
1059 if (ret >= 0)
1060 ret = status;
1061 if (ret) {
1062 mlog(ML_ERROR, "Region mismatch %d, node %d\n",
1063 ret, i);
1064 break;
1065 }
1066 }
1067
1068bail:
1069 kfree(qr);
1070 return ret;
1071}
1072
1073static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1074 void *data, void **ret_data)
1075{
1076 struct dlm_query_region *qr;
1077 struct dlm_ctxt *dlm = NULL;
1078 int status = 0;
1079 int locked = 0;
1080
1081 qr = (struct dlm_query_region *) msg->buf;
1082
1083 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1084 qr->qr_domain);
1085
1086 status = -EINVAL;
1087
1088 spin_lock(&dlm_domain_lock);
1089 dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
1090 if (!dlm) {
1091 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1092 "before join domain\n", qr->qr_node, qr->qr_domain);
1093 goto bail;
1094 }
1095
1096 spin_lock(&dlm->spinlock);
1097 locked = 1;
1098 if (dlm->joining_node != qr->qr_node) {
1099 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1100 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1101 dlm->joining_node);
1102 goto bail;
1103 }
1104
1105 /* Support for global heartbeat was added in 1.1 */
1106 if (dlm->dlm_locking_proto.pv_major == 1 &&
1107 dlm->dlm_locking_proto.pv_minor == 0) {
1108 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1109 "but active dlm protocol is %d.%d\n", qr->qr_node,
1110 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1111 dlm->dlm_locking_proto.pv_minor);
1112 goto bail;
1113 }
1114
1115 status = dlm_match_regions(dlm, qr);
1116
1117bail:
1118 if (locked)
1119 spin_unlock(&dlm->spinlock);
1120 spin_unlock(&dlm_domain_lock);
1121
1122 return status;
1123}
1124
1125static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
1126{
1127 struct o2nm_node *local;
1128 struct dlm_node_info *remote;
1129 int i, j;
1130 int status = 0;
1131
1132 for (j = 0; j < qn->qn_numnodes; ++j)
1133 mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
1134 &(qn->qn_nodes[j].ni_ipv4_address),
1135 ntohs(qn->qn_nodes[j].ni_ipv4_port));
1136
1137 for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
1138 local = o2nm_get_node_by_num(i);
1139 remote = NULL;
1140 for (j = 0; j < qn->qn_numnodes; ++j) {
1141 if (qn->qn_nodes[j].ni_nodenum == i) {
1142 remote = &(qn->qn_nodes[j]);
1143 break;
1144 }
1145 }
1146
1147 if (!local && !remote)
1148 continue;
1149
1150 if ((local && !remote) || (!local && remote))
1151 status = -EINVAL;
1152
1153 if (!status &&
1154 ((remote->ni_nodenum != local->nd_num) ||
1155 (remote->ni_ipv4_port != local->nd_ipv4_port) ||
1156 (remote->ni_ipv4_address != local->nd_ipv4_address)))
1157 status = -EINVAL;
1158
1159 if (status) {
1160 if (remote && !local)
1161 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1162 "registered in joining node %d but not in "
1163 "local node %d\n", qn->qn_domain,
1164 remote->ni_nodenum,
1165 &(remote->ni_ipv4_address),
1166 ntohs(remote->ni_ipv4_port),
1167 qn->qn_nodenum, dlm->node_num);
1168 if (local && !remote)
1169 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1170 "registered in local node %d but not in "
1171 "joining node %d\n", qn->qn_domain,
1172 local->nd_num, &(local->nd_ipv4_address),
1173 ntohs(local->nd_ipv4_port),
1174 dlm->node_num, qn->qn_nodenum);
1175 BUG_ON((!local && !remote));
1176 }
1177
1178 if (local)
1179 o2nm_node_put(local);
1180 }
1181
1182 return status;
1183}
1184
1185static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
1186{
1187 struct dlm_query_nodeinfo *qn = NULL;
1188 struct o2nm_node *node;
1189 int ret = 0, status, count, i;
1190
1191 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1192 goto bail;
1193
1194 qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
1195 if (!qn) {
1196 ret = -ENOMEM;
1197 mlog_errno(ret);
1198 goto bail;
1199 }
1200
1201 for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
1202 node = o2nm_get_node_by_num(i);
1203 if (!node)
1204 continue;
1205 qn->qn_nodes[count].ni_nodenum = node->nd_num;
1206 qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
1207 qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
1208 mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
1209 &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
1210 ++count;
1211 o2nm_node_put(node);
1212 }
1213
1214 qn->qn_nodenum = dlm->node_num;
1215 qn->qn_numnodes = count;
1216 qn->qn_namelen = strlen(dlm->name);
1217 memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
1218
1219 i = -1;
1220 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1221 i + 1)) < O2NM_MAX_NODES) {
1222 if (i == dlm->node_num)
1223 continue;
1224
1225 mlog(0, "Sending nodeinfo to node %d\n", i);
1226
1227 ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
1228 qn, sizeof(struct dlm_query_nodeinfo),
1229 i, &status);
1230 if (ret >= 0)
1231 ret = status;
1232 if (ret) {
1233 mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
1234 break;
1235 }
1236 }
1237
1238bail:
1239 kfree(qn);
1240 return ret;
1241}
1242
1243static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
1244 void *data, void **ret_data)
1245{
1246 struct dlm_query_nodeinfo *qn;
1247 struct dlm_ctxt *dlm = NULL;
1248 int locked = 0, status = -EINVAL;
1249
1250 qn = (struct dlm_query_nodeinfo *) msg->buf;
1251
1252 mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
1253 qn->qn_domain);
1254
1255 spin_lock(&dlm_domain_lock);
1256 dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
1257 if (!dlm) {
1258 mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
1259 "join domain\n", qn->qn_nodenum, qn->qn_domain);
1260 goto bail;
1261 }
1262
1263 spin_lock(&dlm->spinlock);
1264 locked = 1;
1265 if (dlm->joining_node != qn->qn_nodenum) {
1266 mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
1267 "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
1268 dlm->joining_node);
1269 goto bail;
1270 }
1271
1272 /* Support for node query was added in 1.1 */
1273 if (dlm->dlm_locking_proto.pv_major == 1 &&
1274 dlm->dlm_locking_proto.pv_minor == 0) {
1275 mlog(ML_ERROR, "Node %d queried nodes on domain %s "
1276 "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
1277 qn->qn_domain, dlm->dlm_locking_proto.pv_major,
1278 dlm->dlm_locking_proto.pv_minor);
1279 goto bail;
1280 }
1281
1282 status = dlm_match_nodes(dlm, qn);
1283
1284bail:
1285 if (locked)
1286 spin_unlock(&dlm->spinlock);
1287 spin_unlock(&dlm_domain_lock);
1288
1289 return status;
1290}
1291
924static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 1292static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
925 void **ret_data) 1293 void **ret_data)
926{ 1294{
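dlm_match_regions() establishes set equality between the two heartbeat region lists by scanning in both directions: every local region must appear in the joining node's list and vice versa, with each name occupying a fixed-width slot. A self-contained sketch of that symmetric containment check (the 32-byte slot width is illustrative; the DLM uses O2HB_MAX_REGION_NAME_LEN):

	#include <string.h>

	#define NAME_LEN 32	/* illustrative slot width */

	/* 1 if all 'an' names in a[] also occur among the 'bn' names in b[] */
	static int subset(const char *a, int an, const char *b, int bn)
	{
		int i, j;

		for (i = 0; i < an; i++) {
			for (j = 0; j < bn; j++)
				if (!memcmp(a + i * NAME_LEN,
					    b + j * NAME_LEN, NAME_LEN))
					break;
			if (j == bn)
				return 0;	/* a[i] has no match in b[] */
		}
		return 1;
	}

	static int regions_match(const char *local, int localnr,
				 const char *remote, int remotenr)
	{
		return subset(local, localnr, remote, remotenr) &&
		       subset(remote, remotenr, local, localnr);
	}

Two one-sided scans are what make the error messages precise: each direction knows exactly which node is missing which region.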
@@ -1241,6 +1609,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1241 set_bit(dlm->node_num, dlm->domain_map); 1609 set_bit(dlm->node_num, dlm->domain_map);
1242 spin_unlock(&dlm->spinlock); 1610 spin_unlock(&dlm->spinlock);
1243 1611
1612 /* Support for global heartbeat and node info was added in 1.1 */
1613 if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
1614 status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
1615 if (status) {
1616 mlog_errno(status);
1617 goto bail;
1618 }
1619 status = dlm_send_regions(dlm, ctxt->yes_resp_map);
1620 if (status) {
1621 mlog_errno(status);
1622 goto bail;
1623 }
1624 }
1625
1244 dlm_send_join_asserts(dlm, ctxt->yes_resp_map); 1626 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
1245 1627
1246 /* Joined state *must* be set before the joining node 1628 /* Joined state *must* be set before the joining node
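Note the asymmetry in how the two ends gate the new traffic: the joining node decides whether to send based on its compiled-in maximum (the static dlm_protocol), while the handlers reject queries based on what the running domain actually negotiated (dlm->dlm_locking_proto). A tiny predicate capturing the "protocol is at least 1.1" test that both checks encode (illustrative helper, not a function in this file):

	#include <stdint.h>

	struct dlm_protocol_version {
		uint8_t pv_major;
		uint8_t pv_minor;
	};

	/* 1 if the protocol supports DLM_QUERY_REGION/DLM_QUERY_NODEINFO,
	 * both of which were introduced in 1.1. */
	static int proto_has_query_msgs(const struct dlm_protocol_version *v)
	{
		return v->pv_major > 1 ||
		       (v->pv_major == 1 && v->pv_minor > 0);
	}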
@@ -1277,8 +1659,8 @@ bail:
1277 1659
1278static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1660static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1279{ 1661{
1280 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); 1662 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
1281 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); 1663 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
1282 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1664 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1283} 1665}
1284 1666
@@ -1290,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1290 1672
1291 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1673 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1292 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1674 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1293 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); 1675 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1294 if (status) 1676 if (status)
1295 goto bail; 1677 goto bail;
1296 1678
1297 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1679 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1298 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1680 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1299 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); 1681 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1300 if (status) 1682 if (status)
1301 goto bail; 1683 goto bail;
1302 1684
@@ -1807,7 +2189,21 @@ static int dlm_register_net_handlers(void)
1807 sizeof(struct dlm_cancel_join), 2189 sizeof(struct dlm_cancel_join),
1808 dlm_cancel_join_handler, 2190 dlm_cancel_join_handler,
1809 NULL, NULL, &dlm_join_handlers); 2191 NULL, NULL, &dlm_join_handlers);
2192 if (status)
2193 goto bail;
2194
2195 status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
2196 sizeof(struct dlm_query_region),
2197 dlm_query_region_handler,
2198 NULL, NULL, &dlm_join_handlers);
1810 2199
2200 if (status)
2201 goto bail;
2202
2203 status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
2204 sizeof(struct dlm_query_nodeinfo),
2205 dlm_query_nodeinfo_handler,
2206 NULL, NULL, &dlm_join_handlers);
1811bail: 2207bail:
1812 if (status < 0) 2208 if (status < 0)
1813 dlm_unregister_net_handlers(); 2209 dlm_unregister_net_handlers();
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c4..7009292aac5a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
106 106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 108 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type,
110 lock->ml.type))
111 return 0;
109 } 112 }
110 113
111 return 1; 114 return 1;
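The added test closes a hole in grant decisions: a lock already queued to convert must be judged by the mode it is converting to, not only the mode it currently holds, otherwise a new request could be granted that conflicts with the pending convert. A sketch of the idea using the usual no-lock/protected-read/exclusive compatibility matrix (the matrix is the standard DLM one; the helper and the -1 "not converting" convention are illustrative):

	/* NL is compatible with everything, PR with NL and PR,
	 * EX only with NL. */
	enum { NL, PR, EX };

	static const int compat[3][3] = {
		/*        NL PR EX */
		/* NL */ { 1, 1, 1 },
		/* PR */ { 1, 1, 0 },
		/* EX */ { 1, 0, 0 },
	};

	/* A new request must clear both the held mode and, if the
	 * holder is converting, the mode it is converting to. */
	static int can_grant(int held, int convert_to, int requested)
	{
		if (!compat[held][requested])
			return 0;
		if (convert_to >= 0 && !compat[convert_to][requested])
			return 0;
		return 1;
	}

For example, with a PR lock queued to convert to EX, can_grant(PR, EX, PR) now refuses a new PR request that the held-mode check alone would have let through.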
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80d..59f0f6bdfc62 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2346 */ 2346 */
2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2348 struct dlm_lock_resource *res, 2348 struct dlm_lock_resource *res,
2349 int *numlocks) 2349 int *numlocks,
2350 int *hasrefs)
2350{ 2351{
2351 int ret; 2352 int ret;
2352 int i; 2353 int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2356 2357
2357 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2358 2359
2360 *numlocks = 0;
2361 *hasrefs = 0;
2362
2359 ret = -EINVAL; 2363 ret = -EINVAL;
2360 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 2364 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2361 mlog(0, "cannot migrate lockres with unknown owner!\n"); 2365 mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2386 } 2390 }
2387 2391
2388 *numlocks = count; 2392 *numlocks = count;
2389 mlog(0, "migrateable lockres having %d locks\n", *numlocks); 2393
2394 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2395 if (count < O2NM_MAX_NODES)
2396 *hasrefs = 1;
2397
2398 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
2399 res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
2390 2400
2391leave: 2401leave:
2392 return ret; 2402 return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2408 const char *name; 2418 const char *name;
2409 unsigned int namelen; 2419 unsigned int namelen;
2410 int mle_added = 0; 2420 int mle_added = 0;
2411 int numlocks; 2421 int numlocks, hasrefs;
2412 int wake = 0; 2422 int wake = 0;
2413 2423
2414 if (!dlm_grab(dlm)) 2424 if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2417 name = res->lockname.name; 2427 name = res->lockname.name;
2418 namelen = res->lockname.len; 2428 namelen = res->lockname.len;
2419 2429
2420 mlog(0, "migrating %.*s to %u\n", namelen, name, target); 2430 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
2421 2431
2422 /* 2432 /*
2423 * ensure this lockres is a proper candidate for migration 2433 * ensure this lockres is a proper candidate for migration
2424 */ 2434 */
2425 spin_lock(&res->spinlock); 2435 spin_lock(&res->spinlock);
2426 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2436 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2427 if (ret < 0) { 2437 if (ret < 0) {
2428 spin_unlock(&res->spinlock); 2438 spin_unlock(&res->spinlock);
2429 goto leave; 2439 goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2431 spin_unlock(&res->spinlock); 2441 spin_unlock(&res->spinlock);
2432 2442
2433 /* no work to do */ 2443 /* no work to do */
2434 if (numlocks == 0) { 2444 if (numlocks == 0 && !hasrefs)
2435 mlog(0, "no locks were found on this lockres! done!\n");
2436 goto leave; 2445 goto leave;
2437 }
2438 2446
2439 /* 2447 /*
2440 * preallocate up front 2448 * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2459 * find a node to migrate the lockres to 2467 * find a node to migrate the lockres to
2460 */ 2468 */
2461 2469
2462 mlog(0, "picking a migration node\n");
2463 spin_lock(&dlm->spinlock); 2470 spin_lock(&dlm->spinlock);
2464 /* pick a new node */ 2471 /* pick a new node */
2465 if (!test_bit(target, dlm->domain_map) || 2472 if (!test_bit(target, dlm->domain_map) ||
2466 target >= O2NM_MAX_NODES) { 2473 target >= O2NM_MAX_NODES) {
2467 target = dlm_pick_migration_target(dlm, res); 2474 target = dlm_pick_migration_target(dlm, res);
2468 } 2475 }
2469 mlog(0, "node %u chosen for migration\n", target); 2476 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2477 namelen, name, target);
2470 2478
2471 if (target >= O2NM_MAX_NODES || 2479 if (target >= O2NM_MAX_NODES ||
2472 !test_bit(target, dlm->domain_map)) { 2480 !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2667{ 2675{
2668 int ret; 2676 int ret;
2669 int lock_dropped = 0; 2677 int lock_dropped = 0;
2670 int numlocks; 2678 int numlocks, hasrefs;
2671 2679
2672 spin_lock(&res->spinlock); 2680 spin_lock(&res->spinlock);
2673 if (res->owner != dlm->node_num) { 2681 if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2681 } 2689 }
2682 2690
2683 /* No need to migrate a lockres having no locks */ 2691 /* No need to migrate a lockres having no locks */
2684 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2692 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2685 if (ret >= 0 && numlocks == 0) { 2693 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2686 spin_unlock(&res->spinlock); 2694 spin_unlock(&res->spinlock);
2687 goto leave; 2695 goto leave;
2688 } 2696 }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2915 } 2923 }
2916 queue++; 2924 queue++;
2917 } 2925 }
2926
2927 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2928 if (nodenum < O2NM_MAX_NODES) {
2929 spin_unlock(&res->spinlock);
2930 return nodenum;
2931 }
2918 spin_unlock(&res->spinlock); 2932 spin_unlock(&res->spinlock);
2919 mlog(0, "have not found a suitable target yet! checking domain map\n"); 2933 mlog(0, "have not found a suitable target yet! checking domain map\n");
2920 2934
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9b..1d6d1d22c471 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
123 struct dlm_lock_resource *res) 123 struct dlm_lock_resource *res)
124{ 124{
125 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
126
127 assert_spin_locked(&dlm->spinlock); 125 assert_spin_locked(&dlm->spinlock);
128 assert_spin_locked(&res->spinlock); 126 assert_spin_locked(&res->spinlock);
129 127
130 if (__dlm_lockres_unused(res)){ 128 if (__dlm_lockres_unused(res)){
131 if (list_empty(&res->purge)) { 129 if (list_empty(&res->purge)) {
132 mlog(0, "putting lockres %.*s:%p onto purge list\n", 130 mlog(0, "%s: Adding res %.*s to purge list\n",
133 res->lockname.len, res->lockname.name, res); 131 dlm->name, res->lockname.len, res->lockname.name);
134 132
135 res->last_used = jiffies; 133 res->last_used = jiffies;
136 dlm_lockres_get(res); 134 dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
138 dlm->purge_count++; 136 dlm->purge_count++;
139 } 137 }
140 } else if (!list_empty(&res->purge)) { 138 } else if (!list_empty(&res->purge)) {
141 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 139 mlog(0, "%s: Removing res %.*s from purge list\n",
142 res->lockname.len, res->lockname.name, res, res->owner); 140 dlm->name, res->lockname.len, res->lockname.name);
143 141
144 list_del_init(&res->purge); 142 list_del_init(&res->purge);
145 dlm_lockres_put(res); 143 dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
150void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 148void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
151 struct dlm_lock_resource *res) 149 struct dlm_lock_resource *res)
152{ 150{
153 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
154 spin_lock(&dlm->spinlock); 151 spin_lock(&dlm->spinlock);
155 spin_lock(&res->spinlock); 152 spin_lock(&res->spinlock);
156 153
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
171 168
172 master = (res->owner == dlm->node_num); 169 master = (res->owner == dlm->node_num);
173 170
174 171 mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 172 res->lockname.len, res->lockname.name, master);
176 res->lockname.name, master);
177 173
178 if (!master) { 174 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF; 175 res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
189 /* clear our bit from the master's refmap, ignore errors */ 185 /* clear our bit from the master's refmap, ignore errors */
190 ret = dlm_drop_lockres_ref(dlm, res); 186 ret = dlm_drop_lockres_ref(dlm, res);
191 if (ret < 0) { 187 if (ret < 0) {
192 mlog_errno(ret); 188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
193 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
194 BUG(); 191 BUG();
195 } 192 }
196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
197 dlm->name, res->lockname.len, res->lockname.name, ret);
198 spin_lock(&dlm->spinlock); 193 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock); 194 spin_lock(&res->spinlock);
200 } 195 }
201 196
202 if (!list_empty(&res->purge)) { 197 if (!list_empty(&res->purge)) {
203 mlog(0, "removing lockres %.*s:%p from purgelist, " 198 mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
204 "master = %d\n", res->lockname.len, res->lockname.name, 199 dlm->name, res->lockname.len, res->lockname.name, master);
205 res, master);
206 list_del_init(&res->purge); 200 list_del_init(&res->purge);
207 dlm_lockres_put(res); 201 dlm_lockres_put(res);
208 dlm->purge_count--; 202 dlm->purge_count--;
209 } 203 }
210 204
211 if (!__dlm_lockres_unused(res)) { 205 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", 206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name); 207 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res); 208 __dlm_print_one_lock_resource(res);
215 BUG(); 209 BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
266 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
267 if (!unused || 261 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or " 263 mlog(0, "%s: res %.*s is in use or being remastered, "
270 "being remastered, used %d, state %d\n", 264 "used %d, state %d\n", dlm->name,
271 dlm->name, lockres->lockname.len, 265 lockres->lockname.len, lockres->lockname.name,
272 lockres->lockname.name, !unused, lockres->state); 266 !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge); 267 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock); 268 spin_unlock(&lockres->spinlock);
275 continue; 269 continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
296 struct list_head *head; 290 struct list_head *head;
297 int can_grant = 1; 291 int can_grant = 1;
298 292
299 //mlog(0, "res->lockname.len=%d\n", res->lockname.len); 293 /*
300 //mlog(0, "res->lockname.name=%p\n", res->lockname.name); 294 * Because this function is called with the lockres
301 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
302 // res->lockname.name);
303
304 /* because this function is called with the lockres
305 * spinlock, and because we know that it is not migrating/ 295 * spinlock, and because we know that it is not migrating/
306 * recovering/in-progress, it is fine to reserve asts and 296 * recovering/in-progress, it is fine to reserve asts and
307 * basts right before queueing them all throughout */ 297 * basts right before queueing them all throughout
298 */
308 assert_spin_locked(&dlm->ast_lock); 299 assert_spin_locked(&dlm->ast_lock);
309 assert_spin_locked(&res->spinlock); 300 assert_spin_locked(&res->spinlock);
310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 301 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
314converting: 305converting:
315 if (list_empty(&res->converting)) 306 if (list_empty(&res->converting))
316 goto blocked; 307 goto blocked;
317 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, 308 mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
318 res->lockname.name); 309 res->lockname.len, res->lockname.name);
319 310
320 target = list_entry(res->converting.next, struct dlm_lock, list); 311 target = list_entry(res->converting.next, struct dlm_lock, list);
321 if (target->ml.convert_type == LKM_IVMODE) { 312 if (target->ml.convert_type == LKM_IVMODE) {
322 mlog(ML_ERROR, "%.*s: converting a lock with no " 313 mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
323 "convert_type!\n", res->lockname.len, res->lockname.name); 314 dlm->name, res->lockname.len, res->lockname.name);
324 BUG(); 315 BUG();
325 } 316 }
326 head = &res->granted; 317 head = &res->granted;
@@ -365,9 +356,12 @@ converting:
365 spin_lock(&target->spinlock); 356 spin_lock(&target->spinlock);
366 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 357 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
367 358
368 mlog(0, "calling ast for converting lock: %.*s, have: %d, " 359 mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
369 "granting: %d, node: %u\n", res->lockname.len, 360 "%d => %d, node %u\n", dlm->name, res->lockname.len,
370 res->lockname.name, target->ml.type, 361 res->lockname.name,
362 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
363 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
364 target->ml.type,
371 target->ml.convert_type, target->ml.node); 365 target->ml.convert_type, target->ml.node);
372 366
373 target->ml.type = target->ml.convert_type; 367 target->ml.type = target->ml.convert_type;
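The reworked messages identify every lock by its cookie split into a node and a sequence part, printed as %u:%llu. A small illustration of that split, assuming the usual o2dlm encoding of the owning node number in the top byte and a 56-bit sequence below it (dlmcommon.h holds the authoritative dlm_get_lock_cookie_node()/..._seq() definitions):

	#include <stdint.h>
	#include <stdio.h>

	static uint8_t cookie_node(uint64_t cookie)
	{
		return (uint8_t)(cookie >> 56);		/* assumed layout */
	}

	static uint64_t cookie_seq(uint64_t cookie)
	{
		return cookie & 0x00ffffffffffffffULL;	/* assumed layout */
	}

	int main(void)
	{
		uint64_t cookie = ((uint64_t)7 << 56) | 42;	/* node 7, seq 42 */

		printf("%u:%llu\n", (unsigned)cookie_node(cookie),
		       (unsigned long long)cookie_seq(cookie));
		return 0;
	}

The be64_to_cpu() in the mlogs is needed because lock->ml.cookie is kept in big-endian (wire) order.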
@@ -428,11 +422,14 @@ blocked:
428 spin_lock(&target->spinlock); 422 spin_lock(&target->spinlock);
429 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 423 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
430 424
431 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " 425 mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
432 "node: %u\n", res->lockname.len, res->lockname.name, 426 "node %u\n", dlm->name, res->lockname.len,
427 res->lockname.name,
428 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
429 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
433 target->ml.type, target->ml.node); 430 target->ml.type, target->ml.node);
434 431
435 // target->ml.type is already correct 432 /* target->ml.type is already correct */
436 list_move_tail(&target->list, &res->granted); 433 list_move_tail(&target->list, &res->granted);
437 434
438 BUG_ON(!target->lksb); 435 BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
 453/* must have NO locks when calling this with res != NULL */ 450/* must have NO locks when calling this with res != NULL */
454void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 451void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
455{ 452{
456 mlog_entry("dlm=%p, res=%p\n", dlm, res);
457 if (res) { 453 if (res) {
458 spin_lock(&dlm->spinlock); 454 spin_lock(&dlm->spinlock);
459 spin_lock(&res->spinlock); 455 spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
466 462
467void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 463void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
468{ 464{
469 mlog_entry("dlm=%p, res=%p\n", dlm, res);
470
471 assert_spin_locked(&dlm->spinlock); 465 assert_spin_locked(&dlm->spinlock);
472 assert_spin_locked(&res->spinlock); 466 assert_spin_locked(&res->spinlock);
473 467
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
484 res->state |= DLM_LOCK_RES_DIRTY; 478 res->state |= DLM_LOCK_RES_DIRTY;
485 } 479 }
486 } 480 }
481
482 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
483 res->lockname.name);
487} 484}
488 485
489 486
490/* Launch the NM thread for the mounted volume */ 487/* Launch the NM thread for the mounted volume */
491int dlm_launch_thread(struct dlm_ctxt *dlm) 488int dlm_launch_thread(struct dlm_ctxt *dlm)
492{ 489{
493 mlog(0, "starting dlm thread...\n"); 490 mlog(0, "Starting dlm_thread...\n");
494 491
495 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); 492 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
496 if (IS_ERR(dlm->dlm_thread_task)) { 493 if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
505void dlm_complete_thread(struct dlm_ctxt *dlm) 502void dlm_complete_thread(struct dlm_ctxt *dlm)
506{ 503{
507 if (dlm->dlm_thread_task) { 504 if (dlm->dlm_thread_task) {
508 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); 505 mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
509 kthread_stop(dlm->dlm_thread_task); 506 kthread_stop(dlm->dlm_thread_task);
510 dlm->dlm_thread_task = NULL; 507 dlm->dlm_thread_task = NULL;
511 } 508 }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
536 /* get an extra ref on lock */ 533 /* get an extra ref on lock */
537 dlm_lock_get(lock); 534 dlm_lock_get(lock);
538 res = lock->lockres; 535 res = lock->lockres;
539 mlog(0, "delivering an ast for this lockres\n"); 536 mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
537 "node %u\n", dlm->name, res->lockname.len,
538 res->lockname.name,
539 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
540 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
541 lock->ml.type, lock->ml.node);
540 542
541 BUG_ON(!lock->ast_pending); 543 BUG_ON(!lock->ast_pending);
542 544
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
557 /* possible that another ast was queued while 559 /* possible that another ast was queued while
558 * we were delivering the last one */ 560 * we were delivering the last one */
559 if (!list_empty(&lock->ast_list)) { 561 if (!list_empty(&lock->ast_list)) {
560 mlog(0, "aha another ast got queued while " 562 mlog(0, "%s: res %.*s, AST queued while flushing last "
561 "we were finishing the last one. will " 563 "one\n", dlm->name, res->lockname.len,
562 "keep the ast_pending flag set.\n"); 564 res->lockname.name);
563 } else 565 } else
564 lock->ast_pending = 0; 566 lock->ast_pending = 0;
565 567
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
590 dlm_lock_put(lock); 592 dlm_lock_put(lock);
591 spin_unlock(&dlm->ast_lock); 593 spin_unlock(&dlm->ast_lock);
592 594
593 mlog(0, "delivering a bast for this lockres " 595 mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
594 "(blocked = %d\n", hi); 596 "blocked %d, node %u\n",
597 dlm->name, res->lockname.len, res->lockname.name,
598 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
599 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
600 hi, lock->ml.node);
595 601
596 if (lock->ml.node != dlm->node_num) { 602 if (lock->ml.node != dlm->node_num) {
597 ret = dlm_send_proxy_bast(dlm, res, lock, hi); 603 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
605 /* possible that another bast was queued while 611 /* possible that another bast was queued while
606 * we were delivering the last one */ 612 * we were delivering the last one */
607 if (!list_empty(&lock->bast_list)) { 613 if (!list_empty(&lock->bast_list)) {
608 mlog(0, "aha another bast got queued while " 614 mlog(0, "%s: res %.*s, BAST queued while flushing last "
609 "we were finishing the last one. will " 615 "one\n", dlm->name, res->lockname.len,
610 "keep the bast_pending flag set.\n"); 616 res->lockname.name);
611 } else 617 } else
612 lock->bast_pending = 0; 618 lock->bast_pending = 0;
613 619
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
675 spin_lock(&res->spinlock); 681 spin_lock(&res->spinlock);
676 if (res->owner != dlm->node_num) { 682 if (res->owner != dlm->node_num) {
677 __dlm_print_one_lock_resource(res); 683 __dlm_print_one_lock_resource(res);
678 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", 684 mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
679 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", 685 " dirty %d\n", dlm->name,
680 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", 686 !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
681 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", 687 !!(res->state & DLM_LOCK_RES_MIGRATING),
682 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 688 !!(res->state & DLM_LOCK_RES_RECOVERING),
689 !!(res->state & DLM_LOCK_RES_DIRTY));
683 } 690 }
684 BUG_ON(res->owner != dlm->node_num); 691 BUG_ON(res->owner != dlm->node_num);
685 692
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
693 res->state &= ~DLM_LOCK_RES_DIRTY; 700 res->state &= ~DLM_LOCK_RES_DIRTY;
694 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock); 702 spin_unlock(&dlm->ast_lock);
696 mlog(0, "delaying list shuffling for in-" 703 mlog(0, "%s: res %.*s, inprogress, delay list "
697 "progress lockres %.*s, state=%d\n", 704 "shuffle, state %d\n", dlm->name,
698 res->lockname.len, res->lockname.name, 705 res->lockname.len, res->lockname.name,
699 res->state); 706 res->state);
700 delay = 1; 707 delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
706 * spinlock and do NOT have the dlm lock. 713 * spinlock and do NOT have the dlm lock.
707 * safe to reserve/queue asts and run the lists. */ 714 * safe to reserve/queue asts and run the lists. */
708 715
709 mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
710 "res=%.*s\n", dlm->name,
711 res->lockname.len, res->lockname.name);
712
713 /* called while holding lockres lock */ 716 /* called while holding lockres lock */
714 dlm_shuffle_lists(dlm, res); 717 dlm_shuffle_lists(dlm, res);
715 res->state &= ~DLM_LOCK_RES_DIRTY; 718 res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
733 /* unlikely, but we may need to give time to 736 /* unlikely, but we may need to give time to
734 * other tasks */ 737 * other tasks */
735 if (!--n) { 738 if (!--n) {
736 mlog(0, "throttling dlm_thread\n"); 739 mlog(0, "%s: Throttling dlm thread\n",
740 dlm->name);
737 break; 741 break;
738 } 742 }
739 } 743 }
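
The reworked dlmthread.c messages above identify every lock by decoding its 64-bit cookie with dlm_get_lock_cookie_node() and dlm_get_lock_cookie_seq(), after byte-swapping the wire value with be64_to_cpu(). A minimal sketch of the packing those helper names imply, with hypothetical sketch_* names since the real helpers live in dlmcommon.h: the owning node number is assumed to sit in the top 8 bits and a per-node sequence number in the low 56.

	#include <stdint.h>

	/* Assumed layout: high byte = node number, low 56 bits = sequence. */
	static inline uint8_t sketch_get_lock_cookie_node(uint64_t cookie)
	{
		return (uint8_t)(cookie >> 56);
	}

	static inline uint64_t sketch_get_lock_cookie_seq(uint64_t cookie)
	{
		return cookie & 0x00ffffffffffffffULL;
	}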
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..8c5c0eddc365 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
351 return &ip->ip_vfs_inode; 351 return &ip->ip_vfs_inode;
352} 352}
353 353
354static void dlmfs_destroy_inode(struct inode *inode) 354static void dlmfs_i_callback(struct rcu_head *head)
355{ 355{
356 struct inode *inode = container_of(head, struct inode, i_rcu);
357 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 358 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
357} 359}
358 360
361static void dlmfs_destroy_inode(struct inode *inode)
362{
363 call_rcu(&inode->i_rcu, dlmfs_i_callback);
364}
365
359static void dlmfs_evict_inode(struct inode *inode) 366static void dlmfs_evict_inode(struct inode *inode)
360{ 367{
361 int status; 368 int status;
@@ -400,6 +407,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400 if (inode) { 407 if (inode) {
401 ip = DLMFS_I(inode); 408 ip = DLMFS_I(inode);
402 409
410 inode->i_ino = get_next_ino();
403 inode->i_mode = mode; 411 inode->i_mode = mode;
404 inode->i_uid = current_fsuid(); 412 inode->i_uid = current_fsuid();
405 inode->i_gid = current_fsgid(); 413 inode->i_gid = current_fsgid();
@@ -425,6 +433,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
425 if (!inode) 433 if (!inode)
426 return NULL; 434 return NULL;
427 435
436 inode->i_ino = get_next_ino();
428 inode->i_mode = mode; 437 inode->i_mode = mode;
429 inode->i_uid = current_fsuid(); 438 inode->i_uid = current_fsuid();
430 inode->i_gid = current_fsgid(); 439 inode->i_gid = current_fsgid();
@@ -612,6 +621,7 @@ static const struct file_operations dlmfs_file_operations = {
612 .poll = dlmfs_file_poll, 621 .poll = dlmfs_file_poll,
613 .read = dlmfs_file_read, 622 .read = dlmfs_file_read,
614 .write = dlmfs_file_write, 623 .write = dlmfs_file_write,
624 .llseek = default_llseek,
615}; 625};
616 626
617static const struct inode_operations dlmfs_dir_inode_operations = { 627static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -640,16 +650,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
640 .setattr = dlmfs_file_setattr, 650 .setattr = dlmfs_file_setattr,
641}; 651};
642 652
643static int dlmfs_get_sb(struct file_system_type *fs_type, 653static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
644 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 654 int flags, const char *dev_name, void *data)
645{ 655{
646 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); 656 return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
647} 657}
648 658
649static struct file_system_type dlmfs_fs_type = { 659static struct file_system_type dlmfs_fs_type = {
650 .owner = THIS_MODULE, 660 .owner = THIS_MODULE,
651 .name = "ocfs2_dlmfs", 661 .name = "ocfs2_dlmfs",
652 .get_sb = dlmfs_get_sb, 662 .mount = dlmfs_mount,
653 .kill_sb = kill_litter_super, 663 .kill_sb = kill_litter_super,
654}; 664};
655 665
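
The dlmfs hunk above converts inode freeing to the RCU-deferred pattern so that lockless path walkers can still dereference an inode while it is being torn down. A minimal sketch of the same pattern, with hypothetical foo_* names standing in for dlmfs_inode_cache and DLMFS_I():

	static void foo_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		/* i_rcu shares storage with i_dentry, so reinitialize it */
		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(foo_inode_cache, FOO_I(inode));
	}

	static void foo_destroy_inode(struct inode *inode)
	{
		/* defer the real free until every RCU reader has finished */
		call_rcu(&inode->i_rcu, foo_i_callback);
	}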
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3635{ 3635{
3636 struct inode *inode; 3636 struct inode *inode;
3637 struct address_space *mapping; 3637 struct address_space *mapping;
3638 struct ocfs2_inode_info *oi;
3638 3639
3639 inode = ocfs2_lock_res_inode(lockres); 3640 inode = ocfs2_lock_res_inode(lockres);
3640 mapping = inode->i_mapping; 3641 mapping = inode->i_mapping;
3641 3642
3643 if (S_ISDIR(inode->i_mode)) {
3644 oi = OCFS2_I(inode);
3645 oi->ip_dir_lock_gen++;
3646 mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
3647 goto out;
3648 }
3649
3642 if (!S_ISREG(inode->i_mode)) 3650 if (!S_ISREG(inode->i_mode))
3643 goto out; 3651 goto out;
3644 3652
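
The dlmglue change above stops scrubbing a directory's page cache on downconvert and instead bumps ip_dir_lock_gen, letting readers detect staleness lazily. A sketch of the assumed consumer pattern (the real check sits in ocfs2's readdir/lookup paths, not shown in this diff):

	/* Caller snapshots the generation while holding the cluster lock. */
	static int dir_gen_still_valid(struct ocfs2_inode_info *oi, u32 saved_gen)
	{
		/* any downconvert since the snapshot bumped the counter */
		return oi->ip_dir_lock_gen == saved_gen;
	}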
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af3..5dbc3062b4fd 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
137 } 137 }
138 138
139 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
140 if (!IS_ERR(result)) 140 if (IS_ERR(result))
141 result->d_op = &ocfs2_dentry_ops;
142 else
143 mlog_errno(PTR_ERR(result)); 141 mlog_errno(PTR_ERR(result));
144 142
145bail: 143bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
175 } 173 }
176 174
177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 175 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
178 if (!IS_ERR(parent))
179 parent->d_op = &ocfs2_dentry_ops;
180 176
181bail_unlock: 177bail_unlock:
182 ocfs2_inode_unlock(dir, 0); 178 ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
67static int ocfs2_sync_inode(struct inode *inode)
68{
69 filemap_fdatawrite(inode->i_mapping);
70 return sync_mapping_buffers(inode->i_mapping);
71}
72
73static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
74{ 68{
75 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
180{ 174{
181 int err = 0; 175 int err = 0;
182 journal_t *journal; 176 journal_t *journal;
183 struct dentry *dentry = file->f_path.dentry;
184 struct inode *inode = file->f_mapping->host; 177 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 179
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
188 dentry->d_name.len, dentry->d_name.name); 181 file->f_path.dentry, file->f_path.dentry->d_name.len,
189 182 file->f_path.dentry->d_name.name);
190 err = ocfs2_sync_inode(dentry->d_inode);
191 if (err)
192 goto bail;
193 183
194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /* 185 /*
@@ -197,8 +187,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
197 * platter 187 * platter
198 */ 188 */
199 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
200 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
201 NULL, BLKDEV_IFL_WAIT);
202 goto bail; 191 goto bail;
203 } 192 }
204 193
@@ -370,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 359 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 360 goto out;
372 361
373 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 362 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
374 363
375out: 364out:
376 return status; 365 return status;
@@ -807,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
807 block_end = block_start + (1 << inode->i_blkbits); 796 block_end = block_start + (1 << inode->i_blkbits);
808 797
809 /* 798 /*
810 * block_start is block-aligned. Bump it by one to 799 * block_start is block-aligned. Bump it by one to force
811 * force ocfs2_{prepare,commit}_write() to zero the 800 * __block_write_begin and block_commit_write to zero the
812 * whole block. 801 * whole block.
813 */ 802 */
814 ret = ocfs2_prepare_write_nolock(inode, page, 803 ret = __block_write_begin(page, block_start + 1, 0,
815 block_start + 1, 804 ocfs2_get_block);
816 block_start + 1);
817 if (ret < 0) { 805 if (ret < 0) {
818 mlog_errno(ret); 806 mlog_errno(ret);
819 goto out_unlock; 807 goto out_unlock;
@@ -913,8 +901,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
913 zero_clusters = last_cpos - zero_cpos; 901 zero_clusters = last_cpos - zero_cpos;
914 902
915 if (needs_cow) { 903 if (needs_cow) {
916 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, 904 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
917 UINT_MAX); 905 zero_clusters, UINT_MAX);
918 if (rc) { 906 if (rc) {
919 mlog_errno(rc); 907 mlog_errno(rc);
920 goto out; 908 goto out;
@@ -1319,10 +1307,13 @@ bail:
1319 return err; 1307 return err;
1320} 1308}
1321 1309
1322int ocfs2_permission(struct inode *inode, int mask) 1310int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1323{ 1311{
1324 int ret; 1312 int ret;
1325 1313
1314 if (flags & IPERM_FLAG_RCU)
1315 return -ECHILD;
1316
1326 mlog_entry_void(); 1317 mlog_entry_void();
1327 1318
1328 ret = ocfs2_inode_lock(inode, NULL, 0); 1319 ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1332,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1332 goto out; 1323 goto out;
1333 } 1324 }
1334 1325
1335 ret = generic_permission(inode, mask, ocfs2_check_acl); 1326 ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
1336 1327
1337 ocfs2_inode_unlock(inode, 0); 1328 ocfs2_inode_unlock(inode, 0);
1338out: 1329out:
@@ -1998,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1998 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1999} 1990}
2000 1991
2001static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2002 loff_t len) 1993 loff_t len)
2003{ 1994{
1995 struct inode *inode = file->f_path.dentry->d_inode;
2004 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1996 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2005 struct ocfs2_space_resv sr; 1997 struct ocfs2_space_resv sr;
2006 int change_size = 1; 1998 int change_size = 1;
1999 int cmd = OCFS2_IOC_RESVSP64;
2007 2000
2001 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002 return -EOPNOTSUPP;
2008 if (!ocfs2_writes_unwritten_extents(osb)) 2003 if (!ocfs2_writes_unwritten_extents(osb))
2009 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
2010 2005
2011 if (S_ISDIR(inode->i_mode))
2012 return -ENODEV;
2013
2014 if (mode & FALLOC_FL_KEEP_SIZE) 2006 if (mode & FALLOC_FL_KEEP_SIZE)
2015 change_size = 0; 2007 change_size = 0;
2016 2008
2009 if (mode & FALLOC_FL_PUNCH_HOLE)
2010 cmd = OCFS2_IOC_UNRESVSP64;
2011
2017 sr.l_whence = 0; 2012 sr.l_whence = 0;
2018 sr.l_start = (s64)offset; 2013 sr.l_start = (s64)offset;
2019 sr.l_len = (s64)len; 2014 sr.l_len = (s64)len;
2020 2015
2021 return __ocfs2_change_file_space(NULL, inode, offset, 2016 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2022 OCFS2_IOC_RESVSP64, &sr, change_size); 2017 change_size);
2023} 2018}
2024 2019
2025int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2020int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2062,6 +2057,7 @@ out:
2062} 2057}
2063 2058
2064static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2059static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2060 struct file *file,
2065 loff_t pos, size_t count, 2061 loff_t pos, size_t count,
2066 int *meta_level) 2062 int *meta_level)
2067{ 2063{
@@ -2079,7 +2075,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2079 2075
2080 *meta_level = 1; 2076 *meta_level = 1;
2081 2077
2082 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2078 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2083 if (ret) 2079 if (ret)
2084 mlog_errno(ret); 2080 mlog_errno(ret);
2085out: 2081out:
@@ -2087,7 +2083,7 @@ out:
2087 return ret; 2083 return ret;
2088} 2084}
2089 2085
2090static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 2086static int ocfs2_prepare_inode_for_write(struct file *file,
2091 loff_t *ppos, 2087 loff_t *ppos,
2092 size_t count, 2088 size_t count,
2093 int appending, 2089 int appending,
@@ -2095,6 +2091,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2095 int *has_refcount) 2091 int *has_refcount)
2096{ 2092{
2097 int ret = 0, meta_level = 0; 2093 int ret = 0, meta_level = 0;
2094 struct dentry *dentry = file->f_path.dentry;
2098 struct inode *inode = dentry->d_inode; 2095 struct inode *inode = dentry->d_inode;
2099 loff_t saved_pos, end; 2096 loff_t saved_pos, end;
2100 2097
@@ -2150,6 +2147,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2150 meta_level = -1; 2147 meta_level = -1;
2151 2148
2152 ret = ocfs2_prepare_inode_for_refcount(inode, 2149 ret = ocfs2_prepare_inode_for_refcount(inode,
2150 file,
2153 saved_pos, 2151 saved_pos,
2154 count, 2152 count,
2155 &meta_level); 2153 &meta_level);
@@ -2232,6 +2230,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2232 struct file *file = iocb->ki_filp; 2230 struct file *file = iocb->ki_filp;
2233 struct inode *inode = file->f_path.dentry->d_inode; 2231 struct inode *inode = file->f_path.dentry->d_inode;
2234 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2233 int full_coherency = !(osb->s_mount_opt &
2234 OCFS2_MOUNT_COHERENCY_BUFFERED);
2235 2235
2236 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2236 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2237 (unsigned int)nr_segs, 2237 (unsigned int)nr_segs,
@@ -2248,23 +2248,50 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2248 2248
2249 mutex_lock(&inode->i_mutex); 2249 mutex_lock(&inode->i_mutex);
2250 2250
2251 ocfs2_iocb_clear_sem_locked(iocb);
2252
2251relock: 2253relock:
2252 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2254 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
2253 if (direct_io) { 2255 if (direct_io) {
2254 down_read(&inode->i_alloc_sem); 2256 down_read(&inode->i_alloc_sem);
2255 have_alloc_sem = 1; 2257 have_alloc_sem = 1;
2258 /* communicate with ocfs2_dio_end_io */
2259 ocfs2_iocb_set_sem_locked(iocb);
2256 } 2260 }
2257 2261
2258 /* concurrent O_DIRECT writes are allowed */ 2262 /*
2259 rw_level = !direct_io; 2263 * Concurrent O_DIRECT writes are allowed with
2264 * mount_option "coherency=buffered".
2265 */
2266 rw_level = (!direct_io || full_coherency);
2267
2260 ret = ocfs2_rw_lock(inode, rw_level); 2268 ret = ocfs2_rw_lock(inode, rw_level);
2261 if (ret < 0) { 2269 if (ret < 0) {
2262 mlog_errno(ret); 2270 mlog_errno(ret);
2263 goto out_sems; 2271 goto out_sems;
2264 } 2272 }
2265 2273
2274 /*
2275 * O_DIRECT writes with "coherency=full" need to take EX cluster
2276 * inode_lock to guarantee coherency.
2277 */
2278 if (direct_io && full_coherency) {
2279 /*
2280 * We need to take and drop the inode lock to force
2281 * other nodes to drop their caches. Buffered I/O
2282 * already does this in write_begin().
2283 */
2284 ret = ocfs2_inode_lock(inode, NULL, 1);
2285 if (ret < 0) {
2286 mlog_errno(ret);
2287 goto out_sems;
2288 }
2289
2290 ocfs2_inode_unlock(inode, 1);
2291 }
2292
2266 can_do_direct = direct_io; 2293 can_do_direct = direct_io;
2267 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2294 ret = ocfs2_prepare_inode_for_write(file, ppos,
2268 iocb->ki_left, appending, 2295 iocb->ki_left, appending,
2269 &can_do_direct, &has_refcount); 2296 &can_do_direct, &has_refcount);
2270 if (ret < 0) { 2297 if (ret < 0) {
@@ -2312,17 +2339,6 @@ relock:
2312 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2339 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2313 ppos, count, ocount); 2340 ppos, count, ocount);
2314 if (written < 0) { 2341 if (written < 0) {
2315 /*
2316 * direct write may have instantiated a few
2317 * blocks outside i_size. Trim these off again.
2318 * Don't need i_size_read because we hold i_mutex.
2319 *
2320 * XXX(truncate): this looks buggy because ocfs2 did not
2321 * actually implement ->truncate. Take a look at
2322 * the new truncate sequence and update this accordingly
2323 */
2324 if (*ppos + count > inode->i_size)
2325 truncate_setsize(inode, inode->i_size);
2326 ret = written; 2342 ret = written;
2327 goto out_dio; 2343 goto out_dio;
2328 } 2344 }
@@ -2377,8 +2393,10 @@ out:
2377 ocfs2_rw_unlock(inode, rw_level); 2393 ocfs2_rw_unlock(inode, rw_level);
2378 2394
2379out_sems: 2395out_sems:
2380 if (have_alloc_sem) 2396 if (have_alloc_sem) {
2381 up_read(&inode->i_alloc_sem); 2397 up_read(&inode->i_alloc_sem);
2398 ocfs2_iocb_clear_sem_locked(iocb);
2399 }
2382 2400
2383 mutex_unlock(&inode->i_mutex); 2401 mutex_unlock(&inode->i_mutex);
2384 2402
@@ -2394,7 +2412,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2394{ 2412{
2395 int ret; 2413 int ret;
2396 2414
2397 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2415 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2398 sd->total_len, 0, NULL, NULL); 2416 sd->total_len, 0, NULL, NULL);
2399 if (ret < 0) { 2417 if (ret < 0) {
2400 mlog_errno(ret); 2418 mlog_errno(ret);
@@ -2522,6 +2540,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2522 goto bail; 2540 goto bail;
2523 } 2541 }
2524 2542
2543 ocfs2_iocb_clear_sem_locked(iocb);
2544
2525 /* 2545 /*
2526 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2546 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2527 * need locks to protect pending reads from racing with truncate. 2547 * need locks to protect pending reads from racing with truncate.
@@ -2529,6 +2549,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2529 if (filp->f_flags & O_DIRECT) { 2549 if (filp->f_flags & O_DIRECT) {
2530 down_read(&inode->i_alloc_sem); 2550 down_read(&inode->i_alloc_sem);
2531 have_alloc_sem = 1; 2551 have_alloc_sem = 1;
2552 ocfs2_iocb_set_sem_locked(iocb);
2532 2553
2533 ret = ocfs2_rw_lock(inode, 0); 2554 ret = ocfs2_rw_lock(inode, 0);
2534 if (ret < 0) { 2555 if (ret < 0) {
@@ -2570,8 +2591,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2570 } 2591 }
2571 2592
2572bail: 2593bail:
2573 if (have_alloc_sem) 2594 if (have_alloc_sem) {
2574 up_read(&inode->i_alloc_sem); 2595 up_read(&inode->i_alloc_sem);
2596 ocfs2_iocb_clear_sem_locked(iocb);
2597 }
2575 if (rw_level != -1) 2598 if (rw_level != -1)
2576 ocfs2_rw_unlock(inode, rw_level); 2599 ocfs2_rw_unlock(inode, rw_level);
2577 mlog_exit(ret); 2600 mlog_exit(ret);
@@ -2587,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
2587 .getxattr = generic_getxattr, 2610 .getxattr = generic_getxattr,
2588 .listxattr = ocfs2_listxattr, 2611 .listxattr = ocfs2_listxattr,
2589 .removexattr = generic_removexattr, 2612 .removexattr = generic_removexattr,
2590 .fallocate = ocfs2_fallocate,
2591 .fiemap = ocfs2_fiemap, 2613 .fiemap = ocfs2_fiemap,
2592}; 2614};
2593 2615
@@ -2619,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
2619 .flock = ocfs2_flock, 2641 .flock = ocfs2_flock,
2620 .splice_read = ocfs2_file_splice_read, 2642 .splice_read = ocfs2_file_splice_read,
2621 .splice_write = ocfs2_file_splice_write, 2643 .splice_write = ocfs2_file_splice_write,
2644 .fallocate = ocfs2_fallocate,
2622}; 2645};
2623 2646
2624const struct file_operations ocfs2_dops = { 2647const struct file_operations ocfs2_dops = {
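
With ->fallocate moved from the inode operations to the file operations, the handler above maps FALLOC_FL_PUNCH_HOLE onto the existing OCFS2_IOC_UNRESVSP64 space-change path and rejects any other mode bits. A userspace sketch (hypothetical mount point and file name) exercising both accepted modes:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/ocfs2/testfile", O_RDWR | O_CREAT, 0644);
		if (fd < 0)
			return 1;

		/* preallocate 1 MiB without changing i_size (RESVSP64 path) */
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
			perror("preallocate");

		/* punch a 64 KiB hole (UNRESVSP64 path); later kernels require
		 * KEEP_SIZE alongside PUNCH_HOLE, so pass both */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, 64 << 10))
			perror("punch hole");

		close(fd);
		return 0;
	}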
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..f5afbbef6703 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 63 struct kstat *stat);
64int ocfs2_permission(struct inode *inode, int mask); 64int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
65 65
66int ocfs2_should_update_atime(struct inode *inode, 66int ocfs2_should_update_atime(struct inode *inode,
67 struct vfsmount *vfsmnt); 67 struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eece3e05d9d0..4068c6c4c6f6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
335 else 335 else
336 inode->i_fop = &ocfs2_dops_no_plocks; 336 inode->i_fop = &ocfs2_dops_no_plocks;
337 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
338 OCFS2_I(inode)->ip_dir_lock_gen = 1;
338 break; 339 break;
339 case S_IFLNK: 340 case S_IFLNK:
340 if (ocfs2_inode_is_fast_symlink(inode)) 341 if (ocfs2_inode_is_fast_symlink(inode))
@@ -433,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
433 * #1 and #2 can be simply solved by never taking the lock 434 * #1 and #2 can be simply solved by never taking the lock
434 * here for system files (which are the only type we read 435 * here for system files (which are the only type we read
435 * during mount). It's a heavier approach, but our main 436 * during mount). It's a heavier approach, but our main
436 * concern is user-accesible files anyway. 437 * concern is user-accessible files anyway.
437 * 438 *
438 * #3 works itself out because we'll eventually take the 439 * #3 works itself out because we'll eventually take the
439 * cluster lock before trusting anything anyway. 440 * cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
46 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 47 spinlock_t ip_lock;
48 u32 ip_open_count; 48 u32 ip_open_count;
49 u32 ip_clusters;
50 struct list_head ip_io_markers; 49 struct list_head ip_io_markers;
50 u32 ip_clusters;
51 51
52 u16 ip_dyn_features;
52 struct mutex ip_io_mutex; 53 struct mutex ip_io_mutex;
53
54 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
55 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
56 u16 ip_dyn_features;
57 56
58 /* protected by recovery_lock. */ 57 /* protected by recovery_lock. */
59 struct inode *ip_next_orphan; 58 struct inode *ip_next_orphan;
60 59
61 u32 ip_dir_start_lookup;
62
63 struct ocfs2_caching_info ip_metadata_cache; 60 struct ocfs2_caching_info ip_metadata_cache;
64
65 struct ocfs2_extent_map ip_extent_map; 61 struct ocfs2_extent_map ip_extent_map;
66
67 struct inode vfs_inode; 62 struct inode vfs_inode;
68 struct jbd2_inode ip_jinode; 63 struct jbd2_inode ip_jinode;
69 64
65 u32 ip_dir_start_lookup;
66
70 /* Only valid if the inode is the dir. */ 67 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 68 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 69 u64 ip_last_used_group;
70 u32 ip_dir_lock_gen;
73 71
74 struct ocfs2_alloc_reservation ip_la_data_resv; 72 struct ocfs2_alloc_reservation ip_la_data_resv;
75}; 73};
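
The inode.h hunk above is a pure reorder of ocfs2_inode_info: no field changes meaning, the smaller members are just regrouped so fewer alignment holes are left on 64-bit builds. A generic sketch of the effect, using kernel integer types:

	struct padded {			/* 24 bytes on x86_64 */
		u32 a;			/* 4 bytes + 4 bytes of padding */
		u64 b;			/* 8 bytes */
		u16 c;			/* 2 bytes + 6 bytes of tail padding */
	};

	struct packed_by_order {	/* 16 bytes on x86_64 */
		u64 b;			/* 8 bytes */
		u32 a;			/* 4 bytes */
		u16 c;			/* 2 bytes + 2 bytes of tail padding */
	};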
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
26 26
27#include <linux/ext2_fs.h> 27#include <linux/ext2_fs.h>
28 28
29#define o2info_from_user(a, b) \
30 copy_from_user(&(a), (b), sizeof(a))
31#define o2info_to_user(a, b) \
32 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
33
34/*
35 * This call is void because we are already reporting an error that may
36 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
37 * just a best-effort to tell userspace that this request caused the error.
38 */
39static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
40 struct ocfs2_info_request __user *req)
41{
42 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
43 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
44}
45
46#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48
29static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
30{ 50{
31 int status; 51 int status;
@@ -109,6 +129,328 @@ bail:
109 return status; 129 return status;
110} 130}
111 131
132int ocfs2_info_handle_blocksize(struct inode *inode,
133 struct ocfs2_info_request __user *req)
134{
135 int status = -EFAULT;
136 struct ocfs2_info_blocksize oib;
137
138 if (o2info_from_user(oib, req))
139 goto bail;
140
141 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
143
144 if (o2info_to_user(oib, req))
145 goto bail;
146
147 status = 0;
148bail:
149 if (status)
150 o2info_set_request_error(oib, req);
151
152 return status;
153}
154
155int ocfs2_info_handle_clustersize(struct inode *inode,
156 struct ocfs2_info_request __user *req)
157{
158 int status = -EFAULT;
159 struct ocfs2_info_clustersize oic;
160 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
161
162 if (o2info_from_user(oic, req))
163 goto bail;
164
165 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
167
168 if (o2info_to_user(oic, req))
169 goto bail;
170
171 status = 0;
172bail:
173 if (status)
174 o2info_set_request_error(oic, req);
175
176 return status;
177}
178
179int ocfs2_info_handle_maxslots(struct inode *inode,
180 struct ocfs2_info_request __user *req)
181{
182 int status = -EFAULT;
183 struct ocfs2_info_maxslots oim;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185
186 if (o2info_from_user(oim, req))
187 goto bail;
188
189 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
191
192 if (o2info_to_user(oim, req))
193 goto bail;
194
195 status = 0;
196bail:
197 if (status)
198 o2info_set_request_error(oim, req);
199
200 return status;
201}
202
203int ocfs2_info_handle_label(struct inode *inode,
204 struct ocfs2_info_request __user *req)
205{
206 int status = -EFAULT;
207 struct ocfs2_info_label oil;
208 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
209
210 if (o2info_from_user(oil, req))
211 goto bail;
212
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
215
216 if (o2info_to_user(oil, req))
217 goto bail;
218
219 status = 0;
220bail:
221 if (status)
222 o2info_set_request_error(oil, req);
223
224 return status;
225}
226
227int ocfs2_info_handle_uuid(struct inode *inode,
228 struct ocfs2_info_request __user *req)
229{
230 int status = -EFAULT;
231 struct ocfs2_info_uuid oiu;
232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233
234 if (o2info_from_user(oiu, req))
235 goto bail;
236
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
239
240 if (o2info_to_user(oiu, req))
241 goto bail;
242
243 status = 0;
244bail:
245 if (status)
246 o2info_set_request_error(oiu, req);
247
248 return status;
249}
250
251int ocfs2_info_handle_fs_features(struct inode *inode,
252 struct ocfs2_info_request __user *req)
253{
254 int status = -EFAULT;
255 struct ocfs2_info_fs_features oif;
256 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
257
258 if (o2info_from_user(oif, req))
259 goto bail;
260
261 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
265
266 if (o2info_to_user(oif, req))
267 goto bail;
268
269 status = 0;
270bail:
271 if (status)
272 o2info_set_request_error(oif, req);
273
274 return status;
275}
276
277int ocfs2_info_handle_journal_size(struct inode *inode,
278 struct ocfs2_info_request __user *req)
279{
280 int status = -EFAULT;
281 struct ocfs2_info_journal_size oij;
282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
283
284 if (o2info_from_user(oij, req))
285 goto bail;
286
287 oij.ij_journal_size = osb->journal->j_inode->i_size;
288
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
290
291 if (o2info_to_user(oij, req))
292 goto bail;
293
294 status = 0;
295bail:
296 if (status)
297 o2info_set_request_error(oij, req);
298
299 return status;
300}
301
302int ocfs2_info_handle_unknown(struct inode *inode,
303 struct ocfs2_info_request __user *req)
304{
305 int status = -EFAULT;
306 struct ocfs2_info_request oir;
307
308 if (o2info_from_user(oir, req))
309 goto bail;
310
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
312
313 if (o2info_to_user(oir, req))
314 goto bail;
315
316 status = 0;
317bail:
318 if (status)
319 o2info_set_request_error(oir, req);
320
321 return status;
322}
323
324/*
325 * Validate and distinguish OCFS2_IOC_INFO requests.
326 *
327 * - validate the magic number.
328 * - distinguish different requests.
329 * - validate size of different requests.
330 */
331int ocfs2_info_handle_request(struct inode *inode,
332 struct ocfs2_info_request __user *req)
333{
334 int status = -EFAULT;
335 struct ocfs2_info_request oir;
336
337 if (o2info_from_user(oir, req))
338 goto bail;
339
340 status = -EINVAL;
341 if (oir.ir_magic != OCFS2_INFO_MAGIC)
342 goto bail;
343
344 switch (oir.ir_code) {
345 case OCFS2_INFO_BLOCKSIZE:
346 if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
347 status = ocfs2_info_handle_blocksize(inode, req);
348 break;
349 case OCFS2_INFO_CLUSTERSIZE:
350 if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
351 status = ocfs2_info_handle_clustersize(inode, req);
352 break;
353 case OCFS2_INFO_MAXSLOTS:
354 if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
355 status = ocfs2_info_handle_maxslots(inode, req);
356 break;
357 case OCFS2_INFO_LABEL:
358 if (oir.ir_size == sizeof(struct ocfs2_info_label))
359 status = ocfs2_info_handle_label(inode, req);
360 break;
361 case OCFS2_INFO_UUID:
362 if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
363 status = ocfs2_info_handle_uuid(inode, req);
364 break;
365 case OCFS2_INFO_FS_FEATURES:
366 if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
367 status = ocfs2_info_handle_fs_features(inode, req);
368 break;
369 case OCFS2_INFO_JOURNAL_SIZE:
370 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
371 status = ocfs2_info_handle_journal_size(inode, req);
372 break;
373 default:
374 status = ocfs2_info_handle_unknown(inode, req);
375 break;
376 }
377
378bail:
379 return status;
380}
381
382int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
383 u64 *req_addr, int compat_flag)
384{
385 int status = -EFAULT;
386 u64 __user *bp = NULL;
387
388 if (compat_flag) {
389#ifdef CONFIG_COMPAT
390 /*
 391 * Pointer bp stores the base address of a pointer array, which
 392 * collects the addresses of the separate requests.
393 */
394 bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
395#else
396 BUG();
397#endif
398 } else
399 bp = (u64 __user *)(unsigned long)(info->oi_requests);
400
401 if (o2info_from_user(*req_addr, bp + idx))
402 goto bail;
403
404 status = 0;
405bail:
406 return status;
407}
408
409/*
410 * OCFS2_IOC_INFO handles an array of requests passed from userspace.
411 *
 412 * ocfs2_info_handle() receives a large info aggregation, grabs and
 413 * validates the request count from the header, then breaks it into
 414 * small pieces that the specific handlers can process one by one.
 415 *
 416 * The idea is to keep each separate request small enough to ensure
 417 * better backward and forward compatibility, since a small request
 418 * is less likely to break if the disk layout changes.
419 */
420int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
421 int compat_flag)
422{
423 int i, status = 0;
424 u64 req_addr;
425 struct ocfs2_info_request __user *reqp;
426
427 if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
428 (!info->oi_requests)) {
429 status = -EINVAL;
430 goto bail;
431 }
432
433 for (i = 0; i < info->oi_count; i++) {
434
435 status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
436 if (status)
437 break;
438
439 reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
440 if (!reqp) {
441 status = -EINVAL;
442 goto bail;
443 }
444
445 status = ocfs2_info_handle_request(inode, reqp);
446 if (status)
447 break;
448 }
449
450bail:
451 return status;
452}
453
112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 454long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
113{ 455{
114 struct inode *inode = filp->f_path.dentry->d_inode; 456 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
120 struct reflink_arguments args; 462 struct reflink_arguments args;
121 const char *old_path, *new_path; 463 const char *old_path, *new_path;
122 bool preserve; 464 bool preserve;
465 struct ocfs2_info info;
123 466
124 switch (cmd) { 467 switch (cmd) {
125 case OCFS2_IOC_GETFLAGS: 468 case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
174 preserve = (args.preserve != 0); 517 preserve = (args.preserve != 0);
175 518
176 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 519 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
520 case OCFS2_IOC_INFO:
521 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
522 sizeof(struct ocfs2_info)))
523 return -EFAULT;
524
525 return ocfs2_info_handle(inode, &info, 0);
177 default: 526 default:
178 return -ENOTTY; 527 return -ENOTTY;
179 } 528 }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
185 bool preserve; 534 bool preserve;
186 struct reflink_arguments args; 535 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode; 536 struct inode *inode = file->f_path.dentry->d_inode;
537 struct ocfs2_info info;
188 538
189 switch (cmd) { 539 switch (cmd) {
190 case OCFS2_IOC32_GETFLAGS: 540 case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
209 559
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 560 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve); 561 compat_ptr(args.new_path), preserve);
562 case OCFS2_IOC_INFO:
563 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
564 sizeof(struct ocfs2_info)))
565 return -EFAULT;
566
567 return ocfs2_info_handle(inode, &info, 1);
212 default: 568 default:
213 return -ENOIOCTLCMD; 569 return -ENOIOCTLCMD;
214 } 570 }
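
The new OCFS2_IOC_INFO plumbing above takes an ocfs2_info header holding a request count and an array of request pointers, then dispatches each request by its ir_code. A userspace sketch of a single blocksize query, using the structure and flag names from this patch; the header include path and mount point are assumptions:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <ocfs2/ocfs2_ioctl.h>	/* assumed install location */

	int main(void)
	{
		struct ocfs2_info_blocksize oib;
		struct ocfs2_info info;
		uint64_t reqs[1];
		int fd = open("/mnt/ocfs2", O_RDONLY);	/* hypothetical mount */

		if (fd < 0)
			return 1;

		memset(&oib, 0, sizeof(oib));
		oib.ib_req.ir_magic = OCFS2_INFO_MAGIC;
		oib.ib_req.ir_code = OCFS2_INFO_BLOCKSIZE;
		oib.ib_req.ir_size = sizeof(oib);

		reqs[0] = (uint64_t)(unsigned long)&oib;

		memset(&info, 0, sizeof(info));
		info.oi_requests = (uint64_t)(unsigned long)reqs;
		info.oi_count = 1;

		if (!ioctl(fd, OCFS2_IOC_INFO, &info) &&
		    (oib.ib_req.ir_flags & OCFS2_INFO_FL_FILLED))
			printf("blocksize: %u\n", (unsigned)oib.ib_blocksize);

		close(fd);
		return 0;
	}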
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
301{ 301{
302 int status = 0; 302 int status = 0;
303 unsigned int flushed; 303 unsigned int flushed;
304 unsigned long old_id;
305 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
306 305
307 mlog_entry_void(); 306 mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
326 goto finally; 325 goto finally;
327 } 326 }
328 327
329 old_id = ocfs2_inc_trans_id(journal); 328 ocfs2_inc_trans_id(journal);
330 329
331 flushed = atomic_read(&journal->j_num_trans); 330 flushed = atomic_read(&journal->j_num_trans);
332 atomic_set(&journal->j_num_trans, 0); 331 atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
342 return status; 341 return status;
343} 342}
344 343
345/* pass it NULL and it will allocate a new handle object for you. If
346 * you pass it a handle however, it may still return error, in which
347 * case it has free'd the passed handle for you. */
348handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 344handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
349{ 345{
350 journal_t *journal = osb->journal->j_journal; 346 journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888 1884
1889 os = &osb->osb_orphan_scan; 1885 os = &osb->osb_orphan_scan;
1890 1886
1887 mlog(0, "Begin orphan scan\n");
1888
1891 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1892 goto out; 1890 goto out;
1893 1891
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1920unlock: 1918unlock:
1921 ocfs2_orphan_scan_unlock(osb, seqno); 1919 ocfs2_orphan_scan_unlock(osb, seqno);
1922out: 1920out:
1921 mlog(0, "Orphan scan completed\n");
1923 return; 1922 return;
1924} 1923}
1925 1924
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
67 struct buffer_head *j_bh; /* Journal disk inode block */ 67 struct buffer_head *j_bh; /* Journal disk inode block */
68 atomic_t j_num_trans; /* Number of transactions 68 atomic_t j_num_trans; /* Number of transactions
69 * currently in the system. */ 69 * currently in the system. */
70 spinlock_t j_lock;
70 unsigned long j_trans_id; 71 unsigned long j_trans_id;
71 struct rw_semaphore j_trans_barrier; 72 struct rw_semaphore j_trans_barrier;
72 wait_queue_head_t j_checkpointed; 73 wait_queue_head_t j_checkpointed;
73 74
 74 spinlock_t j_lock; 75 /* both fields protected by j_lock */
75 struct list_head j_la_cleanups; 76 struct list_head j_la_cleanups;
76 struct work_struct j_recovery_work; 77 struct work_struct j_recovery_work;
77}; 78};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 4c18f4ad93b4..7e32db9c2c99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
59 return ret; 59 return ret;
60} 60}
61 61
62static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, 62static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
63 struct page *page) 63 struct page *page)
64{ 64{
65 int ret; 65 int ret;
66 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 67 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 68 loff_t pos = page_offset(page);
68 unsigned int len = PAGE_CACHE_SIZE; 69 unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
111 if (page->index == last_index) 112 if (page->index == last_index)
112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 113 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
113 114
114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 115 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
115 &fsdata, di_bh, page); 116 &fsdata, di_bh, page);
116 if (ret) { 117 if (ret) {
117 if (ret != -ENOSPC) 118 if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
159 */ 160 */
160 down_write(&OCFS2_I(inode)->ip_alloc_sem); 161 down_write(&OCFS2_I(inode)->ip_alloc_sem);
161 162
162 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 163 ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
163 164
164 up_write(&OCFS2_I(inode)->ip_alloc_sem); 165 up_write(&OCFS2_I(inode)->ip_alloc_sem);
165 166
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a00dda2e4f16..849fb4a2e814 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
147 spin_unlock(&oi->ip_lock); 147 spin_unlock(&oi->ip_lock);
148 148
149bail_add: 149bail_add:
150 dentry->d_op = &ocfs2_dentry_ops;
151 ret = d_splice_alias(inode, dentry); 150 ret = d_splice_alias(inode, dentry);
152 151
153 if (inode) { 152 if (inode) {
@@ -171,7 +170,8 @@ bail_add:
171 ret = ERR_PTR(status); 170 ret = ERR_PTR(status);
172 goto bail_unlock; 171 goto bail_unlock;
173 } 172 }
174 } 173 } else
174 ocfs2_dentry_attach_gen(dentry);
175 175
176bail_unlock: 176bail_unlock:
177 /* Don't drop the cluster lock until *after* the d_add -- 177 /* Don't drop the cluster lock until *after* the d_add --
@@ -414,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
414 mlog_errno(status); 414 mlog_errno(status);
415 goto leave; 415 goto leave;
416 } 416 }
417 dentry->d_op = &ocfs2_dentry_ops;
418 417
419 status = ocfs2_add_entry(handle, dentry, inode, 418 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 419 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -741,8 +740,7 @@ static int ocfs2_link(struct dentry *old_dentry,
741 goto out_commit; 740 goto out_commit;
742 } 741 }
743 742
744 atomic_inc(&inode->i_count); 743 ihold(inode);
745 dentry->d_op = &ocfs2_dentry_ops;
746 d_instantiate(dentry, inode); 744 d_instantiate(dentry, inode);
747 745
748out_commit: 746out_commit:
@@ -1016,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1016 * An error return must mean that no cluster locks 1014 * An error return must mean that no cluster locks
1017 * were held on function exit. 1015 * were held on function exit.
1018 */ 1016 */
1019 if (oi1->ip_blkno != oi2->ip_blkno) 1017 if (oi1->ip_blkno != oi2->ip_blkno) {
1020 ocfs2_inode_unlock(inode2, 1); 1018 ocfs2_inode_unlock(inode2, 1);
1019 brelse(*bh2);
1020 *bh2 = NULL;
1021 }
1021 1022
1022 if (status != -ENOENT) 1023 if (status != -ENOENT)
1023 mlog_errno(status); 1024 mlog_errno(status);
@@ -1793,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
1793 mlog_errno(status); 1794 mlog_errno(status);
1794 goto bail; 1795 goto bail;
1795 } 1796 }
1796 dentry->d_op = &ocfs2_dentry_ops;
1797 1797
1798 status = ocfs2_add_entry(handle, dentry, inode, 1798 status = ocfs2_add_entry(handle, dentry, inode,
1799 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1799 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2458,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2458 goto out_commit; 2458 goto out_commit;
2459 } 2459 }
2460 2460
2461 dentry->d_op = &ocfs2_dentry_ops;
2462 d_instantiate(dentry, inode); 2461 d_instantiate(dentry, inode);
2463 status = 0; 2462 status = 0;
2464out_commit: 2463out_commit:
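
The namei.c hunks above drop the per-dentry `dentry->d_op = &ocfs2_dentry_ops;` assignments and replace the raw `atomic_inc(&inode->i_count)` with ihold(). The d_op removals rely on a companion change (not shown in this diff) that installs the operations once per superblock, roughly:

	static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... */
		/* every dentry allocated under sb now inherits these ops */
		sb->s_d_op = &ocfs2_dentry_ops;
		/* ... */
		return 0;
	}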
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..51cd6898e7f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
150struct ocfs2_lock_res { 150struct ocfs2_lock_res {
151 void *l_priv; 151 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 152 struct ocfs2_lock_res_ops *l_ops;
153 spinlock_t l_lock; 153
154 154
155 struct list_head l_blocked_list; 155 struct list_head l_blocked_list;
156 struct list_head l_mask_waiters; 156 struct list_head l_mask_waiters;
157 157
158 enum ocfs2_lock_type l_type;
159 unsigned long l_flags; 158 unsigned long l_flags;
160 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
161 int l_level;
162 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
163 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
164 struct ocfs2_dlm_lksb l_lksb; 162 signed char l_level;
163 signed char l_requested;
164 signed char l_blocking;
165
166 /* Data packed - type enum ocfs2_lock_type */
167 unsigned char l_type;
165 168
166 /* used from AST/BAST funcs. */ 169 /* used from AST/BAST funcs. */
167 enum ocfs2_ast_action l_action; 170 /* Data packed - enum type ocfs2_ast_action */
168 enum ocfs2_unlock_action l_unlock_action; 171 unsigned char l_action;
169 int l_requested; 172 /* Data packed - enum type ocfs2_unlock_action */
170 int l_blocking; 173 unsigned char l_unlock_action;
171 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
172 175
176 spinlock_t l_lock;
177
178 struct ocfs2_dlm_lksb l_lksb;
179
173 wait_queue_head_t l_event; 180 wait_queue_head_t l_event;
174 181
175 struct list_head l_debug_list; 182 struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
243 250
244enum ocfs2_mount_options 251enum ocfs2_mount_options
245{ 252{
246 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ 253 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
247 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ 254 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
248 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 255 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
249 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 256 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
256 control lists */ 263 control lists */
257 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
258 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
266 OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
267 writes */
268 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
269 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
259}; 270};
260 271
261#define OCFS2_OSB_SOFT_RO 0x0001 272#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
277 struct super_block *sb; 288 struct super_block *sb;
278 struct inode *root_inode; 289 struct inode *root_inode;
279 struct inode *sys_root_inode; 290 struct inode *sys_root_inode;
280 struct inode *system_inodes[NUM_SYSTEM_INODES]; 291 struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
292 struct inode **local_system_inodes;
281 293
282 struct ocfs2_slot_info *slot_info; 294 struct ocfs2_slot_info *slot_info;
283 295
@@ -368,6 +380,8 @@ struct ocfs2_super
368 struct ocfs2_alloc_stats alloc_stats; 380 struct ocfs2_alloc_stats alloc_stats;
369 char dev_str[20]; /* "major,minor" of the device */ 381 char dev_str[20]; /* "major,minor" of the device */
370 382
383 u8 osb_stackflags;
384
371 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 385 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
372 struct ocfs2_cluster_connection *cconn; 386 struct ocfs2_cluster_connection *cconn;
373 struct ocfs2_lock_res osb_super_lockres; 387 struct ocfs2_lock_res osb_super_lockres;
@@ -406,6 +420,11 @@ struct ocfs2_super
406 struct inode *osb_tl_inode; 420 struct inode *osb_tl_inode;
407 struct buffer_head *osb_tl_bh; 421 struct buffer_head *osb_tl_bh;
408 struct delayed_work osb_truncate_log_wq; 422 struct delayed_work osb_truncate_log_wq;
423 /*
 424 * The number of clusters in our truncate log.
 425 * Protected by osb_tl_inode->i_mutex.
426 */
427 unsigned int truncated_clusters;
409 428
410 struct ocfs2_node_map osb_recovering_orphan_dirs; 429 struct ocfs2_node_map osb_recovering_orphan_dirs;
411 unsigned int *osb_orphan_wipes; 430 unsigned int *osb_orphan_wipes;
@@ -601,10 +620,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
601 return ret; 620 return ret;
602} 621}
603 622
604static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) 623static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
605{ 624{
606 return (osb->s_feature_incompat & 625 return (osb->s_feature_incompat &
607 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); 626 (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
627 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
628}
629
630static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
631{
632 if (ocfs2_clusterinfo_valid(osb) &&
633 memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
634 OCFS2_STACK_LABEL_LEN))
635 return 1;
636 return 0;
637}
638
639static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
640{
641 if (ocfs2_clusterinfo_valid(osb) &&
642 !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
643 OCFS2_STACK_LABEL_LEN))
644 return 1;
645 return 0;
646}
647
648static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
649{
650 return ocfs2_o2cb_stack(osb) &&
651 (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
608} 652}
609 653
610static inline int ocfs2_mount_local(struct ocfs2_super *osb) 654static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index fa31d05e41b7..bf2e7764920e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) 104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 171#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171 172
172/* 173/*
 174 * Incompat bit to indicate usable clusterinfo with stackflags for all
 175 * cluster stacks (userspace and o2cb). If this bit is set,
176 * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
177 */
178#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
179
180/*
173 * backup superblock flag is used to indicate that this volume 181 * backup superblock flag is used to indicate that this volume
174 * has backup superblocks. 182 * has backup superblocks.
175 */ 183 */
@@ -292,10 +300,13 @@
292#define OCFS2_VOL_UUID_LEN 16 300#define OCFS2_VOL_UUID_LEN 16
293#define OCFS2_MAX_VOL_LABEL_LEN 64 301#define OCFS2_MAX_VOL_LABEL_LEN 64
294 302
295/* The alternate, userspace stack fields */ 303/* The cluster stack fields */
296#define OCFS2_STACK_LABEL_LEN 4 304#define OCFS2_STACK_LABEL_LEN 4
297#define OCFS2_CLUSTER_NAME_LEN 16 305#define OCFS2_CLUSTER_NAME_LEN 16
298 306
307/* Classic (historically speaking) cluster stack */
308#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
309
299/* Journal limits (in bytes) */ 310/* Journal limits (in bytes) */
300#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 311#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
301 312
@@ -305,6 +316,11 @@
305 */ 316 */
306#define OCFS2_MIN_XATTR_INLINE_SIZE 256 317#define OCFS2_MIN_XATTR_INLINE_SIZE 256
307 318
319/*
320 * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
321 */
322#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
323
308struct ocfs2_system_inode_info { 324struct ocfs2_system_inode_info {
309 char *si_name; 325 char *si_name;
310 int si_iflags; 326 int si_iflags;
@@ -322,6 +338,7 @@ enum {
322 USER_QUOTA_SYSTEM_INODE, 338 USER_QUOTA_SYSTEM_INODE,
323 GROUP_QUOTA_SYSTEM_INODE, 339 GROUP_QUOTA_SYSTEM_INODE,
324#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE 340#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
341#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
325 ORPHAN_DIR_SYSTEM_INODE, 342 ORPHAN_DIR_SYSTEM_INODE,
326 EXTENT_ALLOC_SYSTEM_INODE, 343 EXTENT_ALLOC_SYSTEM_INODE,
327 INODE_ALLOC_SYSTEM_INODE, 344 INODE_ALLOC_SYSTEM_INODE,
@@ -330,8 +347,12 @@ enum {
330 TRUNCATE_LOG_SYSTEM_INODE, 347 TRUNCATE_LOG_SYSTEM_INODE,
331 LOCAL_USER_QUOTA_SYSTEM_INODE, 348 LOCAL_USER_QUOTA_SYSTEM_INODE,
332 LOCAL_GROUP_QUOTA_SYSTEM_INODE, 349 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
333 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
334}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
335 356
336static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { 357static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
337 /* Global system inodes (single copy) */ 358 /* Global system inodes (single copy) */
@@ -360,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
360/* Parameter passed from mount.ocfs2 to module */ 381/* Parameter passed from mount.ocfs2 to module */
361#define OCFS2_HB_NONE "heartbeat=none" 382#define OCFS2_HB_NONE "heartbeat=none"
362#define OCFS2_HB_LOCAL "heartbeat=local" 383#define OCFS2_HB_LOCAL "heartbeat=local"
384#define OCFS2_HB_GLOBAL "heartbeat=global"
363 385
364/* 386/*
365 * OCFS2 directory file types. Only the low 3 bits are used. The 387 * OCFS2 directory file types. Only the low 3 bits are used. The
@@ -566,9 +588,21 @@ struct ocfs2_slot_map_extended {
566 */ 588 */
567}; 589};
568 590
591/*
592 * ci_stackflags is only valid if the incompat bit
593 * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
594 */
569struct ocfs2_cluster_info { 595struct ocfs2_cluster_info {
570/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; 596/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
571 __le32 ci_reserved; 597 union {
598 __le32 ci_reserved;
599 struct {
600 __u8 ci_stackflags;
601 __u8 ci_reserved1;
602 __u8 ci_reserved2;
603 __u8 ci_reserved3;
604 };
605 };
572/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; 606/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
573/*18*/ 607/*18*/
574}; 608};
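
The union above lets new code read per-stack flags out of what used to be a reserved little-endian word, without changing the on-disk layout. Below is a minimal standalone sketch of that overlay, with the constants copied from the definitions above; the anonymous struct member needs GNU C or C11, which the kernel assumes. It is an illustration, not the kernel's code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OCFS2_STACK_LABEL_LEN			4
#define OCFS2_CLUSTER_NAME_LEN			16
#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT	(0x01)

struct cluster_info {
	uint8_t ci_stack[OCFS2_STACK_LABEL_LEN];
	union {
		uint32_t ci_reserved;		/* old layout: one reserved word */
		struct {
			uint8_t ci_stackflags;	/* aliases its first byte */
			uint8_t ci_reserved1;
			uint8_t ci_reserved2;
			uint8_t ci_reserved3;
		};
	};
	uint8_t ci_cluster[OCFS2_CLUSTER_NAME_LEN];
};

int main(void)
{
	struct cluster_info ci;

	memset(&ci, 0, sizeof(ci));
	memcpy(ci.ci_stack, "o2cb", OCFS2_STACK_LABEL_LEN);
	ci.ci_stackflags = OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT;

	/* ci_stackflags is always the first byte of the union, i.e. the
	 * low byte of the little-endian ci_reserved as stored on disk. */
	printf("global heartbeat: %d\n",
	       !!(ci.ci_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT));
	return 0;
}
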
@@ -605,9 +639,9 @@ struct ocfs2_super_block {
605 * group header */ 639 * group header */
606/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 640/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
607/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 641/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
608/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 642/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
609 stack. Only valid 643 userspace or clusterinfo
610 with INCOMPAT flag. */ 644 INCOMPAT flag set. */
611/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 645/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
612 for this fs*/ 646 for this fs*/
613 __le16 s_reserved0; 647 __le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 5d241505690b..b46f39bf7438 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
76}; 76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) 77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78 78
79/* Following definitions dedicated for ocfs2_info_request ioctls. */
80#define OCFS2_INFO_MAX_REQUEST (50)
81#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
82
83/* Magic number of all requests */
84#define OCFS2_INFO_MAGIC (0x4F32494E)
85
86/*
87 * Always try to separate an info request into small pieces to
88 * guarantee backward and forward compatibility.
89 */
90struct ocfs2_info {
91 __u64 oi_requests; /* Array of __u64 pointers to requests */
92 __u32 oi_count; /* Number of requests in info_requests */
93 __u32 oi_pad;
94};
95
96struct ocfs2_info_request {
97/*00*/ __u32 ir_magic; /* Magic number */
98 __u32 ir_code; /* Info request code */
99 __u32 ir_size; /* Size of request */
100 __u32 ir_flags; /* Request flags */
101/*10*/ /* Request specific fields */
102};
103
104struct ocfs2_info_clustersize {
105 struct ocfs2_info_request ic_req;
106 __u32 ic_clustersize;
107 __u32 ic_pad;
108};
109
110struct ocfs2_info_blocksize {
111 struct ocfs2_info_request ib_req;
112 __u32 ib_blocksize;
113 __u32 ib_pad;
114};
115
116struct ocfs2_info_maxslots {
117 struct ocfs2_info_request im_req;
118 __u32 im_max_slots;
119 __u32 im_pad;
120};
121
122struct ocfs2_info_label {
123 struct ocfs2_info_request il_req;
124 __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
125} __attribute__ ((packed));
126
127struct ocfs2_info_uuid {
128 struct ocfs2_info_request iu_req;
129 __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
130} __attribute__ ((packed));
131
132struct ocfs2_info_fs_features {
133 struct ocfs2_info_request if_req;
134 __u32 if_compat_features;
135 __u32 if_incompat_features;
136 __u32 if_ro_compat_features;
137 __u32 if_pad;
138};
139
140struct ocfs2_info_journal_size {
141 struct ocfs2_info_request ij_req;
142 __u64 ij_journal_size;
143};
144
145/* Codes for ocfs2_info_request */
146enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1,
148 OCFS2_INFO_BLOCKSIZE,
149 OCFS2_INFO_MAXSLOTS,
150 OCFS2_INFO_LABEL,
151 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE,
154 OCFS2_INFO_NUM_TYPES
155};
156
157/* Flags for struct ocfs2_info_request */
158/* Filled by the caller */
159#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
160 required. This is a hint.
161 It is up to ocfs2 whether
162 the request can be fulfilled
163 without locking. */
164/* Filled by ocfs2 */
165#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
166 this request and
167 filled in the answer */
168
169#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
170 request handling. */
171
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173
79#endif /* OCFS2_IOCTL_H */ 174#endif /* OCFS2_IOCTL_H */
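
The request protocol above is self-describing: userspace chains ocfs2_info_request-derived structures through an array of pointers carried by struct ocfs2_info, and the kernel marks each request it understood with OCFS2_INFO_FL_FILLED. A hedged userspace sketch of a single OCFS2_INFO_BLOCKSIZE query follows; it assumes the definitions above have been copied into scope (e.g. from ocfs2_ioctl.h), and the command-line file argument is just any file on the mounted ocfs2 volume.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
	struct ocfs2_info_blocksize bs;
	struct ocfs2_info info;
	uint64_t reqs[1];
	int fd, ret;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any file on the ocfs2 volume */
	if (fd < 0)
		return 1;

	memset(&bs, 0, sizeof(bs));
	bs.ib_req.ir_magic = OCFS2_INFO_MAGIC;
	bs.ib_req.ir_code  = OCFS2_INFO_BLOCKSIZE;
	bs.ib_req.ir_size  = sizeof(bs);
	bs.ib_req.ir_flags = OCFS2_INFO_FL_NON_COHERENT; /* lockless hint */

	reqs[0] = (uint64_t)(unsigned long)&bs;
	memset(&info, 0, sizeof(info));
	info.oi_requests = (uint64_t)(unsigned long)reqs;
	info.oi_count = 1;

	ret = ioctl(fd, OCFS2_IOC_INFO, &info);
	if (!ret && (bs.ib_req.ir_flags & OCFS2_INFO_FL_FILLED))
		printf("blocksize: %u\n", bs.ib_blocksize);
	close(fd);
	return ret;
}
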
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index efdd75607406..b5f9160e93e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
52 u32 cow_start; 53 u32 cow_start;
53 u32 cow_len; 54 u32 cow_len;
54 struct ocfs2_extent_tree data_et; 55 struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = context->inode->i_mapping;
2938 2939
2939 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2940 new_cluster, new_len, cpos); 2941 new_cluster, new_len, cpos);
2941 2942
2943 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) <<
2945 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2946 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2947 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944 /* 2948 /*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2969 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2970 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2971 2975
2976 if (PageReadahead(page) && context->file) {
2977 page_cache_async_readahead(mapping,
2978 &context->file->f_ra,
2979 context->file,
2980 page, page_index,
2981 readahead_pages);
2982 }
2983
2972 if (!PageUptodate(page)) { 2984 if (!PageUptodate(page)) {
2973 ret = block_read_full_page(page, ocfs2_get_block); 2985 ret = block_read_full_page(page, ocfs2_get_block);
2974 if (ret) { 2986 if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3409 return ret; 3421 return ret;
3410} 3422}
3411 3423
3424static void ocfs2_readahead_for_cow(struct inode *inode,
3425 struct file *file,
3426 u32 start, u32 len)
3427{
3428 struct address_space *mapping;
3429 pgoff_t index;
3430 unsigned long num_pages;
3431 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3432
3433 if (!file)
3434 return;
3435
3436 mapping = file->f_mapping;
3437 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3438 if (!num_pages)
3439 num_pages = 1;
3440
3441 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3442 page_cache_sync_readahead(mapping, &file->f_ra, file,
3443 index, num_pages);
3444}
3445
3412/* 3446/*
3413 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3447 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3414 * past max_cpos. This will stop when it runs into a hole or an 3448 * past max_cpos. This will stop when it runs into a hole or an
3415 * unrefcounted extent. 3449 * unrefcounted extent.
3416 */ 3450 */
3417static int ocfs2_refcount_cow_hunk(struct inode *inode, 3451static int ocfs2_refcount_cow_hunk(struct inode *inode,
3452 struct file *file,
3418 struct buffer_head *di_bh, 3453 struct buffer_head *di_bh,
3419 u32 cpos, u32 write_len, u32 max_cpos) 3454 u32 cpos, u32 write_len, u32 max_cpos)
3420{ 3455{
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3443 3478
3444 BUG_ON(cow_len == 0); 3479 BUG_ON(cow_len == 0);
3445 3480
3481 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3482
3446 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3483 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3447 if (!context) { 3484 if (!context) {
3448 ret = -ENOMEM; 3485 ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3464 context->ref_root_bh = ref_root_bh; 3501 context->ref_root_bh = ref_root_bh;
3465 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3502 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3466 context->get_clusters = ocfs2_di_get_clusters; 3503 context->get_clusters = ocfs2_di_get_clusters;
3504 context->file = file;
3467 3505
3468 ocfs2_init_dinode_extent_tree(&context->data_et, 3506 ocfs2_init_dinode_extent_tree(&context->data_et,
3469 INODE_CACHE(inode), di_bh); 3507 INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
3492 * clusters between cpos and cpos+write_len are safe to modify. 3530 * clusters between cpos and cpos+write_len are safe to modify.
3493 */ 3531 */
3494int ocfs2_refcount_cow(struct inode *inode, 3532int ocfs2_refcount_cow(struct inode *inode,
3533 struct file *file,
3495 struct buffer_head *di_bh, 3534 struct buffer_head *di_bh,
3496 u32 cpos, u32 write_len, u32 max_cpos) 3535 u32 cpos, u32 write_len, u32 max_cpos)
3497{ 3536{
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3511 num_clusters = write_len; 3550 num_clusters = write_len;
3512 3551
3513 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3552 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3514 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3553 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3515 num_clusters, max_cpos); 3554 num_clusters, max_cpos);
3516 if (ret) { 3555 if (ret) {
3517 mlog_errno(ret); 3556 mlog_errno(ret);
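
The readahead window sizing in ocfs2_readahead_for_cow() above is plain shift arithmetic: a cluster span is widened to bytes with s_clustersize_bits and then narrowed to page-cache pages with PAGE_CACHE_SHIFT. A standalone sketch of that conversion, using illustrative sizes (4K pages, 256K clusters) rather than the kernel's own helpers:

#include <stdio.h>

#define PAGE_SHIFT 12			/* 4K pages, for illustration */

int main(void)
{
	unsigned int cs_bits = 18;	/* e.g. 256K clusters */
	unsigned int start = 7, len = 3; /* cluster span to CoW */
	unsigned long long index, num_pages;

	index = ((unsigned long long)start << cs_bits) >> PAGE_SHIFT;
	num_pages = ((unsigned long long)len << cs_bits) >> PAGE_SHIFT;
	if (!num_pages)
		num_pages = 1;		/* clusters smaller than a page */

	printf("readahead %llu pages from page index %llu\n",
	       num_pages, index);
	return 0;
}
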
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..c8ce46f7d8e3 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
21 struct rb_node rf_node; 21 struct rb_node rf_node;
22 u64 rf_blkno; 22 u64 rf_blkno;
23 u32 rf_generation; 23 u32 rf_generation;
24 struct kref rf_getcnt;
24 struct rw_semaphore rf_sem; 25 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres; 26 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed; 27 int rf_removed;
28 28
29 /* the following 4 fields are used by caching_info. */ 29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock; 30 spinlock_t rf_lock;
31 struct ocfs2_caching_info rf_ci;
32 struct mutex rf_io_mutex; 32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb; 33 struct super_block *rf_sb;
34}; 34};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 int *ref_blocks); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode,
56 struct file *filep, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 57 u32 cpos, u32 write_len, u32 max_cpos);
57 58
58typedef int (ocfs2_post_refcount_func)(struct inode *inode, 59typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
357{ 357{
358 int status = 0; 358 int status = 0;
359 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes; 360 unsigned long long blocks, bytes = 0;
361 unsigned int i; 361 unsigned int i;
362 struct buffer_head *bh; 362 struct buffer_head *bh;
363 363
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
283 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
284 * in the heartbeat universe */ 284 * in the heartbeat universe */
285 if (!o2hb_check_local_node_heartbeating()) { 285 if (!o2hb_check_local_node_heartbeating()) {
286 if (o2hb_global_heartbeat_active())
287 mlog(ML_ERROR, "Global heartbeat not started\n");
286 rc = -EINVAL; 288 rc = -EINVAL;
287 goto out; 289 goto out;
288 } 290 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..a5ebe421195f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/smp_lock.h>
26#include <linux/reboot.h> 25#include <linux/reboot.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28 27
@@ -191,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
191 return c; 190 return c;
192 } 191 }
193 192
194 return c; 193 return NULL;
195} 194}
196 195
197/* 196/*
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
612 return -ENOMEM; 611 return -ENOMEM;
613 p->op_this_node = -1; 612 p->op_this_node = -1;
614 613
615 lock_kernel();
616 mutex_lock(&ocfs2_control_lock); 614 mutex_lock(&ocfs2_control_lock);
617 file->private_data = p; 615 file->private_data = p;
618 list_add(&p->op_list, &ocfs2_control_private_list); 616 list_add(&p->op_list, &ocfs2_control_private_list);
619 mutex_unlock(&ocfs2_control_lock); 617 mutex_unlock(&ocfs2_control_lock);
620 unlock_kernel();
621 618
622 return 0; 619 return 0;
623} 620}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
628 .read = ocfs2_control_read, 625 .read = ocfs2_control_read,
629 .write = ocfs2_control_write, 626 .write = ocfs2_control_write,
630 .owner = THIS_MODULE, 627 .owner = THIS_MODULE,
628 .llseek = default_llseek,
631}; 629};
632 630
633static struct miscdevice ocfs2_control_device = { 631static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 849c2f0e0a0e..71998d4d61d5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1380 } 1380 }
1381 1381
1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385 " count %u but claims %u are freed. num_bits %d",
1386 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387 le16_to_cpu(bg->bg_bits),
1388 le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389 return -EROFS;
1390 }
1383 while(num_bits--) 1391 while(num_bits--)
1384 ocfs2_set_bit(bit_off++, bitmap); 1392 ocfs2_set_bit(bit_off++, bitmap);
1385 1393
@@ -1908,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1908 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1909 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1910 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
1911 * allocation group. This helps us mantain some 1919 * allocation group. This helps us maintain some
1912 * contiguousness across allocations. */ 1920 * contiguousness across allocations. */
1913 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1921 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1914 min_bits, res, &bits_left); 1922 min_bits, res, &bits_left);
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2419 (unsigned long *) undo_bg->bg_bitmap); 2427 (unsigned long *) undo_bg->bg_bitmap);
2420 } 2428 }
2421 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2429 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432 " count %u but claims %u are freed. num_bits %d",
2433 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436 return -EROFS;
2437 }
2422 2438
2423 if (undo_fn) 2439 if (undo_fn)
2424 jbd_unlock_bh_state(group_bh); 2440 jbd_unlock_bh_state(group_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..38f986d2447e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/smp_lock.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -162,6 +161,7 @@ enum {
162 Opt_nointr, 161 Opt_nointr,
163 Opt_hb_none, 162 Opt_hb_none,
164 Opt_hb_local, 163 Opt_hb_local,
164 Opt_hb_global,
165 Opt_data_ordered, 165 Opt_data_ordered,
166 Opt_data_writeback, 166 Opt_data_writeback,
167 Opt_atime_quantum, 167 Opt_atime_quantum,
@@ -177,6 +177,8 @@ enum {
177 Opt_noacl, 177 Opt_noacl,
178 Opt_usrquota, 178 Opt_usrquota,
179 Opt_grpquota, 179 Opt_grpquota,
180 Opt_coherency_buffered,
181 Opt_coherency_full,
180 Opt_resv_level, 182 Opt_resv_level,
181 Opt_dir_resv_level, 183 Opt_dir_resv_level,
182 Opt_err, 184 Opt_err,
@@ -190,6 +192,7 @@ static const match_table_t tokens = {
190 {Opt_nointr, "nointr"}, 192 {Opt_nointr, "nointr"},
191 {Opt_hb_none, OCFS2_HB_NONE}, 193 {Opt_hb_none, OCFS2_HB_NONE},
192 {Opt_hb_local, OCFS2_HB_LOCAL}, 194 {Opt_hb_local, OCFS2_HB_LOCAL},
195 {Opt_hb_global, OCFS2_HB_GLOBAL},
193 {Opt_data_ordered, "data=ordered"}, 196 {Opt_data_ordered, "data=ordered"},
194 {Opt_data_writeback, "data=writeback"}, 197 {Opt_data_writeback, "data=writeback"},
195 {Opt_atime_quantum, "atime_quantum=%u"}, 198 {Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +208,8 @@ static const match_table_t tokens = {
205 {Opt_noacl, "noacl"}, 208 {Opt_noacl, "noacl"},
206 {Opt_usrquota, "usrquota"}, 209 {Opt_usrquota, "usrquota"},
207 {Opt_grpquota, "grpquota"}, 210 {Opt_grpquota, "grpquota"},
211 {Opt_coherency_buffered, "coherency=buffered"},
212 {Opt_coherency_full, "coherency=full"},
208 {Opt_resv_level, "resv_level=%u"}, 213 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"}, 214 {Opt_dir_resv_level, "dir_resv_level=%u"},
210 {Opt_err, NULL} 215 {Opt_err, NULL}
@@ -514,11 +519,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
514 519
515 mlog_entry_void(); 520 mlog_entry_void();
516 521
517 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 522 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
518 inode = osb->system_inodes[i]; 523 inode = osb->global_system_inodes[i];
519 if (inode) { 524 if (inode) {
520 iput(inode); 525 iput(inode);
521 osb->system_inodes[i] = NULL; 526 osb->global_system_inodes[i] = NULL;
522 } 527 }
523 } 528 }
524 529
@@ -534,6 +539,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
534 osb->root_inode = NULL; 539 osb->root_inode = NULL;
535 } 540 }
536 541
542 if (!osb->local_system_inodes)
543 goto out;
544
545 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
546 if (osb->local_system_inodes[i]) {
547 iput(osb->local_system_inodes[i]);
548 osb->local_system_inodes[i] = NULL;
549 }
550 }
551
552 kfree(osb->local_system_inodes);
553 osb->local_system_inodes = NULL;
554
555out:
537 mlog_exit(0); 556 mlog_exit(0);
538} 557}
539 558
@@ -550,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
550 return &oi->vfs_inode; 569 return &oi->vfs_inode;
551} 570}
552 571
553static void ocfs2_destroy_inode(struct inode *inode) 572static void ocfs2_i_callback(struct rcu_head *head)
554{ 573{
574 struct inode *inode = container_of(head, struct inode, i_rcu);
575 INIT_LIST_HEAD(&inode->i_dentry);
555 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 576 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
556} 577}
557 578
579static void ocfs2_destroy_inode(struct inode *inode)
580{
581 call_rcu(&inode->i_rcu, ocfs2_i_callback);
582}
583
558static unsigned long long ocfs2_max_file_offset(unsigned int bbits, 584static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
559 unsigned int cbits) 585 unsigned int cbits)
560{ 586{
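
The destroy_inode change above is part of the tree-wide conversion to RCU-freed inodes: the cache object must outlive any lock-free path walk still dereferencing it, so the actual kmem_cache_free() is deferred past a grace period. A generic kernel-side sketch of the pattern, assuming only <linux/rcupdate.h> and <linux/slab.h>; names are illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head rcu;
	/* ... payload read by RCU readers ... */
};

static void obj_free_rcu(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu);

	kfree(o);			/* runs after all current readers */
}

static void obj_destroy(struct obj *o)
{
	call_rcu(&o->rcu, obj_free_rcu);	/* never free inline */
}
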
@@ -608,8 +634,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
608 int ret = 0; 634 int ret = 0;
609 struct mount_options parsed_options; 635 struct mount_options parsed_options;
610 struct ocfs2_super *osb = OCFS2_SB(sb); 636 struct ocfs2_super *osb = OCFS2_SB(sb);
611 637 u32 tmp;
612 lock_kernel();
613 638
614 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
615 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -617,8 +642,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
617 goto out; 642 goto out;
618 } 643 }
619 644
620 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 645 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
621 (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 646 OCFS2_MOUNT_HB_NONE;
647 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
622 ret = -EINVAL; 648 ret = -EINVAL;
623 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 649 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
624 goto out; 650 goto out;
@@ -717,7 +743,6 @@ unlock_osb:
717 MS_POSIXACL : 0); 743 MS_POSIXACL : 0);
718 } 744 }
719out: 745out:
720 unlock_kernel();
721 return ret; 746 return ret;
722} 747}
723 748
@@ -809,23 +834,29 @@ bail:
809 834
810static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 835static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
811{ 836{
812 if (ocfs2_mount_local(osb)) { 837 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
813 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 838
839 if (osb->s_mount_opt & hb_enabled) {
840 if (ocfs2_mount_local(osb)) {
814 mlog(ML_ERROR, "Cannot heartbeat on a locally " 841 mlog(ML_ERROR, "Cannot heartbeat on a locally "
815 "mounted device.\n"); 842 "mounted device.\n");
816 return -EINVAL; 843 return -EINVAL;
817 } 844 }
818 } 845 if (ocfs2_userspace_stack(osb)) {
819
820 if (ocfs2_userspace_stack(osb)) {
821 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
822 mlog(ML_ERROR, "Userspace stack expected, but " 846 mlog(ML_ERROR, "Userspace stack expected, but "
823 "o2cb heartbeat arguments passed to mount\n"); 847 "o2cb heartbeat arguments passed to mount\n");
824 return -EINVAL; 848 return -EINVAL;
825 } 849 }
850 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
851 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
852 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
853 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
854 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
855 return -EINVAL;
856 }
826 } 857 }
827 858
828 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 859 if (!(osb->s_mount_opt & hb_enabled)) {
829 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 860 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
830 !ocfs2_userspace_stack(osb)) { 861 !ocfs2_userspace_stack(osb)) {
831 mlog(ML_ERROR, "Heartbeat has to be started to mount " 862 mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -962,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
962} 993}
963 994
964/* Handle quota on quotactl */ 995/* Handle quota on quotactl */
965static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
966 char *path)
967{ 997{
968 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 998 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
969 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 999 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -982,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
982} 1012}
983 1013
984static const struct quotactl_ops ocfs2_quotactl_ops = { 1014static const struct quotactl_ops ocfs2_quotactl_ops = {
985 .quota_on = ocfs2_quota_on, 1015 .quota_on_meta = ocfs2_quota_on,
986 .quota_off = ocfs2_quota_off, 1016 .quota_off = ocfs2_quota_off,
987 .quota_sync = dquot_quota_sync, 1017 .quota_sync = dquot_quota_sync,
988 .get_info = dquot_get_dqinfo, 1018 .get_info = dquot_get_dqinfo,
@@ -1211,14 +1241,12 @@ read_super_error:
1211 return status; 1241 return status;
1212} 1242}
1213 1243
1214static int ocfs2_get_sb(struct file_system_type *fs_type, 1244static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1215 int flags, 1245 int flags,
1216 const char *dev_name, 1246 const char *dev_name,
1217 void *data, 1247 void *data)
1218 struct vfsmount *mnt)
1219{ 1248{
1220 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, 1249 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1221 mnt);
1222} 1250}
1223 1251
1224static void ocfs2_kill_sb(struct super_block *sb) 1252static void ocfs2_kill_sb(struct super_block *sb)
@@ -1242,8 +1270,7 @@ out:
1242static struct file_system_type ocfs2_fs_type = { 1270static struct file_system_type ocfs2_fs_type = {
1243 .owner = THIS_MODULE, 1271 .owner = THIS_MODULE,
1244 .name = "ocfs2", 1272 .name = "ocfs2",
1245 .get_sb = ocfs2_get_sb, /* is this called when we mount 1273 .mount = ocfs2_mount,
1246 * the fs? */
1247 .kill_sb = ocfs2_kill_sb, 1274 .kill_sb = ocfs2_kill_sb,
1248 1275
1249 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1276 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
@@ -1291,6 +1318,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1291{ 1318{
1292 int status; 1319 int status;
1293 char *p; 1320 char *p;
1321 u32 tmp;
1294 1322
1295 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1323 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
1296 options ? options : "(none)"); 1324 options ? options : "(none)");
@@ -1322,7 +1350,10 @@ static int ocfs2_parse_options(struct super_block *sb,
1322 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1350 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1323 break; 1351 break;
1324 case Opt_hb_none: 1352 case Opt_hb_none:
1325 mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 1353 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1354 break;
1355 case Opt_hb_global:
1356 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1326 break; 1357 break;
1327 case Opt_barrier: 1358 case Opt_barrier:
1328 if (match_int(&args[0], &option)) { 1359 if (match_int(&args[0], &option)) {
@@ -1438,6 +1469,12 @@ static int ocfs2_parse_options(struct super_block *sb,
1438 case Opt_grpquota: 1469 case Opt_grpquota:
1439 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1470 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1440 break; 1471 break;
1472 case Opt_coherency_buffered:
1473 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1474 break;
1475 case Opt_coherency_full:
1476 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1477 break;
1441 case Opt_acl: 1478 case Opt_acl:
1442 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1479 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1443 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1480 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1477,6 +1514,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1477 } 1514 }
1478 } 1515 }
1479 1516
1517 /* Ensure only one heartbeat mode */
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
1519 OCFS2_MOUNT_HB_NONE);
1520 if (hweight32(tmp) != 1) {
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1522 status = 0;
1523 goto bail;
1524 }
1525
1480 status = 1; 1526 status = 1;
1481 1527
1482bail: 1528bail:
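
Since heartbeat=none is now a real bit rather than merely the absence of the local bit, exactly one of the three modes must survive option parsing; hweight32(), a population count, turns that into an exact-one test. A tiny sketch of the same check with illustrative flag values (using the compiler's popcount builtin as a stand-in for hweight32()):

#include <stdio.h>

#define HB_LOCAL  (1u << 0)
#define HB_GLOBAL (1u << 1)
#define HB_NONE   (1u << 2)

static int valid_hb(unsigned int opt)
{
	unsigned int hb = opt & (HB_LOCAL | HB_GLOBAL | HB_NONE);

	return __builtin_popcount(hb) == 1;	/* stand-in for hweight32() */
}

int main(void)
{
	printf("%d %d %d\n",
	       valid_hb(HB_LOCAL),		/* 1: exactly one mode */
	       valid_hb(HB_LOCAL | HB_GLOBAL),	/* 0: two modes */
	       valid_hb(0));			/* 0: none chosen */
	return 0;
}
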
@@ -1490,10 +1536,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1490 unsigned long opts = osb->s_mount_opt; 1536 unsigned long opts = osb->s_mount_opt;
1491 unsigned int local_alloc_megs; 1537 unsigned int local_alloc_megs;
1492 1538
1493 if (opts & OCFS2_MOUNT_HB_LOCAL) 1539 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1494 seq_printf(s, ",_netdev,heartbeat=local"); 1540 seq_printf(s, ",_netdev");
1495 else 1541 if (opts & OCFS2_MOUNT_HB_LOCAL)
1496 seq_printf(s, ",heartbeat=none"); 1542 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1543 else
1544 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1545 } else
1546 seq_printf(s, ",%s", OCFS2_HB_NONE);
1497 1547
1498 if (opts & OCFS2_MOUNT_NOINTR) 1548 if (opts & OCFS2_MOUNT_NOINTR)
1499 seq_printf(s, ",nointr"); 1549 seq_printf(s, ",nointr");
@@ -1536,6 +1586,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1536 if (opts & OCFS2_MOUNT_GRPQUOTA) 1586 if (opts & OCFS2_MOUNT_GRPQUOTA)
1537 seq_printf(s, ",grpquota"); 1587 seq_printf(s, ",grpquota");
1538 1588
1589 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1590 seq_printf(s, ",coherency=buffered");
1591 else
1592 seq_printf(s, ",coherency=full");
1593
1539 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1594 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1540 seq_printf(s, ",nouser_xattr"); 1595 seq_printf(s, ",nouser_xattr");
1541 else 1596 else
@@ -1640,13 +1695,9 @@ static void ocfs2_put_super(struct super_block *sb)
1640{ 1695{
1641 mlog_entry("(0x%p)\n", sb); 1696 mlog_entry("(0x%p)\n", sb);
1642 1697
1643 lock_kernel();
1644
1645 ocfs2_sync_blockdev(sb); 1698 ocfs2_sync_blockdev(sb);
1646 ocfs2_dismount_volume(sb, 0); 1699 ocfs2_dismount_volume(sb, 0);
1647 1700
1648 unlock_kernel();
1649
1650 mlog_exit_void(); 1701 mlog_exit_void();
1651} 1702}
1652 1703
@@ -1990,6 +2041,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1990 return 0; 2041 return 0;
1991} 2042}
1992 2043
2044/* Make sure entire volume is addressable by our journal. Requires
2045 osb_clusters_at_boot to be valid and for the journal to have been
2046 initialized by ocfs2_journal_init(). */
2047static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2048{
2049 int status = 0;
2050 u64 max_block =
2051 ocfs2_clusters_to_blocks(osb->sb,
2052 osb->osb_clusters_at_boot) - 1;
2053
2054 /* 32-bit block number is always OK. */
2055 if (max_block <= (u32)~0ULL)
2056 goto out;
2057
2058 /* Volume is "huge", so see if our journal is new enough to
2059 support it. */
2060 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2061 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2062 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2063 JBD2_FEATURE_INCOMPAT_64BIT))) {
2064 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2065 "Enable the 'block64' journal option with tunefs.ocfs2");
2066 status = -EFBIG;
2067 goto out;
2068 }
2069
2070 out:
2071 return status;
2072}
2073
1993static int ocfs2_initialize_super(struct super_block *sb, 2074static int ocfs2_initialize_super(struct super_block *sb,
1994 struct buffer_head *bh, 2075 struct buffer_head *bh,
1995 int sector_size, 2076 int sector_size,
@@ -2002,6 +2083,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2002 struct ocfs2_journal *journal; 2083 struct ocfs2_journal *journal;
2003 __le32 uuid_net_key; 2084 __le32 uuid_net_key;
2004 struct ocfs2_super *osb; 2085 struct ocfs2_super *osb;
2086 u64 total_blocks;
2005 2087
2006 mlog_entry_void(); 2088 mlog_entry_void();
2007 2089
@@ -2014,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2014 2096
2015 sb->s_fs_info = osb; 2097 sb->s_fs_info = osb;
2016 sb->s_op = &ocfs2_sops; 2098 sb->s_op = &ocfs2_sops;
2099 sb->s_d_op = &ocfs2_dentry_ops;
2017 sb->s_export_op = &ocfs2_export_ops; 2100 sb->s_export_op = &ocfs2_export_ops;
2018 sb->s_qcop = &ocfs2_quotactl_ops; 2101 sb->s_qcop = &ocfs2_quotactl_ops;
2019 sb->dq_op = &ocfs2_quota_operations; 2102 sb->dq_op = &ocfs2_quota_operations;
@@ -2060,6 +2143,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2060 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2143 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2061 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2144 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2062 2145
2146 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2147 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2148 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2149 osb->max_slots);
2150 status = -EINVAL;
2151 goto bail;
2152 }
2153 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2154
2063 ocfs2_orphan_scan_init(osb); 2155 ocfs2_orphan_scan_init(osb);
2064 2156
2065 status = ocfs2_recovery_init(osb); 2157 status = ocfs2_recovery_init(osb);
@@ -2098,15 +2190,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2098 goto bail; 2190 goto bail;
2099 } 2191 }
2100 2192
2101 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2102 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2103 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2104 osb->max_slots);
2105 status = -EINVAL;
2106 goto bail;
2107 }
2108 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2109
2110 osb->slot_recovery_generations = 2193 osb->slot_recovery_generations =
2111 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2194 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2112 GFP_KERNEL); 2195 GFP_KERNEL);
@@ -2149,7 +2232,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2149 goto bail; 2232 goto bail;
2150 } 2233 }
2151 2234
2152 if (ocfs2_userspace_stack(osb)) { 2235 if (ocfs2_clusterinfo_valid(osb)) {
2236 osb->osb_stackflags =
2237 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2153 memcpy(osb->osb_cluster_stack, 2238 memcpy(osb->osb_cluster_stack,
2154 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2239 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2155 OCFS2_STACK_LABEL_LEN); 2240 OCFS2_STACK_LABEL_LEN);
@@ -2214,11 +2299,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2214 goto bail; 2299 goto bail;
2215 } 2300 }
2216 2301
2217 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 2302 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2218 > (u32)~0UL) { 2303 le32_to_cpu(di->i_clusters));
2219 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 2304
2220 "what jbd can address in 32 bits.\n"); 2305 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2221 status = -EINVAL; 2306 total_blocks);
2307 if (status) {
2308 mlog(ML_ERROR, "Volume too large "
2309 "to mount safely on this system");
2310 status = -EFBIG;
2222 goto bail; 2311 goto bail;
2223 } 2312 }
2224 2313
@@ -2380,6 +2469,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2380 goto finally; 2469 goto finally;
2381 } 2470 }
2382 2471
2472 /* Now that the journal has been initialized, check to make sure
2473 the entire volume is addressable. */
2474 status = ocfs2_journal_addressable(osb);
2475 if (status)
2476 goto finally;
2477
2383 /* If the journal was unmounted cleanly then we don't want to 2478 /* If the journal was unmounted cleanly then we don't want to
2384 * recover anything. Otherwise, journal_load will do that 2479 * recover anything. Otherwise, journal_load will do that
2385 * dirty work for us :) */ 2480 * dirty work for us :) */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
44 int type, 44 int type,
45 u32 slot); 45 u32 slot);
46 46
47static inline int is_global_system_inode(int type);
48static inline int is_in_system_inode_array(struct ocfs2_super *osb,
49 int type,
50 u32 slot);
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 47#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; 48static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
54#endif 49#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
59 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; 54 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
60} 55}
61 56
62static inline int is_in_system_inode_array(struct ocfs2_super *osb, 57static struct inode **get_local_system_inode(struct ocfs2_super *osb,
63 int type, 58 int type,
64 u32 slot) 59 u32 slot)
65{ 60{
66 return slot == osb->slot_num || is_global_system_inode(type); 61 int index;
62 struct inode **local_system_inodes, **free = NULL;
63
64 BUG_ON(slot == OCFS2_INVALID_SLOT);
65 BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
66 type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
67
68 spin_lock(&osb->osb_lock);
69 local_system_inodes = osb->local_system_inodes;
70 spin_unlock(&osb->osb_lock);
71
72 if (unlikely(!local_system_inodes)) {
73 local_system_inodes = kzalloc(sizeof(struct inode *) *
74 NUM_LOCAL_SYSTEM_INODES *
75 osb->max_slots,
76 GFP_NOFS);
77 if (!local_system_inodes) {
78 mlog_errno(-ENOMEM);
79 /*
80 * return NULL here so that ocfs2_get_system_file_inode
81 * will try to create an inode and use it. We will try
82 * to initialize local_system_inodes next time.
83 */
84 return NULL;
85 }
86
87 spin_lock(&osb->osb_lock);
88 if (osb->local_system_inodes) {
89 /* Someone has initialized it for us. */
90 free = local_system_inodes;
91 local_system_inodes = osb->local_system_inodes;
92 } else
93 osb->local_system_inodes = local_system_inodes;
94 spin_unlock(&osb->osb_lock);
95 if (unlikely(free))
96 kfree(free);
97 }
98
99 index = (slot * NUM_LOCAL_SYSTEM_INODES) +
100 (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
101
102 return &local_system_inodes[index];
67} 103}
68 104
69struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, 105struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
74 struct inode **arr = NULL; 110 struct inode **arr = NULL;
75 111
76 /* avoid the lookup if cached in local system file array */ 112 /* avoid the lookup if cached in local system file array */
77 if (is_in_system_inode_array(osb, type, slot)) 113 if (is_global_system_inode(type)) {
78 arr = &(osb->system_inodes[type]); 114 arr = &(osb->global_system_inodes[type]);
115 } else
116 arr = get_local_system_inode(osb, type, slot);
79 117
80 if (arr && ((inode = *arr) != NULL)) { 118 if (arr && ((inode = *arr) != NULL)) {
81 /* get a ref in addition to the array ref */ 119 /* get a ref in addition to the array ref */
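
Two things happen in the sysfile.c change above: local system inodes move out of the old fixed array into a lazily allocated max_slots x NUM_LOCAL_SYSTEM_INODES table, and the table itself is allocated optimistically outside osb_lock with a recheck under it, freeing the loser on a race. A generic userspace sketch of that pattern; the names and the pthread spinlock are stand-ins for the kernel's osb_lock, and the lock is assumed already initialized with pthread_spin_init().

#include <stdlib.h>
#include <pthread.h>

#define NUM_LOCAL 8			/* illustrative table width */

struct osb_like {
	pthread_spinlock_t lock;	/* stand-in for osb_lock */
	void **local;			/* NUM_LOCAL * max_slots entries */
	unsigned max_slots;
};

static void **get_slot_entry(struct osb_like *osb, unsigned type,
			     unsigned slot)
{
	void **arr, **loser = NULL;

	pthread_spin_lock(&osb->lock);
	arr = osb->local;
	pthread_spin_unlock(&osb->lock);

	if (!arr) {
		/* allocate outside the lock ... */
		arr = calloc(NUM_LOCAL * osb->max_slots, sizeof(*arr));
		if (!arr)
			return NULL;	/* caller falls back to a fresh lookup */

		/* ... then recheck under it */
		pthread_spin_lock(&osb->lock);
		if (osb->local) {	/* someone beat us to it */
			loser = arr;
			arr = osb->local;
		} else
			osb->local = arr;
		pthread_spin_unlock(&osb->lock);
		free(loser);
	}

	/* flattened 2-D indexing: one row of NUM_LOCAL entries per slot */
	return &arr[slot * NUM_LOCAL + type];
}
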
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 06fa5e77c40e..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
7081 goto out; 7081 goto out;
7082 } 7082 }
7083 7083
7084 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) 7084 if (!indexed)
7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); 7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
7086 else 7086 else
7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); 7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 14a22863291a..e043c4cb9a97 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -557,17 +557,16 @@ end:
557 return ret; 557 return ret;
558} 558}
559 559
560static int omfs_get_sb(struct file_system_type *fs_type, 560static struct dentry *omfs_mount(struct file_system_type *fs_type,
561 int flags, const char *dev_name, 561 int flags, const char *dev_name, void *data)
562 void *data, struct vfsmount *m)
563{ 562{
564 return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); 563 return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
565} 564}
566 565
567static struct file_system_type omfs_fs_type = { 566static struct file_system_type omfs_fs_type = {
568 .owner = THIS_MODULE, 567 .owner = THIS_MODULE,
569 .name = "omfs", 568 .name = "omfs",
570 .get_sb = omfs_get_sb, 569 .mount = omfs_mount,
571 .kill_sb = kill_block_super, 570 .kill_sb = kill_block_super,
572 .fs_flags = FS_REQUIRES_DEV, 571 .fs_flags = FS_REQUIRES_DEV,
573}; 572};
diff --git a/fs/open.c b/fs/open.c
index d74e1983e8dc..e52389e1f05b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 /* Return error if mode is not supported */ 225 /* Return error if mode is not supported */
226 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 226 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
227 return -EOPNOTSUPP;
228
229 /* Punch hole must have keep size set */
230 if ((mode & FALLOC_FL_PUNCH_HOLE) &&
231 !(mode & FALLOC_FL_KEEP_SIZE))
227 return -EOPNOTSUPP; 232 return -EOPNOTSUPP;
228 233
229 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
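
With the hook moved from inode_operations to file_operations, the mode check above also admits FALLOC_FL_PUNCH_HOLE, but only in combination with FALLOC_FL_KEEP_SIZE. A userspace sketch of a valid punch-hole call; the file name is illustrative, and the filesystem may still return EOPNOTSUPP if it does not implement hole punching:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 1 << 20))	/* 1 MiB of (sparse) file size */
		return 1;

	/* Deallocate 256K at offset 128K without changing i_size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 * 1024, 256 * 1024))
		perror("fallocate");	/* EOPNOTSUPP if fs lacks support */

	close(fd);
	return 0;
}
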
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
251 return -EFBIG; 256 return -EFBIG;
252 257
253 if (!inode->i_op->fallocate) 258 if (!file->f_op->fallocate)
254 return -EOPNOTSUPP; 259 return -EOPNOTSUPP;
255 260
256 return inode->i_op->fallocate(inode, mode, offset, len); 261 return file->f_op->fallocate(file, mode, offset, len);
257} 262}
258 263
259SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
@@ -786,11 +791,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
786 /* Pick up the filp from the open intent */ 791 /* Pick up the filp from the open intent */
787 filp = nd->intent.open.file; 792 filp = nd->intent.open.file;
788 /* Has the filesystem initialised the file for us? */ 793 /* Has the filesystem initialised the file for us? */
789 if (filp->f_path.dentry == NULL) 794 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path);
790 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, 796 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
791 NULL, cred); 797 NULL, cred);
792 else 798 }
793 path_put(&nd->path);
794 return filp; 799 return filp;
795} 800}
796 801
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012c..a2a5bff774e3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
343 return &oi->vfs_inode; 343 return &oi->vfs_inode;
344} 344}
345 345
346static void openprom_destroy_inode(struct inode *inode) 346static void openprom_i_callback(struct rcu_head *head)
347{ 347{
348 struct inode *inode = container_of(head, struct inode, i_rcu);
349 INIT_LIST_HEAD(&inode->i_dentry);
348 kmem_cache_free(op_inode_cachep, OP_I(inode)); 350 kmem_cache_free(op_inode_cachep, OP_I(inode));
349} 351}
350 352
353static void openprom_destroy_inode(struct inode *inode)
354{
355 call_rcu(&inode->i_rcu, openprom_i_callback);
356}
357
351static struct inode *openprom_iget(struct super_block *sb, ino_t ino) 358static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
352{ 359{
353 struct inode *inode; 360 struct inode *inode;
@@ -415,16 +422,16 @@ out_no_root:
415 return ret; 422 return ret;
416} 423}
417 424
418static int openprom_get_sb(struct file_system_type *fs_type, 425static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 426 int flags, const char *dev_name, void *data)
420{ 427{
421 return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); 428 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 429}
423 430
424static struct file_system_type openprom_fs_type = { 431static struct file_system_type openprom_fs_type = {
425 .owner = THIS_MODULE, 432 .owner = THIS_MODULE,
426 .name = "openpromfs", 433 .name = "openpromfs",
427 .get_sb = openprom_get_sb, 434 .mount = openprom_mount,
428 .kill_sb = kill_anon_super, 435 .kill_sb = kill_anon_super,
429}; 436};
430 437
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..9c21119512b9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
238} 238}
239 239
240ssize_t part_ro_show(struct device *dev,
241 struct device_attribute *attr, char *buf)
242{
243 struct hd_struct *p = dev_to_part(dev);
244 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
245}
246
240ssize_t part_alignment_offset_show(struct device *dev, 247ssize_t part_alignment_offset_show(struct device *dev,
241 struct device_attribute *attr, char *buf) 248 struct device_attribute *attr, char *buf)
242{ 249{
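
The new attribute simply exports hd_struct.policy, so each partition's read-only state becomes visible in sysfs next to start and size. A small userspace sketch reading it; the sda/sda1 path is illustrative:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/sda1/ro", "r");
	int ro;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &ro) == 1)
		printf("sda1 is %s\n", ro ? "read-only" : "read-write");
	fclose(f);
	return 0;
}
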
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
312static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 319static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
313static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 320static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
314static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 321static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
322static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
315static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 323static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
316static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, 324static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
317 NULL); 325 NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
326 &dev_attr_partition.attr, 334 &dev_attr_partition.attr,
327 &dev_attr_start.attr, 335 &dev_attr_start.attr,
328 &dev_attr_size.attr, 336 &dev_attr_size.attr,
337 &dev_attr_ro.attr,
329 &dev_attr_alignment_offset.attr, 338 &dev_attr_alignment_offset.attr,
330 &dev_attr_discard_alignment.attr, 339 &dev_attr_discard_alignment.attr,
331 &dev_attr_stat.attr, 340 &dev_attr_stat.attr,
@@ -352,6 +361,7 @@ static void part_release(struct device *dev)
352{ 361{
353 struct hd_struct *p = dev_to_part(dev); 362 struct hd_struct *p = dev_to_part(dev);
354 free_part_stats(p); 363 free_part_stats(p);
364 free_part_info(p);
355 kfree(p); 365 kfree(p);
356} 366}
357 367
@@ -371,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
371 put_device(part_to_dev(part)); 381 put_device(part_to_dev(part));
372} 382}
373 383
384void __delete_partition(struct hd_struct *part)
385{
386 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
387}
388
374void delete_partition(struct gendisk *disk, int partno) 389void delete_partition(struct gendisk *disk, int partno)
375{ 390{
376 struct disk_part_tbl *ptbl = disk->part_tbl; 391 struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -389,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
389 kobject_put(part->holder_dir); 404 kobject_put(part->holder_dir);
390 device_del(part_to_dev(part)); 405 device_del(part_to_dev(part));
391 406
392 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 407 hd_struct_put(part);
393} 408}
394 409
395static ssize_t whole_disk_show(struct device *dev, 410static ssize_t whole_disk_show(struct device *dev,
@@ -401,7 +416,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
401 whole_disk_show, NULL); 416 whole_disk_show, NULL);
402 417
403struct hd_struct *add_partition(struct gendisk *disk, int partno, 418struct hd_struct *add_partition(struct gendisk *disk, int partno,
404 sector_t start, sector_t len, int flags) 419 sector_t start, sector_t len, int flags,
420 struct partition_meta_info *info)
405{ 421{
406 struct hd_struct *p; 422 struct hd_struct *p;
407 dev_t devt = MKDEV(0, 0); 423 dev_t devt = MKDEV(0, 0);
@@ -438,6 +454,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
438 p->partno = partno; 454 p->partno = partno;
439 p->policy = get_disk_ro(disk); 455 p->policy = get_disk_ro(disk);
440 456
457 if (info) {
458 struct partition_meta_info *pinfo = alloc_part_info(disk);
459 if (!pinfo)
460 goto out_free_stats;
461 memcpy(pinfo, info, sizeof(*info));
462 p->info = pinfo;
463 }
464
441 dname = dev_name(ddev); 465 dname = dev_name(ddev);
442 if (isdigit(dname[strlen(dname) - 1])) 466 if (isdigit(dname[strlen(dname) - 1]))
443 dev_set_name(pdev, "%sp%d", dname, partno); 467 dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +475,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
451 475
452 err = blk_alloc_devt(p, &devt); 476 err = blk_alloc_devt(p, &devt);
453 if (err) 477 if (err)
454 goto out_free_stats; 478 goto out_free_info;
455 pdev->devt = devt; 479 pdev->devt = devt;
456 480
457 /* delay uevent until 'holders' subdir is created */ 481 /* delay uevent until 'holders' subdir is created */
@@ -479,8 +503,11 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
479 if (!dev_get_uevent_suppress(ddev)) 503 if (!dev_get_uevent_suppress(ddev))
480 kobject_uevent(&pdev->kobj, KOBJ_ADD); 504 kobject_uevent(&pdev->kobj, KOBJ_ADD);
481 505
506 hd_ref_init(p);
482 return p; 507 return p;
483 508
509out_free_info:
510 free_part_info(p);
484out_free_stats: 511out_free_stats:
485 free_part_stats(p); 512 free_part_stats(p);
486out_free: 513out_free:
@@ -495,65 +522,6 @@ out_put:
495 return ERR_PTR(err); 522 return ERR_PTR(err);
496} 523}
497 524
498/* Not exported, helper to add_disk(). */
499void register_disk(struct gendisk *disk)
500{
501 struct device *ddev = disk_to_dev(disk);
502 struct block_device *bdev;
503 struct disk_part_iter piter;
504 struct hd_struct *part;
505 int err;
506
507 ddev->parent = disk->driverfs_dev;
508
509 dev_set_name(ddev, disk->disk_name);
510
511 /* delay uevents, until we scanned partition table */
512 dev_set_uevent_suppress(ddev, 1);
513
514 if (device_add(ddev))
515 return;
516#ifndef CONFIG_SYSFS_DEPRECATED
517 err = sysfs_create_link(block_depr, &ddev->kobj,
518 kobject_name(&ddev->kobj));
519 if (err) {
520 device_del(ddev);
521 return;
522 }
523#endif
524 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
525 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
526
527 /* No minors to use for partitions */
528 if (!disk_partitionable(disk))
529 goto exit;
530
531 /* No such device (e.g., media were just removed) */
532 if (!get_capacity(disk))
533 goto exit;
534
535 bdev = bdget_disk(disk, 0);
536 if (!bdev)
537 goto exit;
538
539 bdev->bd_invalidated = 1;
540 err = blkdev_get(bdev, FMODE_READ);
541 if (err < 0)
542 goto exit;
543 blkdev_put(bdev, FMODE_READ);
544
545exit:
546 /* announce disk after possible partitions are created */
547 dev_set_uevent_suppress(ddev, 0);
548 kobject_uevent(&ddev->kobj, KOBJ_ADD);
549
550 /* announce possible partitions */
551 disk_part_iter_init(&piter, disk, 0);
552 while ((part = disk_part_iter_next(&piter)))
553 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
554 disk_part_iter_exit(&piter);
555}
556
557static bool disk_unlock_native_capacity(struct gendisk *disk) 525static bool disk_unlock_native_capacity(struct gendisk *disk)
558{ 526{
559 const struct block_device_operations *bdops = disk->fops; 527 const struct block_device_operations *bdops = disk->fops;
@@ -642,6 +610,7 @@ rescan:
642 /* add partitions */ 610 /* add partitions */
643 for (p = 1; p < state->limit; p++) { 611 for (p = 1; p < state->limit; p++) {
644 sector_t size, from; 612 sector_t size, from;
613 struct partition_meta_info *info = NULL;
645 614
646 size = state->parts[p].size; 615 size = state->parts[p].size;
647 if (!size) 616 if (!size)
@@ -675,8 +644,12 @@ rescan:
675 size = get_capacity(disk) - from; 644 size = get_capacity(disk) - from;
676 } 645 }
677 } 646 }
647
648 if (state->parts[p].has_info)
649 info = &state->parts[p].info;
678 part = add_partition(disk, p, from, size, 650 part = add_partition(disk, p, from, size,
679 state->parts[p].flags); 651 state->parts[p].flags,
652 &state->parts[p].info);
680 if (IS_ERR(part)) { 653 if (IS_ERR(part)) {
681 printk(KERN_ERR " %s: p%d could not be added: %ld\n", 654 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
682 disk->disk_name, p, -PTR_ERR(part)); 655 disk->disk_name, p, -PTR_ERR(part));
@@ -711,34 +684,3 @@ fail:
711} 684}
712 685
713EXPORT_SYMBOL(read_dev_sector); 686EXPORT_SYMBOL(read_dev_sector);
714
715void del_gendisk(struct gendisk *disk)
716{
717 struct disk_part_iter piter;
718 struct hd_struct *part;
719
720 /* invalidate stuff */
721 disk_part_iter_init(&piter, disk,
722 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
723 while ((part = disk_part_iter_next(&piter))) {
724 invalidate_partition(disk, part->partno);
725 delete_partition(disk, part->partno);
726 }
727 disk_part_iter_exit(&piter);
728
729 invalidate_partition(disk, 0);
730 blk_free_devt(disk_to_dev(disk)->devt);
731 set_capacity(disk, 0);
732 disk->flags &= ~GENHD_FL_UP;
733 unlink_gendisk(disk);
734 part_stat_set_all(&disk->part0, 0);
735 disk->part0.stamp = 0;
736
737 kobject_put(disk->part0.holder_dir);
738 kobject_put(disk->slave_dir);
739 disk->driverfs_dev = NULL;
740#ifndef CONFIG_SYSFS_DEPRECATED
741 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
742#endif
743 device_del(disk_to_dev(disk));
744}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
1#include <linux/pagemap.h> 1#include <linux/pagemap.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/genhd.h>
3 4
4/* 5/*
5 * add_gd_partition adds a partition's details to the device's partition 6
@@ -12,6 +13,8 @@ struct parsed_partitions {
12 sector_t from; 13 sector_t from;
13 sector_t size; 14 sector_t size;
14 int flags; 15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
15 } parts[DISK_MAX_PARTS]; 18 } parts[DISK_MAX_PARTS];
16 int next; 19 int next;
17 int limit; 20 int limit;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
94 * 94 *
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/ctype.h>
97#include <linux/math64.h> 98#include <linux/math64.h>
98#include <linux/slab.h> 99#include <linux/slab.h>
99#include "check.h" 100#include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
604 gpt_entry *ptes = NULL; 605 gpt_entry *ptes = NULL;
605 u32 i; 606 u32 i;
606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512; 607 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
608 u8 unparsed_guid[37];
607 609
608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { 610 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
609 kfree(gpt); 611 kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
614 pr_debug("GUID Partition Table is valid! Yea!\n"); 616 pr_debug("GUID Partition Table is valid! Yea!\n");
615 617
616 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 618 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
619 struct partition_meta_info *info;
620 unsigned label_count = 0;
621 unsigned label_max;
617 u64 start = le64_to_cpu(ptes[i].starting_lba); 622 u64 start = le64_to_cpu(ptes[i].starting_lba);
618 u64 size = le64_to_cpu(ptes[i].ending_lba) - 623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
627 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
628 PARTITION_LINUX_RAID_GUID)) 633 PARTITION_LINUX_RAID_GUID))
629 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 634 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635
636 info = &state->parts[i + 1].info;
637 /* Instead of doing a manual swap to big endian, reuse the
638 * common ASCII hex format as the interim.
639 */
640 efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
641 part_pack_uuid(unparsed_guid, info->uuid);
642
643 /* Naively convert UTF16-LE to 7 bits. */
644 label_max = min(sizeof(info->volname) - 1,
645 sizeof(ptes[i].partition_name));
646 info->volname[label_max] = 0;
647 while (label_count < label_max) {
648 u8 c = ptes[i].partition_name[label_count] & 0xff;
649 if (c && !isprint(c))
650 c = '!';
651 info->volname[label_count] = c;
652 label_count++;
653 }
654 state->parts[i + 1].has_info = true;
630 } 655 }
631 kfree(ptes); 656 kfree(ptes);
632 kfree(gpt); 657 kfree(gpt);
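The label loop above takes the low byte of each UTF-16LE code unit and substitutes '!' for unprintable values. The same conversion as a self-contained helper; gpt_label_to_ascii() is hypothetical, and le16_to_cpu() stands in for the patch's direct low-byte mask:

#include <linux/ctype.h>
#include <linux/kernel.h>

static void gpt_label_to_ascii(const __le16 *name, unsigned int nunits,
                               u8 *out, unsigned int outlen)
{
        unsigned int i, max = min(outlen - 1, nunits);

        for (i = 0; i < max; i++) {
                u8 c = le16_to_cpu(name[i]) & 0xff;

                /* keep NULs and printable ASCII, mark everything else */
                out[i] = (c && !isprint(c)) ? '!' : c;
        }
        out[i] = '\0';
}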
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 5bf8a04b5d9b..789c625c7aa5 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it under 10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software 11 * the terms of the GNU General Public License as published by the Free Software
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d86..374242c0971a 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free 11 * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51c..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
382 error = ops->confirm(pipe, buf); 382 error = ops->confirm(pipe, buf);
383 if (error) { 383 if (error) {
384 if (!ret) 384 if (!ret)
385 error = ret; 385 ret = error;
386 break; 386 break;
387 } 387 }
388 388
@@ -441,7 +441,7 @@ redo:
441 break; 441 break;
442 } 442 }
443 if (do_wakeup) { 443 if (do_wakeup) {
444 wake_up_interruptible_sync(&pipe->wait); 444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
446 } 446 }
447 pipe_wait(pipe); 447 pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
450 450
451 /* Signal writers asynchronously that there is more room. */ 451 /* Signal writers asynchronously that there is more room. */
452 if (do_wakeup) { 452 if (do_wakeup) {
453 wake_up_interruptible_sync(&pipe->wait); 453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
455 } 455 }
456 if (ret > 0) 456 if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
612 break; 612 break;
613 } 613 }
614 if (do_wakeup) { 614 if (do_wakeup) {
615 wake_up_interruptible_sync(&pipe->wait); 615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
617 do_wakeup = 0; 617 do_wakeup = 0;
618 } 618 }
@@ -623,7 +623,7 @@ redo2:
623out: 623out:
624 mutex_unlock(&inode->i_mutex); 624 mutex_unlock(&inode->i_mutex);
625 if (do_wakeup) { 625 if (do_wakeup) {
626 wake_up_interruptible_sync(&pipe->wait); 626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
628 } 628 }
629 if (ret > 0) 629 if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
715 if (!pipe->readers && !pipe->writers) { 715 if (!pipe->readers && !pipe->writers) {
716 free_pipe_info(inode); 716 free_pipe_info(inode);
717 } else { 717 } else {
718 wake_up_interruptible_sync(&pipe->wait); 718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
721 } 721 }
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
954 if (!inode) 954 if (!inode)
955 goto fail_inode; 955 goto fail_inode;
956 956
957 inode->i_ino = get_next_ino();
958
957 pipe = alloc_pipe_info(inode); 959 pipe = alloc_pipe_info(inode);
958 if (!pipe) 960 if (!pipe)
959 goto fail_iput; 961 goto fail_iput;
@@ -997,12 +999,11 @@ struct file *create_write_pipe(int flags)
997 goto err; 999 goto err;
998 1000
999 err = -ENOMEM; 1001 err = -ENOMEM;
1000 path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 1002 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
1001 if (!path.dentry) 1003 if (!path.dentry)
1002 goto err_inode; 1004 goto err_inode;
1003 path.mnt = mntget(pipe_mnt); 1005 path.mnt = mntget(pipe_mnt);
1004 1006
1005 path.dentry->d_op = &pipefs_dentry_operations;
1006 d_instantiate(path.dentry, inode); 1007 d_instantiate(path.dentry, inode);
1007 1008
1008 err = -ENFILE; 1009 err = -ENFILE;
@@ -1197,12 +1198,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1197 return ret; 1198 return ret;
1198} 1199}
1199 1200
1201/*
1202 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1203 * location, so checking ->i_pipe is not enough to verify that this is a
1204 * pipe.
1205 */
1206struct pipe_inode_info *get_pipe_info(struct file *file)
1207{
1208 struct inode *i = file->f_path.dentry->d_inode;
1209
1210 return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
1211}
1212
1200long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1213long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1201{ 1214{
1202 struct pipe_inode_info *pipe; 1215 struct pipe_inode_info *pipe;
1203 long ret; 1216 long ret;
1204 1217
1205 pipe = file->f_path.dentry->d_inode->i_pipe; 1218 pipe = get_pipe_info(file);
1206 if (!pipe) 1219 if (!pipe)
1207 return -EBADF; 1220 return -EBADF;
1208 1221
@@ -1239,22 +1252,26 @@ out:
1239 return ret; 1252 return ret;
1240} 1253}
1241 1254
1255static const struct super_operations pipefs_ops = {
1256 .destroy_inode = free_inode_nonrcu,
1257};
1258
1242/* 1259/*
1243 * pipefs should _never_ be mounted by userland - too much of security hassle, 1260 * pipefs should _never_ be mounted by userland - too much of security hassle,
1244 * no real gain from having the whole whorehouse mounted. So we don't need 1261 * no real gain from having the whole whorehouse mounted. So we don't need
1245 * any operations on the root directory. However, we need a non-trivial 1262 * any operations on the root directory. However, we need a non-trivial
1246 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1263 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1247 */ 1264 */
1248static int pipefs_get_sb(struct file_system_type *fs_type, 1265static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1249 int flags, const char *dev_name, void *data, 1266 int flags, const char *dev_name, void *data)
1250 struct vfsmount *mnt)
1251{ 1267{
1252 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1268 return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1269 &pipefs_dentry_operations, PIPEFS_MAGIC);
1253} 1270}
1254 1271
1255static struct file_system_type pipe_fs_type = { 1272static struct file_system_type pipe_fs_type = {
1256 .name = "pipefs", 1273 .name = "pipefs",
1257 .get_sb = pipefs_get_sb, 1274 .mount = pipefs_mount,
1258 .kill_sb = kill_anon_super, 1275 .kill_sb = kill_anon_super,
1259}; 1276};
1260 1277
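The new get_pipe_info() is the only safe way to reach i_pipe now that the field shares a union slot. A hypothetical caller, mirroring the pipe_fcntl() conversion above:

static long example_pipe_query(struct file *file)
{
        struct pipe_inode_info *pipe = get_pipe_info(file);

        if (!pipe)      /* the union slot may really hold i_bdev or i_cdev */
                return -EBADF;

        /* i_mode said S_ISFIFO, so the pointer is trustworthy here */
        return pipe->buffers;
}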
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748f..d42514e32380 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
288 */ 288 */
289static inline int do_refcount_check(struct vfsmount *mnt, int count) 289static inline int do_refcount_check(struct vfsmount *mnt, int count)
290{ 290{
291 int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; 291 int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
292 return (mycount > count); 292 return (mycount > count);
293} 293}
294 294
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
300 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
301 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 * 302 *
303 * vfsmount lock must be held for read or write 303 * vfsmount lock must be held for write
304 */ 304 */
305int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
306{ 306{
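mnt_get_count() replaces the plain atomic_read() because mount refcounts became per-cpu; the total is only coherent while the vfsmount lock is held for write, which is why the locking comment tightens. Roughly what the SMP summation has to look like (a sketch, not the exact fs/namespace.c code):

static unsigned int mnt_get_count_sketch(struct vfsmount *mnt)
{
        unsigned int count = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
        return count;
}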
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec25..b1cf6bf4b41d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init);
25EXPORT_SYMBOL(posix_acl_alloc); 26EXPORT_SYMBOL(posix_acl_alloc);
26EXPORT_SYMBOL(posix_acl_clone); 27EXPORT_SYMBOL(posix_acl_clone);
27EXPORT_SYMBOL(posix_acl_valid); 28EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
32EXPORT_SYMBOL(posix_acl_permission); 33EXPORT_SYMBOL(posix_acl_permission);
33 34
34/* 35/*
36 * Init a fresh posix_acl
37 */
38void
39posix_acl_init(struct posix_acl *acl, int count)
40{
41 atomic_set(&acl->a_refcount, 1);
42 acl->a_count = count;
43}
44
45/*
35 * Allocate a new ACL with the specified number of entries. 46 * Allocate a new ACL with the specified number of entries.
36 */ 47 */
37struct posix_acl * 48struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
40 const size_t size = sizeof(struct posix_acl) + 51 const size_t size = sizeof(struct posix_acl) +
41 count * sizeof(struct posix_acl_entry); 52 count * sizeof(struct posix_acl_entry);
42 struct posix_acl *acl = kmalloc(size, flags); 53 struct posix_acl *acl = kmalloc(size, flags);
43 if (acl) { 54 if (acl)
44 atomic_set(&acl->a_refcount, 1); 55 posix_acl_init(acl, count);
45 acl->a_count = count;
46 }
47 return acl; 56 return acl;
48} 57}
49 58
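Exporting posix_acl_init() lets callers that allocate or embed their own struct posix_acl skip posix_acl_alloc(). A hedged illustration; alloc_acl_example() and its sizing are hypothetical:

#include <linux/slab.h>
#include <linux/posix_acl.h>

static struct posix_acl *alloc_acl_example(int count, gfp_t flags)
{
        struct posix_acl *acl = kmalloc(sizeof(*acl) +
                        count * sizeof(struct posix_acl_entry), flags);

        if (acl)
                posix_acl_init(acl, count);     /* refcount 1, a_count set */
        return acl;
}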
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
1config PROC_FS 1config PROC_FS
2 bool "/proc file system support" if EMBEDDED 2 bool "/proc file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 This is a virtual file system providing information about the status 5 This is a virtual file system providing information about the status
@@ -33,14 +33,14 @@ config PROC_KCORE
33 depends on PROC_FS && MMU 33 depends on PROC_FS && MMU
34 34
35config PROC_VMCORE 35config PROC_VMCORE
36 bool "/proc/vmcore support (EXPERIMENTAL)" 36 bool "/proc/vmcore support"
37 depends on PROC_FS && CRASH_DUMP 37 depends on PROC_FS && CRASH_DUMP
38 default y 38 default y
39 help 39 help
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
41 41
42config PROC_SYSCTL 42config PROC_SYSCTL
43 bool "Sysctl support (/proc/sys)" if EMBEDDED 43 bool "Sysctl support (/proc/sys)" if EXPERT
44 depends on PROC_FS 44 depends on PROC_FS
45 select SYSCTL 45 select SYSCTL
46 default y 46 default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
61config PROC_PAGE_MONITOR 61config PROC_PAGE_MONITOR
62 default y 62 default y
63 depends on PROC_FS && MMU 63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED 64 bool "Enable /proc page monitoring" if EXPERT
65 help 65 help
66 Various /proc files exist to monitor process memory utilization: 66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, 67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc518..df434c5f28fb 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o
10proc-y += inode.o root.o base.o generic.o array.o \ 10proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o 11 proc_tty.o
12proc-y += cmdline.o 12proc-y += cmdline.o
13proc-y += consoles.o
13proc-y += cpuinfo.o 14proc-y += cpuinfo.o
14proc-y += devices.o 15proc-y += devices.o
15proc-y += interrupts.o 16proc-y += interrupts.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676ae..df2b703b9d0f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
97 97
98 seq_printf(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
99 end = m->buf + m->size; 99 end = m->buf + m->size;
100 buf = m->buf + m->count; 100 buf = m->buf + m->count;
101 name = tcomm; 101 name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
122 buf++; 122 buf++;
123 } 123 }
124 m->count = buf - m->buf; 124 m->count = buf - m->buf;
125 seq_printf(m, "\n"); 125 seq_putc(m, '\n');
126} 126}
127 127
128/* 128/*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
208 seq_printf(m, "%d ", GROUP_AT(group_info, g)); 208 seq_printf(m, "%d ", GROUP_AT(group_info, g));
209 put_cred(cred); 209 put_cred(cred);
210 210
211 seq_printf(m, "\n"); 211 seq_putc(m, '\n');
212} 212}
213 213
214static void render_sigset_t(struct seq_file *m, const char *header, 214static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
216{ 216{
217 int i; 217 int i;
218 218
219 seq_printf(m, "%s", header); 219 seq_puts(m, header);
220 220
221 i = _NSIG; 221 i = _NSIG;
222 do { 222 do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
230 seq_printf(m, "%x", x); 230 seq_printf(m, "%x", x);
231 } while (i >= 4); 231 } while (i >= 4);
232 232
233 seq_printf(m, "\n"); 233 seq_putc(m, '\n');
234} 234}
235 235
236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, 236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
291{ 291{
292 unsigned __capi; 292 unsigned __capi;
293 293
294 seq_printf(m, "%s", header); 294 seq_puts(m, header);
295 CAP_FOR_EACH_U32(__capi) { 295 CAP_FOR_EACH_U32(__capi) {
296 seq_printf(m, "%08x", 296 seq_printf(m, "%08x",
297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
298 } 298 }
299 seq_printf(m, "\n"); 299 seq_putc(m, '\n');
300} 300}
301 301
302static inline void task_cap(struct seq_file *m, struct task_struct *p) 302static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
329 329
330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
331{ 331{
332 seq_printf(m, "Cpus_allowed:\t"); 332 seq_puts(m, "Cpus_allowed:\t");
333 seq_cpumask(m, &task->cpus_allowed); 333 seq_cpumask(m, &task->cpus_allowed);
334 seq_printf(m, "\n"); 334 seq_putc(m, '\n');
335 seq_printf(m, "Cpus_allowed_list:\t"); 335 seq_puts(m, "Cpus_allowed_list:\t");
336 seq_cpumask_list(m, &task->cpus_allowed); 336 seq_cpumask_list(m, &task->cpus_allowed);
337 seq_printf(m, "\n"); 337 seq_putc(m, '\n');
338} 338}
339 339
340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, 535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
536 struct pid *pid, struct task_struct *task) 536 struct pid *pid, struct task_struct *task)
537{ 537{
538 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 538 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
539 struct mm_struct *mm = get_task_mm(task); 539 struct mm_struct *mm = get_task_mm(task);
540 540
541 if (mm) { 541 if (mm) {
542 size = task_statm(mm, &shared, &text, &data, &resident); 542 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 543 mmput(mm);
544 } 544 }
545 seq_printf(m, "%d %d %d %d %d %d %d\n", 545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 size, resident, shared, text, lib, data, 0); 546 size, resident, shared, text, data);
547 547
548 return 0; 548 return 0;
549} 549}
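The seq_file conversions above all follow one rule: only calls that actually format a value keep seq_printf(). A compact restatement with a hypothetical show helper:

static void show_value_example(struct seq_file *m, const char *name,
                               unsigned long v)
{
        seq_puts(m, name);              /* constant string: no format parsing */
        seq_printf(m, "%08lx", v);      /* formatted value: keep seq_printf */
        seq_putc(m, '\n');              /* single character: cheapest call */
}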
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e4addaa5424..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 226{
227 struct mm_struct *mm; 227 struct mm_struct *mm;
228 228
229 if (mutex_lock_killable(&task->cred_guard_mutex)) 229 if (mutex_lock_killable(&task->signal->cred_guard_mutex))
230 return NULL; 230 return NULL;
231 231
232 mm = get_task_mm(task); 232 mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
235 mmput(mm); 235 mmput(mm);
236 mm = NULL; 236 mm = NULL;
237 } 237 }
238 mutex_unlock(&task->cred_guard_mutex); 238 mutex_unlock(&task->signal->cred_guard_mutex);
239 239
240 return mm; 240 return mm;
241} 241}
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
373 return -ESRCH; 373 return -ESRCH;
374 seq_puts(m, "Latency Top version : v0.1\n"); 374 seq_puts(m, "Latency Top version : v0.1\n");
375 for (i = 0; i < 32; i++) { 375 for (i = 0; i < 32; i++) {
376 if (task->latency_record[i].backtrace[0]) { 376 struct latency_record *lr = &task->latency_record[i];
377 if (lr->backtrace[0]) {
377 int q; 378 int q;
378 seq_printf(m, "%i %li %li ", 379 seq_printf(m, "%i %li %li",
379 task->latency_record[i].count, 380 lr->count, lr->time, lr->max);
380 task->latency_record[i].time,
381 task->latency_record[i].max);
382 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 381 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
383 char sym[KSYM_SYMBOL_LEN]; 382 unsigned long bt = lr->backtrace[q];
384 char *c; 383 if (!bt)
385 if (!task->latency_record[i].backtrace[q])
386 break; 384 break;
387 if (task->latency_record[i].backtrace[q] == ULONG_MAX) 385 if (bt == ULONG_MAX)
388 break; 386 break;
389 sprint_symbol(sym, task->latency_record[i].backtrace[q]); 387 seq_printf(m, " %ps", (void *)bt);
390 c = strchr(sym, '+');
391 if (c)
392 *c = 0;
393 seq_printf(m, "%s ", sym);
394 } 388 }
395 seq_printf(m, "\n"); 389 seq_putc(m, '\n');
396 } 390 }
397 391
398 } 392 }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
751 745
752static int proc_single_open(struct inode *inode, struct file *filp) 746static int proc_single_open(struct inode *inode, struct file *filp)
753{ 747{
754 int ret; 748 return single_open(filp, proc_single_show, inode);
755 ret = single_open(filp, proc_single_show, NULL);
756 if (!ret) {
757 struct seq_file *m = filp->private_data;
758
759 m->private = inode;
760 }
761 return ret;
762} 749}
763 750
764static const struct file_operations proc_single_file_operations = { 751static const struct file_operations proc_single_file_operations = {
@@ -771,6 +758,8 @@ static const struct file_operations proc_single_file_operations = {
771static int mem_open(struct inode* inode, struct file* file) 758static int mem_open(struct inode* inode, struct file* file)
772{ 759{
773 file->private_data = (void*)((long)current->self_exec_id); 760 file->private_data = (void*)((long)current->self_exec_id);
761 /* OK to pass negative loff_t, we can catch out-of-range */
762 file->f_mode |= FMODE_UNSIGNED_OFFSET;
774 return 0; 763 return 0;
775} 764}
776 765
@@ -1023,28 +1012,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1023 memset(buffer, 0, sizeof(buffer)); 1012 memset(buffer, 0, sizeof(buffer));
1024 if (count > sizeof(buffer) - 1) 1013 if (count > sizeof(buffer) - 1)
1025 count = sizeof(buffer) - 1; 1014 count = sizeof(buffer) - 1;
1026 if (copy_from_user(buffer, buf, count)) 1015 if (copy_from_user(buffer, buf, count)) {
1027 return -EFAULT; 1016 err = -EFAULT;
1017 goto out;
1018 }
1028 1019
1029 err = strict_strtol(strstrip(buffer), 0, &oom_adjust); 1020 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1030 if (err) 1021 if (err)
1031 return -EINVAL; 1022 goto out;
1032 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1023 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1033 oom_adjust != OOM_DISABLE) 1024 oom_adjust != OOM_DISABLE) {
1034 return -EINVAL; 1025 err = -EINVAL;
1026 goto out;
1027 }
1035 1028
1036 task = get_proc_task(file->f_path.dentry->d_inode); 1029 task = get_proc_task(file->f_path.dentry->d_inode);
1037 if (!task) 1030 if (!task) {
1038 return -ESRCH; 1031 err = -ESRCH;
1032 goto out;
1033 }
1034
1035 task_lock(task);
1036 if (!task->mm) {
1037 err = -EINVAL;
1038 goto err_task_lock;
1039 }
1040
1039 if (!lock_task_sighand(task, &flags)) { 1041 if (!lock_task_sighand(task, &flags)) {
1040 put_task_struct(task); 1042 err = -ESRCH;
1041 return -ESRCH; 1043 goto err_task_lock;
1042 } 1044 }
1043 1045
1044 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { 1046 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1045 unlock_task_sighand(task, &flags); 1047 err = -EACCES;
1046 put_task_struct(task); 1048 goto err_sighand;
1047 return -EACCES; 1049 }
1050
1051 if (oom_adjust != task->signal->oom_adj) {
1052 if (oom_adjust == OOM_DISABLE)
1053 atomic_inc(&task->mm->oom_disable_count);
1054 if (task->signal->oom_adj == OOM_DISABLE)
1055 atomic_dec(&task->mm->oom_disable_count);
1048 } 1056 }
1049 1057
1050 /* 1058 /*
@@ -1065,10 +1073,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1065 else 1073 else
1066 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / 1074 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1067 -OOM_DISABLE; 1075 -OOM_DISABLE;
1076err_sighand:
1068 unlock_task_sighand(task, &flags); 1077 unlock_task_sighand(task, &flags);
1078err_task_lock:
1079 task_unlock(task);
1069 put_task_struct(task); 1080 put_task_struct(task);
1070 1081out:
1071 return count; 1082 return err < 0 ? err : count;
1072} 1083}
1073 1084
1074static const struct file_operations proc_oom_adjust_operations = { 1085static const struct file_operations proc_oom_adjust_operations = {
@@ -1109,31 +1120,52 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1109 memset(buffer, 0, sizeof(buffer)); 1120 memset(buffer, 0, sizeof(buffer));
1110 if (count > sizeof(buffer) - 1) 1121 if (count > sizeof(buffer) - 1)
1111 count = sizeof(buffer) - 1; 1122 count = sizeof(buffer) - 1;
1112 if (copy_from_user(buffer, buf, count)) 1123 if (copy_from_user(buffer, buf, count)) {
1113 return -EFAULT; 1124 err = -EFAULT;
1125 goto out;
1126 }
1114 1127
1115 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); 1128 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
1116 if (err) 1129 if (err)
1117 return -EINVAL; 1130 goto out;
1118 if (oom_score_adj < OOM_SCORE_ADJ_MIN || 1131 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1119 oom_score_adj > OOM_SCORE_ADJ_MAX) 1132 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1120 return -EINVAL; 1133 err = -EINVAL;
1134 goto out;
1135 }
1121 1136
1122 task = get_proc_task(file->f_path.dentry->d_inode); 1137 task = get_proc_task(file->f_path.dentry->d_inode);
1123 if (!task) 1138 if (!task) {
1124 return -ESRCH; 1139 err = -ESRCH;
1140 goto out;
1141 }
1142
1143 task_lock(task);
1144 if (!task->mm) {
1145 err = -EINVAL;
1146 goto err_task_lock;
1147 }
1148
1125 if (!lock_task_sighand(task, &flags)) { 1149 if (!lock_task_sighand(task, &flags)) {
1126 put_task_struct(task); 1150 err = -ESRCH;
1127 return -ESRCH; 1151 goto err_task_lock;
1128 } 1152 }
1129 if (oom_score_adj < task->signal->oom_score_adj && 1153
1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1130 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1131 unlock_task_sighand(task, &flags); 1156 err = -EACCES;
1132 put_task_struct(task); 1157 goto err_sighand;
1133 return -EACCES;
1134 } 1158 }
1135 1159
1160 if (oom_score_adj != task->signal->oom_score_adj) {
1161 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1162 atomic_inc(&task->mm->oom_disable_count);
1163 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1164 atomic_dec(&task->mm->oom_disable_count);
1165 }
1136 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1137 /* 1169 /*
1138 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1139 * always attainable. 1171 * always attainable.
@@ -1143,14 +1175,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1143 else 1175 else
1144 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / 1176 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1145 OOM_SCORE_ADJ_MAX; 1177 OOM_SCORE_ADJ_MAX;
1178err_sighand:
1146 unlock_task_sighand(task, &flags); 1179 unlock_task_sighand(task, &flags);
1180err_task_lock:
1181 task_unlock(task);
1147 put_task_struct(task); 1182 put_task_struct(task);
1148 return count; 1183out:
1184 return err < 0 ? err : count;
1149} 1185}
1150 1186
1151static const struct file_operations proc_oom_score_adj_operations = { 1187static const struct file_operations proc_oom_score_adj_operations = {
1152 .read = oom_score_adj_read, 1188 .read = oom_score_adj_read,
1153 .write = oom_score_adj_write, 1189 .write = oom_score_adj_write,
1190 .llseek = default_llseek,
1154}; 1191};
1155 1192
1156#ifdef CONFIG_AUDITSYSCALL 1193#ifdef CONFIG_AUDITSYSCALL
@@ -1338,9 +1375,77 @@ sched_write(struct file *file, const char __user *buf,
1338 1375
1339static int sched_open(struct inode *inode, struct file *filp) 1376static int sched_open(struct inode *inode, struct file *filp)
1340{ 1377{
1378 return single_open(filp, sched_show, inode);
1379}
1380
1381static const struct file_operations proc_pid_sched_operations = {
1382 .open = sched_open,
1383 .read = seq_read,
1384 .write = sched_write,
1385 .llseek = seq_lseek,
1386 .release = single_release,
1387};
1388
1389#endif
1390
1391#ifdef CONFIG_SCHED_AUTOGROUP
1392/*
1393 * Print out autogroup related information:
1394 */
1395static int sched_autogroup_show(struct seq_file *m, void *v)
1396{
1397 struct inode *inode = m->private;
1398 struct task_struct *p;
1399
1400 p = get_proc_task(inode);
1401 if (!p)
1402 return -ESRCH;
1403 proc_sched_autogroup_show_task(p, m);
1404
1405 put_task_struct(p);
1406
1407 return 0;
1408}
1409
1410static ssize_t
1411sched_autogroup_write(struct file *file, const char __user *buf,
1412 size_t count, loff_t *offset)
1413{
1414 struct inode *inode = file->f_path.dentry->d_inode;
1415 struct task_struct *p;
1416 char buffer[PROC_NUMBUF];
1417 long nice;
1418 int err;
1419
1420 memset(buffer, 0, sizeof(buffer));
1421 if (count > sizeof(buffer) - 1)
1422 count = sizeof(buffer) - 1;
1423 if (copy_from_user(buffer, buf, count))
1424 return -EFAULT;
1425
1426 err = strict_strtol(strstrip(buffer), 0, &nice);
1427 if (err)
1428 return -EINVAL;
1429
1430 p = get_proc_task(inode);
1431 if (!p)
1432 return -ESRCH;
1433
1434 err = nice;
1435 err = proc_sched_autogroup_set_nice(p, &err);
1436 if (err)
1437 count = err;
1438
1439 put_task_struct(p);
1440
1441 return count;
1442}
1443
1444static int sched_autogroup_open(struct inode *inode, struct file *filp)
1445{
1341 int ret; 1446 int ret;
1342 1447
1343 ret = single_open(filp, sched_show, NULL); 1448 ret = single_open(filp, sched_autogroup_show, NULL);
1344 if (!ret) { 1449 if (!ret) {
1345 struct seq_file *m = filp->private_data; 1450 struct seq_file *m = filp->private_data;
1346 1451
@@ -1349,15 +1454,15 @@ static int sched_open(struct inode *inode, struct file *filp)
1349 return ret; 1454 return ret;
1350} 1455}
1351 1456
1352static const struct file_operations proc_pid_sched_operations = { 1457static const struct file_operations proc_pid_sched_autogroup_operations = {
1353 .open = sched_open, 1458 .open = sched_autogroup_open,
1354 .read = seq_read, 1459 .read = seq_read,
1355 .write = sched_write, 1460 .write = sched_autogroup_write,
1356 .llseek = seq_lseek, 1461 .llseek = seq_lseek,
1357 .release = single_release, 1462 .release = single_release,
1358}; 1463};
1359 1464
1360#endif 1465#endif /* CONFIG_SCHED_AUTOGROUP */
1361 1466
1362static ssize_t comm_write(struct file *file, const char __user *buf, 1467static ssize_t comm_write(struct file *file, const char __user *buf,
1363 size_t count, loff_t *offset) 1468 size_t count, loff_t *offset)
@@ -1406,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
1406 1511
1407static int comm_open(struct inode *inode, struct file *filp) 1512static int comm_open(struct inode *inode, struct file *filp)
1408{ 1513{
1409 int ret; 1514 return single_open(filp, comm_show, inode);
1410
1411 ret = single_open(filp, comm_show, NULL);
1412 if (!ret) {
1413 struct seq_file *m = filp->private_data;
1414
1415 m->private = inode;
1416 }
1417 return ret;
1418} 1515}
1419 1516
1420static const struct file_operations proc_pid_set_comm_operations = { 1517static const struct file_operations proc_pid_set_comm_operations = {
@@ -1526,7 +1623,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1526 if (!tmp) 1623 if (!tmp)
1527 return -ENOMEM; 1624 return -ENOMEM;
1528 1625
1529 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); 1626 pathname = d_path(path, tmp, PAGE_SIZE);
1530 len = PTR_ERR(pathname); 1627 len = PTR_ERR(pathname);
1531 if (IS_ERR(pathname)) 1628 if (IS_ERR(pathname))
1532 goto out; 1629 goto out;
@@ -1600,6 +1697,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1600 1697
1601 /* Common stuff */ 1698 /* Common stuff */
1602 ei = PROC_I(inode); 1699 ei = PROC_I(inode);
1700 inode->i_ino = get_next_ino();
1603 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1701 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1604 inode->i_op = &proc_def_inode_operations; 1702 inode->i_op = &proc_def_inode_operations;
1605 1703
@@ -1670,10 +1768,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1670 */ 1768 */
1671static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1769static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1672{ 1770{
1673 struct inode *inode = dentry->d_inode; 1771 struct inode *inode;
1674 struct task_struct *task = get_proc_task(inode); 1772 struct task_struct *task;
1675 const struct cred *cred; 1773 const struct cred *cred;
1676 1774
1775 if (nd && nd->flags & LOOKUP_RCU)
1776 return -ECHILD;
1777
1778 inode = dentry->d_inode;
1779 task = get_proc_task(inode);
1780
1677 if (task) { 1781 if (task) {
1678 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1782 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1679 task_dumpable(task)) { 1783 task_dumpable(task)) {
@@ -1695,7 +1799,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1695 return 0; 1799 return 0;
1696} 1800}
1697 1801
1698static int pid_delete_dentry(struct dentry * dentry) 1802static int pid_delete_dentry(const struct dentry * dentry)
1699{ 1803{
1700 /* Is the task we represent dead? 1804 /* Is the task we represent dead?
1701 * If so, then don't put the dentry on the lru list, 1805 * If so, then don't put the dentry on the lru list,
@@ -1839,12 +1943,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
1839 1943
1840static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) 1944static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1841{ 1945{
1842 struct inode *inode = dentry->d_inode; 1946 struct inode *inode;
1843 struct task_struct *task = get_proc_task(inode); 1947 struct task_struct *task;
1844 int fd = proc_fd(inode); 1948 int fd;
1845 struct files_struct *files; 1949 struct files_struct *files;
1846 const struct cred *cred; 1950 const struct cred *cred;
1847 1951
1952 if (nd && nd->flags & LOOKUP_RCU)
1953 return -ECHILD;
1954
1955 inode = dentry->d_inode;
1956 task = get_proc_task(inode);
1957 fd = proc_fd(inode);
1958
1848 if (task) { 1959 if (task) {
1849 files = get_files_struct(task); 1960 files = get_files_struct(task);
1850 if (files) { 1961 if (files) {
@@ -1920,7 +2031,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
1920 inode->i_op = &proc_pid_link_inode_operations; 2031 inode->i_op = &proc_pid_link_inode_operations;
1921 inode->i_size = 64; 2032 inode->i_size = 64;
1922 ei->op.proc_get_link = proc_fd_link; 2033 ei->op.proc_get_link = proc_fd_link;
1923 dentry->d_op = &tid_fd_dentry_operations; 2034 d_set_d_op(dentry, &tid_fd_dentry_operations);
1924 d_add(dentry, inode); 2035 d_add(dentry, inode);
1925 /* Close the race of the process dying before we return the dentry */ 2036 /* Close the race of the process dying before we return the dentry */
1926 if (tid_fd_revalidate(dentry, NULL)) 2037 if (tid_fd_revalidate(dentry, NULL))
@@ -2039,22 +2150,26 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
2039static const struct file_operations proc_fdinfo_file_operations = { 2150static const struct file_operations proc_fdinfo_file_operations = {
2040 .open = nonseekable_open, 2151 .open = nonseekable_open,
2041 .read = proc_fdinfo_read, 2152 .read = proc_fdinfo_read,
2153 .llseek = no_llseek,
2042}; 2154};
2043 2155
2044static const struct file_operations proc_fd_operations = { 2156static const struct file_operations proc_fd_operations = {
2045 .read = generic_read_dir, 2157 .read = generic_read_dir,
2046 .readdir = proc_readfd, 2158 .readdir = proc_readfd,
2159 .llseek = default_llseek,
2047}; 2160};
2048 2161
2049/* 2162/*
2050 * /proc/pid/fd needs a special permission handler so that a process can still 2163 * /proc/pid/fd needs a special permission handler so that a process can still
2051 * access /proc/self/fd after it has executed a setuid(). 2164 * access /proc/self/fd after it has executed a setuid().
2052 */ 2165 */
2053static int proc_fd_permission(struct inode *inode, int mask) 2166static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
2054{ 2167{
2055 int rv; 2168 int rv;
2056 2169
2057 rv = generic_permission(inode, mask, NULL); 2170 if (flags & IPERM_FLAG_RCU)
2171 return -ECHILD;
2172 rv = generic_permission(inode, mask, flags, NULL);
2058 if (rv == 0) 2173 if (rv == 0)
2059 return 0; 2174 return 0;
2060 if (task_pid(current) == proc_pid(inode)) 2175 if (task_pid(current) == proc_pid(inode))
@@ -2086,7 +2201,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2086 ei->fd = fd; 2201 ei->fd = fd;
2087 inode->i_mode = S_IFREG | S_IRUSR; 2202 inode->i_mode = S_IFREG | S_IRUSR;
2088 inode->i_fop = &proc_fdinfo_file_operations; 2203 inode->i_fop = &proc_fdinfo_file_operations;
2089 dentry->d_op = &tid_fd_dentry_operations; 2204 d_set_d_op(dentry, &tid_fd_dentry_operations);
2090 d_add(dentry, inode); 2205 d_add(dentry, inode);
2091 /* Close the race of the process dying before we return the dentry */ 2206 /* Close the race of the process dying before we return the dentry */
2092 if (tid_fd_revalidate(dentry, NULL)) 2207 if (tid_fd_revalidate(dentry, NULL))
@@ -2112,6 +2227,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2112static const struct file_operations proc_fdinfo_operations = { 2227static const struct file_operations proc_fdinfo_operations = {
2113 .read = generic_read_dir, 2228 .read = generic_read_dir,
2114 .readdir = proc_readfdinfo, 2229 .readdir = proc_readfdinfo,
2230 .llseek = default_llseek,
2115}; 2231};
2116 2232
2117/* 2233/*
@@ -2144,7 +2260,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2144 if (p->fop) 2260 if (p->fop)
2145 inode->i_fop = p->fop; 2261 inode->i_fop = p->fop;
2146 ei->op = p->op; 2262 ei->op = p->op;
2147 dentry->d_op = &pid_dentry_operations; 2263 d_set_d_op(dentry, &pid_dentry_operations);
2148 d_add(dentry, inode); 2264 d_add(dentry, inode);
2149 /* Close the race of the process dying before we return the dentry */ 2265 /* Close the race of the process dying before we return the dentry */
2150 if (pid_revalidate(dentry, NULL)) 2266 if (pid_revalidate(dentry, NULL))
@@ -2302,14 +2418,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2302 goto out_free; 2418 goto out_free;
2303 2419
2304 /* Guard against adverse ptrace interaction */ 2420 /* Guard against adverse ptrace interaction */
2305 length = mutex_lock_interruptible(&task->cred_guard_mutex); 2421 length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
2306 if (length < 0) 2422 if (length < 0)
2307 goto out_free; 2423 goto out_free;
2308 2424
2309 length = security_setprocattr(task, 2425 length = security_setprocattr(task,
2310 (char*)file->f_path.dentry->d_name.name, 2426 (char*)file->f_path.dentry->d_name.name,
2311 (void*)page, count); 2427 (void*)page, count);
2312 mutex_unlock(&task->cred_guard_mutex); 2428 mutex_unlock(&task->signal->cred_guard_mutex);
2313out_free: 2429out_free:
2314 free_page((unsigned long) page); 2430 free_page((unsigned long) page);
2315out: 2431out:
@@ -2343,6 +2459,7 @@ static int proc_attr_dir_readdir(struct file * filp,
2343static const struct file_operations proc_attr_dir_operations = { 2459static const struct file_operations proc_attr_dir_operations = {
2344 .read = generic_read_dir, 2460 .read = generic_read_dir,
2345 .readdir = proc_attr_dir_readdir, 2461 .readdir = proc_attr_dir_readdir,
2462 .llseek = default_llseek,
2346}; 2463};
2347 2464
2348static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2465static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2510,8 +2627,14 @@ static const struct pid_entry proc_base_stuff[] = {
2510 */ 2627 */
2511static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) 2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2512{ 2629{
2513 struct inode *inode = dentry->d_inode; 2630 struct inode *inode;
2514 struct task_struct *task = get_proc_task(inode); 2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2515 if (task) { 2638 if (task) {
2516 put_task_struct(task); 2639 put_task_struct(task);
2517 return 1; 2640 return 1;
@@ -2542,6 +2665,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2542 2665
2543 /* Initialize the inode */ 2666 /* Initialize the inode */
2544 ei = PROC_I(inode); 2667 ei = PROC_I(inode);
2668 inode->i_ino = get_next_ino();
2545 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2669 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2546 2670
2547 /* 2671 /*
@@ -2561,7 +2685,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2561 if (p->fop) 2685 if (p->fop)
2562 inode->i_fop = p->fop; 2686 inode->i_fop = p->fop;
2563 ei->op = p->op; 2687 ei->op = p->op;
2564 dentry->d_op = &proc_base_dentry_operations; 2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2565 d_add(dentry, inode); 2689 d_add(dentry, inode);
2566 error = NULL; 2690 error = NULL;
2567out: 2691out:
@@ -2679,6 +2803,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2679#ifdef CONFIG_SCHED_DEBUG 2803#ifdef CONFIG_SCHED_DEBUG
2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2681#endif 2805#endif
2806#ifdef CONFIG_SCHED_AUTOGROUP
2807 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2808#endif
2682 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2683#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2684 INF("syscall", S_IRUSR, proc_pid_syscall), 2811 INF("syscall", S_IRUSR, proc_pid_syscall),
@@ -2751,6 +2878,7 @@ static int proc_tgid_base_readdir(struct file * filp,
2751static const struct file_operations proc_tgid_base_operations = { 2878static const struct file_operations proc_tgid_base_operations = {
2752 .read = generic_read_dir, 2879 .read = generic_read_dir,
2753 .readdir = proc_tgid_base_readdir, 2880 .readdir = proc_tgid_base_readdir,
2881 .llseek = default_llseek,
2754}; 2882};
2755 2883
2756static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 2884static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -2871,7 +2999,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2871 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, 2999 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2872 ARRAY_SIZE(tgid_base_stuff)); 3000 ARRAY_SIZE(tgid_base_stuff));
2873 3001
2874 dentry->d_op = &pid_dentry_operations; 3002 d_set_d_op(dentry, &pid_dentry_operations);
2875 3003
2876 d_add(dentry, inode); 3004 d_add(dentry, inode);
2877 /* Close the race of the process dying before we return the dentry */ 3005 /* Close the race of the process dying before we return the dentry */
@@ -3088,6 +3216,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3088static const struct file_operations proc_tid_base_operations = { 3216static const struct file_operations proc_tid_base_operations = {
3089 .read = generic_read_dir, 3217 .read = generic_read_dir,
3090 .readdir = proc_tid_base_readdir, 3218 .readdir = proc_tid_base_readdir,
3219 .llseek = default_llseek,
3091}; 3220};
3092 3221
3093static const struct inode_operations proc_tid_base_inode_operations = { 3222static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3113,7 +3242,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3113 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, 3242 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
3114 ARRAY_SIZE(tid_base_stuff)); 3243 ARRAY_SIZE(tid_base_stuff));
3115 3244
3116 dentry->d_op = &pid_dentry_operations; 3245 d_set_d_op(dentry, &pid_dentry_operations);
3117 3246
3118 d_add(dentry, inode); 3247 d_add(dentry, inode);
3119 /* Close the race of the process dying before we return the dentry */ 3248 /* Close the race of the process dying before we return the dentry */
@@ -3324,4 +3453,5 @@ static const struct inode_operations proc_task_inode_operations = {
3324static const struct file_operations proc_task_operations = { 3453static const struct file_operations proc_task_operations = {
3325 .read = generic_read_dir, 3454 .read = generic_read_dir,
3326 .readdir = proc_task_readdir, 3455 .readdir = proc_task_readdir,
3456 .llseek = default_llseek,
3327}; 3457};
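Several of the base.c hunks collapse an open routine to one line because single_open() already stores its third argument in seq_file->private. The pattern, with hypothetical example names:

static int example_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;       /* stashed by single_open() */

        seq_printf(m, "ino %lu\n", inode->i_ino);
        return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
        /* before: single_open(filp, ..., NULL) then m->private = inode */
        return single_open(filp, example_show, inode);
}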
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 000000000000..b701eaa482bf
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright (c) 2010 Werner Fink, Jiri Slaby
3 *
4 * Licensed under GPLv2
5 */
6
7#include <linux/console.h>
8#include <linux/kernel.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/tty_driver.h>
12
13/*
14 * This is the handler for /proc/consoles
15 */
16static int show_console_dev(struct seq_file *m, void *v)
17{
18 static const struct {
19 short flag;
20 char name;
21 } con_flags[] = {
22 { CON_ENABLED, 'E' },
23 { CON_CONSDEV, 'C' },
24 { CON_BOOT, 'B' },
25 { CON_PRINTBUFFER, 'p' },
26 { CON_BRL, 'b' },
27 { CON_ANYTIME, 'a' },
28 };
29 char flags[ARRAY_SIZE(con_flags) + 1];
30 struct console *con = v;
31 unsigned int a;
32 int len;
33 dev_t dev = 0;
34
35 if (con->device) {
36 const struct tty_driver *driver;
37 int index;
38 driver = con->device(con, &index);
39 if (driver) {
40 dev = MKDEV(driver->major, driver->minor_start);
41 dev += index;
42 }
43 }
44
45 for (a = 0; a < ARRAY_SIZE(con_flags); a++)
46 flags[a] = (con->flags & con_flags[a].flag) ?
47 con_flags[a].name : ' ';
48 flags[a] = 0;
49
50 seq_printf(m, "%s%d%n", con->name, con->index, &len);
51 len = 21 - len;
52 if (len < 1)
53 len = 1;
54 seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
55 con->write ? 'W' : '-', con->unblank ? 'U' : '-',
56 flags);
57 if (dev)
58 seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
59
60 seq_printf(m, "\n");
61
62 return 0;
63}
64
65static void *c_start(struct seq_file *m, loff_t *pos)
66{
67 struct console *con;
68 loff_t off = 0;
69
70 console_lock();
71 for_each_console(con)
72 if (off++ == *pos)
73 break;
74
75 return con;
76}
77
78static void *c_next(struct seq_file *m, void *v, loff_t *pos)
79{
80 struct console *con = v;
81 ++*pos;
82 return con->next;
83}
84
85static void c_stop(struct seq_file *m, void *v)
86{
87 console_unlock();
88}
89
90static const struct seq_operations consoles_op = {
91 .start = c_start,
92 .next = c_next,
93 .stop = c_stop,
94 .show = show_console_dev
95};
96
97static int consoles_open(struct inode *inode, struct file *file)
98{
99 return seq_open(file, &consoles_op);
100}
101
102static const struct file_operations proc_consoles_operations = {
103 .open = consoles_open,
104 .read = seq_read,
105 .llseek = seq_lseek,
106 .release = seq_release,
107};
108
109static int __init proc_consoles_init(void)
110{
111 proc_create("consoles", 0, NULL, &proc_consoles_operations);
112 return 0;
113}
114module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c9..b14347167c35 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
9 9
10 if (i < CHRDEV_MAJOR_HASH_SIZE) { 10 if (i < CHRDEV_MAJOR_HASH_SIZE) {
11 if (i == 0) 11 if (i == 0)
12 seq_printf(f, "Character devices:\n"); 12 seq_puts(f, "Character devices:\n");
13 chrdev_show(f, i); 13 chrdev_show(f, i);
14 } 14 }
15#ifdef CONFIG_BLOCK 15#ifdef CONFIG_BLOCK
16 else { 16 else {
17 i -= CHRDEV_MAJOR_HASH_SIZE; 17 i -= CHRDEV_MAJOR_HASH_SIZE;
18 if (i == 0) 18 if (i == 0)
19 seq_printf(f, "\nBlock devices:\n"); 19 seq_puts(f, "\nBlock devices:\n");
20 blkdev_show(f, i); 20 blkdev_show(f, i);
21 } 21 }
22#endif 22#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f0337661..01e07f2a188f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
400 * smarter: we could keep a "volatile" flag in the 400 * smarter: we could keep a "volatile" flag in the
401 * inode to indicate which ones to keep. 401 * inode to indicate which ones to keep.
402 */ 402 */
403static int proc_delete_dentry(struct dentry * dentry) 403static int proc_delete_dentry(const struct dentry * dentry)
404{ 404{
405 return 1; 405 return 1;
406} 406}
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
425 if (de->namelen != dentry->d_name.len) 425 if (de->namelen != dentry->d_name.len)
426 continue; 426 continue;
427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
428 unsigned int ino;
429
430 ino = de->low_ino;
431 pde_get(de); 428 pde_get(de);
432 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
433 error = -EINVAL; 430 error = -EINVAL;
434 inode = proc_get_inode(dir->i_sb, ino, de); 431 inode = proc_get_inode(dir->i_sb, de);
435 goto out_unlock; 432 goto out_unlock;
436 } 433 }
437 } 434 }
@@ -439,7 +436,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
439out_unlock: 436out_unlock:
440 437
441 if (inode) { 438 if (inode) {
442 dentry->d_op = &proc_dentry_operations; 439 d_set_d_op(dentry, &proc_dentry_operations);
443 d_add(dentry, inode); 440 d_add(dentry, inode);
444 return NULL; 441 return NULL;
445 } 442 }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
768 765
769static void free_proc_entry(struct proc_dir_entry *de) 766static void free_proc_entry(struct proc_dir_entry *de)
770{ 767{
771 unsigned int ino = de->low_ino; 768 release_inode_number(de->low_ino);
772
773 if (ino < PROC_DYNAMIC_FIRST)
774 return;
775
776 release_inode_number(ino);
777 769
778 if (S_ISLNK(de->mode)) 770 if (S_ISLNK(de->mode))
779 kfree(de->data); 771 kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
834 826
835 wait_for_completion(de->pde_unload_completion); 827 wait_for_completion(de->pde_unload_completion);
836 828
837 goto continue_removing; 829 spin_lock(&de->pde_unload_lock);
838 } 830 }
839 spin_unlock(&de->pde_unload_lock);
840 831
841continue_removing:
842 spin_lock(&de->pde_unload_lock);
843 while (!list_empty(&de->pde_openers)) { 832 while (!list_empty(&de->pde_openers)) {
844 struct pde_opener *pdeo; 833 struct pde_opener *pdeo;
845 834
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f484879..176ce4cda68a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
16#include <linux/limits.h> 16#include <linux/limits.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 19#include <linux/sysctl.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22 21
@@ -66,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
66 return inode; 65 return inode;
67} 66}
68 67
69static void proc_destroy_inode(struct inode *inode) 68static void proc_i_callback(struct rcu_head *head)
70{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
71 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 72 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
72} 73}
73 74
75static void proc_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, proc_i_callback);
78}
79
74static void init_once(void *foo) 80static void init_once(void *foo)
75{ 81{
76 struct proc_inode *ei = (struct proc_inode *) foo; 82 struct proc_inode *ei = (struct proc_inode *) foo;
@@ -410,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
410}; 416};
411#endif 417#endif
412 418
413struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 419struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
414 struct proc_dir_entry *de)
415{ 420{
416 struct inode * inode; 421 struct inode * inode;
417 422
418 inode = iget_locked(sb, ino); 423 inode = iget_locked(sb, de->low_ino);
419 if (!inode) 424 if (!inode)
420 return NULL; 425 return NULL;
421 if (inode->i_state & I_NEW) { 426 if (inode->i_state & I_NEW) {
@@ -465,7 +470,7 @@ int proc_fill_super(struct super_block *s)
465 s->s_time_gran = 1; 470 s->s_time_gran = 1;
466 471
467 pde_get(&proc_root); 472 pde_get(&proc_root);
468 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 473 root_inode = proc_get_inode(s, &proc_root);
469 if (!root_inode) 474 if (!root_inode)
470 goto out_no_root; 475 goto out_no_root;
471 root_inode->i_uid = 0; 476 root_inode->i_uid = 0;
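proc's destroy_inode now defers the actual free by one RCU grace period so lockless (RCU-walk) path lookup never dereferences freed memory. The same template, generalized with hypothetical names (example_inode_cachep, EXAMPLE_I), for any filesystem making this conversion:

static void example_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        /* i_rcu overlays i_dentry in a union; restore the list head
         * before handing the object back to the slab cache */
        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(example_inode_cachep, EXAMPLE_I(inode));
}

static void example_destroy_inode(struct inode *inode)
{
        call_rcu(&inode->i_rcu, example_i_callback);
}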
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd12..9ad561ded409 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); 96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); 97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
98unsigned long task_vsize(struct mm_struct *); 98unsigned long task_vsize(struct mm_struct *);
99int task_statm(struct mm_struct *, int *, int *, int *, int *); 99unsigned long task_statm(struct mm_struct *,
100 unsigned long *, unsigned long *, unsigned long *, unsigned long *);
100void task_mem(struct seq_file *, struct mm_struct *); 101void task_mem(struct seq_file *, struct mm_struct *);
101 102
102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 103static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
108 109
109extern struct vfsmount *proc_mnt; 110extern struct vfsmount *proc_mnt;
110int proc_fill_super(struct super_block *); 111int proc_fill_super(struct super_block *);
111struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); 112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
112 113
113/* 114/*
114 * These are generic /proc routines that use the internal 115 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468d..d245cb23dd72 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
558static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore, 559 .read = read_kcore,
560 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek, 561 .llseek = default_llseek,
562}; 562};
563 563
564#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
         "HardwareCorrupted: %5lu kB\n"
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        "AnonHugePages: %8lu kB\n"
+#endif
         ,
         K(i.totalram),
         K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
         K(i.freeswap),
         K(global_page_state(NR_FILE_DIRTY)),
         K(global_page_state(NR_WRITEBACK)),
-        K(global_page_state(NR_ANON_PAGES)),
+        K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+          + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+          HPAGE_PMD_NR
+#endif
+          ),
         K(global_page_state(NR_FILE_MAPPED)),
         K(global_page_state(NR_SHMEM)),
         K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
         ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+           HPAGE_PMD_NR)
+#endif
         );
 
     hugetlb_report_meminfo(m);
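
Aside: the AnonHugePages accounting above is plain unit conversion: the
NR_ANON_TRANSPARENT_HUGEPAGES counter is kept in huge pages, while
/proc/meminfo reports kB of base pages. A sketch of the arithmetic
(anon_huge_kb is a hypothetical helper; the file's K() macro performs
the same page-to-kB shift):

    /* HPAGE_PMD_NR = base pages per THP, e.g. 512 for a 2 MiB THP on
     * a 4 KiB base page; PAGE_SHIFT - 10 converts pages to kB. */
    static unsigned long anon_huge_kb(unsigned long nr_thp)
    {
        unsigned long base_pages = nr_thp * HPAGE_PMD_NR;

        return base_pages << (PAGE_SHIFT - 10);
    }
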
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b45660331..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
             ppage = pfn_to_page(pfn);
         else
             ppage = NULL;
-        if (!ppage)
+        if (!ppage || PageSlab(ppage))
             pcount = 0;
         else
             pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
     if (PageHuge(page))
         u |= 1 << KPF_HUGE;
 
-    u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
     /*
-     * Caveats on high order pages:
-     * PG_buddy will only be set on the head page; SLUB/SLQB do the same
-     * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+     * Caveats on high order pages: page->_count will only be set
+     * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+     * SLOB won't set PG_slab at all on compound pages.
      */
+    if (PageBuddy(page))
+        u |= 1 << KPF_BUDDY;
+
+    u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+
     u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
-    u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
 
     u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
     u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
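
Aside: stable_page_flags() composes a u64 bitmap one flag at a time;
kpf_copy_bit() presumably just moves one bit of the raw flags word into
the exported bit position, along these lines (a sketch of the helper's
shape, not a quote of the file):

    /* copy bit `kbit` of kflags into bit `ubit` of the result */
    static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
    {
        return ((kflags >> kbit) & 1) << ubit;
    }

The hunk derives KPF_BUDDY from PageBuddy() rather than a bit copy,
which suggests PG_buddy is no longer a real page flag in this series.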
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5be436ea088e..09a1f92a34ef 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
+#include <linux/namei.h>
 #include "internal.h"
 
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -23,6 +24,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
     if (!inode)
         goto out;
 
+    inode->i_ino = get_next_ino();
+
     sysctl_head_get(head);
     ei = PROC_I(inode);
     ei->sysctl = head;
@@ -118,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
         goto out;
 
     err = NULL;
-    dentry->d_op = &proc_sys_dentry_operations;
+    d_set_d_op(dentry, &proc_sys_dentry_operations);
     d_add(dentry, inode);
 
 out:
@@ -199,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
             dput(child);
             return -ENOMEM;
         } else {
-            child->d_op = &proc_sys_dentry_operations;
+            d_set_d_op(child, &proc_sys_dentry_operations);
             d_add(child, inode);
         }
     } else {
@@ -292,7 +295,7 @@ out:
     return ret;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct inode *inode, int mask, unsigned int flags)
 {
     /*
      * sysctl entries that are not writeable,
@@ -302,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
     struct ctl_table *table;
     int error;
 
+    if (flags & IPERM_FLAG_RCU)
+        return -ECHILD;
+
     /* Executable files are not allowed under /proc/sys/ */
     if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
         return -EACCES;
@@ -364,6 +370,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 static const struct file_operations proc_sys_file_operations = {
     .read = proc_sys_read,
     .write = proc_sys_write,
+    .llseek = default_llseek,
 };
 
 static const struct file_operations proc_sys_dir_file_operations = {
@@ -386,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = {
 
 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+    if (nd->flags & LOOKUP_RCU)
+        return -ECHILD;
     return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
-static int proc_sys_delete(struct dentry *dentry)
+static int proc_sys_delete(const struct dentry *dentry)
 {
     return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
-static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
-                            struct qstr *name)
+static int proc_sys_compare(const struct dentry *parent,
+        const struct inode *pinode,
+        const struct dentry *dentry, const struct inode *inode,
+        unsigned int len, const char *str, const struct qstr *name)
 {
-    struct dentry *dentry = container_of(qstr, struct dentry, d_name);
-    if (qstr->len != name->len)
+    /* Although proc doesn't have negative dentries, rcu-walk means
+     * that inode here can be NULL */
+    if (!inode)
+        return 0;
+    if (name->len != len)
         return 1;
-    if (memcmp(qstr->name, name->name, name->len))
+    if (memcmp(name->name, str, len))
         return 1;
-    return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
+    return !sysctl_is_seen(PROC_I(inode)->sysctl);
 }
 
 static const struct dentry_operations proc_sys_dentry_operations = {
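
Aside: the proc_sys changes above are the standard rcu-walk adaptation:
any dentry or permission hook that cannot run lockless bails out with
-ECHILD so the VFS falls back to ref-walk, and ->d_compare() must
tolerate a NULL inode. A skeleton of the bail-out pattern
(example_permission is a hypothetical name, using the signature from
the hunk):

    static int example_permission(struct inode *inode, int mask,
                                  unsigned int flags)
    {
        if (flags & IPERM_FLAG_RCU)
            return -ECHILD;    /* can't sleep or take refs in rcu-walk */
        /* ... blocking checks are safe from here on ... */
        return 0;
    }
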
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc869437..cb761f010300 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
     }
     switch (p->type) {
     case TTY_DRIVER_TYPE_SYSTEM:
-        seq_printf(m, "system");
+        seq_puts(m, "system");
         if (p->subtype == SYSTEM_TYPE_TTY)
-            seq_printf(m, ":/dev/tty");
+            seq_puts(m, ":/dev/tty");
         else if (p->subtype == SYSTEM_TYPE_SYSCONS)
-            seq_printf(m, ":console");
+            seq_puts(m, ":console");
         else if (p->subtype == SYSTEM_TYPE_CONSOLE)
-            seq_printf(m, ":vtmaster");
+            seq_puts(m, ":vtmaster");
         break;
     case TTY_DRIVER_TYPE_CONSOLE:
-        seq_printf(m, "console");
+        seq_puts(m, "console");
         break;
     case TTY_DRIVER_TYPE_SERIAL:
-        seq_printf(m, "serial");
+        seq_puts(m, "serial");
         break;
     case TTY_DRIVER_TYPE_PTY:
         if (p->subtype == PTY_TYPE_MASTER)
-            seq_printf(m, "pty:master");
+            seq_puts(m, "pty:master");
         else if (p->subtype == PTY_TYPE_SLAVE)
-            seq_printf(m, "pty:slave");
+            seq_puts(m, "pty:slave");
         else
-            seq_printf(m, "pty");
+            seq_puts(m, "pty");
         break;
     default:
         seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
         /* pseudo-drivers first */
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
         seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
-        seq_printf(m, "system:/dev/tty\n");
+        seq_puts(m, "system:/dev/tty\n");
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
         seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
-        seq_printf(m, "system:console\n");
+        seq_puts(m, "system:console\n");
 #ifdef CONFIG_UNIX98_PTYS
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
         seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
-        seq_printf(m, "system\n");
+        seq_puts(m, "system\n");
 #endif
 #ifdef CONFIG_VT
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
         seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
-        seq_printf(m, "system:vtmaster\n");
+        seq_puts(m, "system:vtmaster\n");
 #endif
     }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22d..ef9fa8e24ad6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
     return set_anon_super(sb, NULL);
 }
 
-static int proc_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *proc_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
     int err;
     struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
 
     sb = sget(fs_type, proc_test_super, proc_set_super, ns);
     if (IS_ERR(sb))
-        return PTR_ERR(sb);
+        return ERR_CAST(sb);
 
     if (!sb->s_root) {
         sb->s_flags = flags;
         err = proc_fill_super(sb);
         if (err) {
             deactivate_locked_super(sb);
-            return err;
+            return ERR_PTR(err);
         }
 
         ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
     }
 
         sb->s_flags |= MS_ACTIVE;
-        ns->proc_mnt = mnt;
     }
 
-    simple_set_mnt(mnt, sb);
-    return 0;
+    return dget(sb->s_root);
 }
 
 static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
 
 static struct file_system_type proc_fs_type = {
     .name = "proc",
-    .get_sb = proc_get_sb,
+    .mount = proc_mount,
     .kill_sb = proc_kill_sb,
 };
 
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
         return;
     }
 
+    init_pid_ns.proc_mnt = proc_mnt;
     proc_symlink("mounts", NULL, "self/mounts");
 
     proc_net_init();
@@ -179,6 +178,7 @@ static int proc_root_readdir(struct file * filp,
 static const struct file_operations proc_root_operations = {
     .read = generic_read_dir,
     .readdir = proc_root_readdir,
+    .llseek = default_llseek,
 };
 
 /*
@@ -212,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
     if (IS_ERR(mnt))
         return PTR_ERR(mnt);
 
+    ns->proc_mnt = mnt;
     return 0;
 }
 
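
Aside: proc_mount() follows the tree-wide switch from ->get_sb() (fill
in a vfsmount, return int) to ->mount() (return the root dentry, or an
ERR_PTR). For a filesystem with no special superblock sharing the whole
conversion collapses to one helper call; a minimal sketch with
hypothetical names (example_fill_super is assumed to exist):

    static struct dentry *example_mount(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *data)
    {
        /* mount_nodev() replaces get_sb_nodev() and returns the dentry */
        return mount_nodev(fs_type, flags, data, example_fill_super);
    }

    static struct file_system_type example_fs_type = {
        .name    = "example",
        .mount   = example_mount,
        .kill_sb = kill_litter_super,
    };

proc needs the open-coded sget() path above only because it shares one
superblock per pid namespace.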
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..62604be9f58d 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
     int i, j;
 
-    seq_printf(p, " ");
+    seq_puts(p, " ");
     for_each_possible_cpu(i)
         seq_printf(p, "CPU%-8d", i);
-    seq_printf(p, "\n");
+    seq_putc(p, '\n');
 
     for (i = 0; i < NR_SOFTIRQS; i++) {
-        seq_printf(p, "%8s:", softirq_to_name[i]);
+        seq_printf(p, "%12s:", softirq_to_name[i]);
         for_each_possible_cpu(j)
             seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
-        seq_printf(p, "\n");
+        seq_putc(p, '\n');
     }
     return 0;
 }
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..1cffa2b8a2fc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
     u64 sum_softirq = 0;
     unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
     struct timespec boottime;
-    unsigned int per_irq_sum;
 
     user = nice = system = idle = iowait =
         irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
         guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
         guest_nice = cputime64_add(guest_nice,
             kstat_cpu(i).cpustat.guest_nice);
-        for_each_irq_nr(j) {
-            sum += kstat_irqs_cpu(j, i);
-        }
+        sum += kstat_cpu_irqs_sum(i);
         sum += arch_irq_stat_cpu(i);
 
         for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
     seq_printf(p, "intr %llu", (unsigned long long)sum);
 
     /* sum again ? it could be updated? */
-    for_each_irq_nr(j) {
-        per_irq_sum = 0;
-        for_each_possible_cpu(i)
-            per_irq_sum += kstat_irqs_cpu(j, i);
-
-        seq_printf(p, " %u", per_irq_sum);
-    }
+    for_each_irq_nr(j)
+        seq_printf(p, " %u", kstat_irqs(j));
 
     seq_printf(p,
         "\nctxt %llu\n"
@@ -134,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
 
     for (i = 0; i < NR_SOFTIRQS; i++)
         seq_printf(p, " %u", per_softirq_sums[i]);
-    seq_printf(p, "\n");
+    seq_putc(p, '\n');
 
     return 0;
 }
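
Aside: the /proc/stat hunks replace two O(nr_irqs * nr_cpus) loops with
precomputed totals. What the removed inner loop computed, conceptually
(irqs_sum_slow is a hypothetical name; kstat_cpu_irqs_sum() and
kstat_irqs() are presumed to maintain these sums incrementally so the
read path no longer has to):

    static u64 irqs_sum_slow(int cpu)
    {
        u64 sum = 0;
        int j;

        for_each_irq_nr(j)
            sum += kstat_irqs_cpu(j, cpu);    /* walk every irq */
        return sum;
    }
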
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1dbca4e8cc16..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
     return PAGE_SIZE * mm->total_vm;
 }
 
-int task_statm(struct mm_struct *mm, int *shared, int *text,
-               int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
     *shared = get_mm_counter(mm, MM_FILEPAGES);
     *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -327,6 +328,7 @@ struct mem_size_stats {
     unsigned long private_clean;
     unsigned long private_dirty;
     unsigned long referenced;
+    unsigned long anonymous;
     unsigned long swap;
     u64 pss;
 };
@@ -357,6 +359,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         if (!page)
             continue;
 
+        if (PageAnon(page))
+            mss->anonymous += PAGE_SIZE;
+
         mss->resident += PAGE_SIZE;
         /* Accumulate the size in pages that have been accessed. */
         if (pte_young(ptent) || PageReferenced(page))
@@ -410,9 +415,11 @@ static int show_smap(struct seq_file *m, void *v)
            "Private_Clean: %8lu kB\n"
            "Private_Dirty: %8lu kB\n"
            "Referenced: %8lu kB\n"
+           "Anonymous: %8lu kB\n"
            "Swap: %8lu kB\n"
            "KernelPageSize: %8lu kB\n"
-           "MMUPageSize: %8lu kB\n",
+           "MMUPageSize: %8lu kB\n"
+           "Locked: %8lu kB\n",
            (vma->vm_end - vma->vm_start) >> 10,
            mss.resident >> 10,
            (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -421,9 +428,12 @@ static int show_smap(struct seq_file *m, void *v)
            mss.private_clean >> 10,
            mss.private_dirty >> 10,
            mss.referenced >> 10,
+           mss.anonymous >> 10,
            mss.swap >> 10,
            vma_kernel_pagesize(vma) >> 10,
-           vma_mmu_pagesize(vma) >> 10);
+           vma_mmu_pagesize(vma) >> 10,
+           (vma->vm_flags & VM_LOCKED) ?
+               (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
     if (m->count < m->size)  /* vma is copied successfully */
         m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
@@ -539,6 +549,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 
 const struct file_operations proc_clear_refs_operations = {
     .write = clear_refs_write,
+    .llseek = noop_llseek,
 };
 
 struct pagemapread {
@@ -699,6 +710,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
  * skip over unmapped regions.
  */
 #define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
 {
@@ -769,7 +781,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
         unsigned long end;
 
         pm.pos = 0;
-        end = start_vaddr + PAGEMAP_WALK_SIZE;
+        end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
         /* overflow ? */
         if (end < start_vaddr || end > end_vaddr)
             end = end_vaddr;
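
Aside: the PAGEMAP_WALK_MASK fix rounds each chunk's end down to a PMD
boundary so one pagemap_read() walk never straddles two PMDs. Worked
through with a 2 MiB PMD (PMD_MASK == ~(2 MiB - 1)): for start_vaddr
0x1ff000, 0x1ff000 + 0x200000 = 0x3ff000, and 0x3ff000 & PMD_MASK =
0x200000, so the first chunk stops exactly at the boundary rather than
at 0x3ff000. In code:

    static unsigned long walk_end(unsigned long start_vaddr)
    {
        /* old: start_vaddr + PAGEMAP_WALK_SIZE (could cross a PMD) */
        return (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
    }
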
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e63843..b535d3e5d5f1 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
     return vsize;
 }
 
-int task_statm(struct mm_struct *mm, int *shared, int *text,
-               int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
     struct vm_area_struct *vma;
     struct vm_region *region;
     struct rb_node *p;
-    int size = kobjsize(mm);
+    unsigned long size = kobjsize(mm);
 
     down_read(&mm->mmap_sem);
     for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70bc..74802bc5ded9 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
     /* Do some basic Verification. */
     if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
         (ehdr.e_type != ET_CORE) ||
-        !vmcore_elf_check_arch(&ehdr) ||
+        !vmcore_elf64_check_arch(&ehdr) ||
         ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
         ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
         ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a8..7b0329468a5d 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
     QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
     QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
 
-    lock_kernel();
-
     while (filp->f_pos < inode->i_size) {
         blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
         bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
         brelse(bh);
     }
 out:
-    unlock_kernel();
     return 0;
 }
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 16829722be93..e63b4171d583 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
     struct super_block *sb = dentry->d_sb;
     u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-    lock_kernel();
-
     buf->f_type = sb->s_magic;
     buf->f_bsize = sb->s_blocksize;
     buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
     buf->f_fsid.val[0] = (u32)id;
     buf->f_fsid.val[1] = (u32)(id >> 32);
 
-    unlock_kernel();
-
     return 0;
 }
 
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
         goto outi;
 
     brelse(bh);
-
     return 0;
 
  outi:
@@ -431,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
     return &ei->vfs_inode;
 }
 
-static void qnx4_destroy_inode(struct inode *inode)
+static void qnx4_i_callback(struct rcu_head *head)
 {
+    struct inode *inode = container_of(head, struct inode, i_rcu);
+    INIT_LIST_HEAD(&inode->i_dentry);
     kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
+static void qnx4_destroy_inode(struct inode *inode)
+{
+    call_rcu(&inode->i_rcu, qnx4_i_callback);
+}
+
 static void init_once(void *foo)
 {
     struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
@@ -460,17 +461,16 @@ static void destroy_inodecache(void)
     kmem_cache_destroy(qnx4_inode_cachep);
 }
 
-static int qnx4_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
-    return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
-                       mnt);
+    return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
 }
 
 static struct file_system_type qnx4_fs_type = {
     .owner = THIS_MODULE,
     .name = "qnx4",
-    .get_sb = qnx4_get_sb,
+    .mount = qnx4_mount,
     .kill_sb = kill_block_super,
     .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba879..275327b5615e 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
     int len = dentry->d_name.len;
     struct inode *foundinode = NULL;
 
-    lock_kernel();
     if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
         goto out;
     /* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 
     foundinode = qnx4_iget(dir->i_sb, ino);
     if (IS_ERR(foundinode)) {
-        unlock_kernel();
         QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
                    PTR_ERR(foundinode)));
         return ERR_CAST(foundinode);
     }
 out:
-    unlock_kernel();
     d_add(dentry, foundinode);
 
     return NULL;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3a..880fd9884366 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
 
 config QUOTA
     bool "Quota support"
+    select QUOTACTL
     help
       If you say Y here, you will be able to set per user limits for disk
       usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
 
 config QUOTACTL
     bool
-    depends on XFS_QUOTA || QUOTA
-    default y
+    default n
 
 config QUOTACTL_COMPAT
     bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316a977f..a2a622e079f0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
 void __quota_error(struct super_block *sb, const char *func,
                    const char *fmt, ...)
 {
-    va_list args;
-
     if (printk_ratelimit()) {
+        va_list args;
+        struct va_format vaf;
+
         va_start(args, fmt);
-        printk(KERN_ERR "Quota error (device %s): %s: ",
-               sb->s_id, func);
-        vprintk(fmt, args);
-        printk("\n");
+
+        vaf.fmt = fmt;
+        vaf.va = &args;
+
+        printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
+               sb->s_id, func, &vaf);
+
         va_end(args);
     }
 }
@@ -1386,6 +1390,9 @@ static void __dquot_initialize(struct inode *inode, int type)
         /* Avoid races with quotaoff() */
         if (!sb_has_quota_active(sb, cnt))
             continue;
+        /* We could race with quotaon or dqget() could have failed */
+        if (!got[cnt])
+            continue;
         if (!inode->i_dquot[cnt]) {
             inode->i_dquot[cnt] = got[cnt];
             got[cnt] = NULL;
@@ -1736,6 +1743,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
     qsize_t rsv_space = 0;
     struct dquot *transfer_from[MAXQUOTAS] = {};
     int cnt, ret = 0;
+    char is_valid[MAXQUOTAS] = {};
     char warntype_to[MAXQUOTAS];
     char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
@@ -1757,8 +1765,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
     space = cur_space + rsv_space;
     /* Build the transfer_from list and check the limits */
     for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+        /*
+         * Skip changes for same uid or gid or for turned off quota-type.
+         */
         if (!transfer_to[cnt])
             continue;
+        /* Avoid races with quotaoff() */
+        if (!sb_has_quota_active(inode->i_sb, cnt))
+            continue;
+        is_valid[cnt] = 1;
         transfer_from[cnt] = inode->i_dquot[cnt];
         ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
         if (ret)
@@ -1772,12 +1787,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
      * Finally perform the needed transfer from transfer_from to transfer_to
      */
     for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-        /*
-         * Skip changes for same uid or gid or for turned off quota-type.
-         */
-        if (!transfer_to[cnt])
+        if (!is_valid[cnt])
             continue;
-
         /* Due to IO error we might not have transfer_from[] structure */
         if (transfer_from[cnt]) {
             warntype_from_inodes[cnt] =
@@ -1801,18 +1812,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 
     mark_all_dquot_dirty(transfer_from);
     mark_all_dquot_dirty(transfer_to);
-    /* Pass back references to put */
-    for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-        transfer_to[cnt] = transfer_from[cnt];
-warn:
     flush_warnings(transfer_to, warntype_to);
     flush_warnings(transfer_from, warntype_from_inodes);
     flush_warnings(transfer_from, warntype_from_space);
-    return ret;
+    /* Pass back references to put */
+    for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+        if (is_valid[cnt])
+            transfer_to[cnt] = transfer_from[cnt];
+    return 0;
 over_quota:
     spin_unlock(&dq_data_lock);
     up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-    goto warn;
+    flush_warnings(transfer_to, warntype_to);
+    return ret;
 }
 EXPORT_SYMBOL(__dquot_transfer);
 
@@ -2177,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_resume);
 
-int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
-                        struct path *path)
+int dquot_quota_on(struct super_block *sb, int type, int format_id,
+                   struct path *path)
 {
     int error = security_quota_on(path->dentry);
     if (error)
@@ -2192,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
                                 DQUOT_LIMITS_ENABLED);
     return error;
 }
-EXPORT_SYMBOL(dquot_quota_on_path);
-
-int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
-{
-    struct path path;
-    int error;
-
-    error = kern_path(name, LOOKUP_FOLLOW, &path);
-    if (!error) {
-        error = dquot_quota_on_path(sb, type, format_id, &path);
-        path_put(&path);
-    }
-    return error;
-}
 EXPORT_SYMBOL(dquot_quota_on);
 
 /*
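
Aside: the __quota_error() rewrite uses the %pV/struct va_format idiom
so the device prefix and the caller's message are emitted by a single
printk() and cannot interleave with output from other CPUs. The core of
the idiom, reduced to a sketch (example_error is a hypothetical name):

    static void example_error(const char *fmt, ...)
    {
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        /* one atomic line: prefix plus the expanded caller format */
        printk(KERN_ERR "example: %pV\n", &vaf);
        va_end(args);
    }
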
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1edb..b34bdb25490c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
 }
 
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
-                         void __user *addr)
+                         struct path *path)
 {
-    char *pathname;
-    int ret = -ENOSYS;
-
-    pathname = getname(addr);
-    if (IS_ERR(pathname))
-        return PTR_ERR(pathname);
-    if (sb->s_qcop->quota_on)
-        ret = sb->s_qcop->quota_on(sb, type, id, pathname);
-    putname(pathname);
-    return ret;
+    if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+        return -ENOSYS;
+    if (sb->s_qcop->quota_on_meta)
+        return sb->s_qcop->quota_on_meta(sb, type, id);
+    if (IS_ERR(path))
+        return PTR_ERR(path);
+    return sb->s_qcop->quota_on(sb, type, id, path);
 }
 
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 
 /* Copy parameters and call proper function */
 static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
-                       void __user *addr)
+                       void __user *addr, struct path *path)
 {
     int ret;
 
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 
     switch (cmd) {
     case Q_QUOTAON:
-        return quota_quotaon(sb, type, cmd, id, addr);
+        return quota_quotaon(sb, type, cmd, id, path);
     case Q_QUOTAOFF:
         if (!sb->s_qcop->quota_off)
             return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 {
     uint cmds, type;
     struct super_block *sb = NULL;
+    struct path path, *pathp = NULL;
     int ret;
 
     cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
         return -ENODEV;
     }
 
+    /*
+     * Path for quotaon has to be resolved before grabbing superblock
+     * because that gets s_umount sem which is also possibly needed by path
+     * resolution (think about autofs) and thus deadlocks could arise.
+     */
+    if (cmds == Q_QUOTAON) {
+        ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
+        if (ret)
+            pathp = ERR_PTR(ret);
+        else
+            pathp = &path;
+    }
+
     sb = quotactl_block(special);
     if (IS_ERR(sb))
         return PTR_ERR(sb);
 
-    ret = do_quotactl(sb, type, cmds, id, addr);
+    ret = do_quotactl(sb, type, cmds, id, addr, pathp);
 
     drop_super(sb);
+    if (pathp && !IS_ERR(pathp))
+        path_put(pathp);
     return ret;
 }
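
Aside: the Q_QUOTAON reordering is a lock-ordering fix. Path resolution
may itself need s_umount (think of a component that triggers an autofs
mount), while quotactl_block() takes s_umount during the superblock
lookup, so the path has to be resolved first and only handed to
->quota_on() afterwards. The ordering in outline (quotaon_ordering is a
hypothetical helper, not part of the patch):

    static int quotaon_ordering(const char __user *addr, struct path *path)
    {
        /* 1. resolve the target while no s_umount is held;
         *    this may sleep and may trigger an automount */
        int ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, path);

        if (ret)
            return ret;
        /* 2. only now look up the superblock (takes s_umount);
         * 3. call ->quota_on(sb, type, id, path), then path_put() */
        return 0;
    }
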
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabcc..e41c1becf096 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
             return -ENOMEM;
         ret = read_blk(info, *blk, buf);
         if (ret < 0) {
-            quota_error(dquot->dq_sb, "Can't read quota data "
-                        "block %u", blk);
+            quota_error(dquot->dq_sb, "Can't read quota data block %u",
+                        *blk);
             goto out_buf;
         }
         newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
         } else {
             ret = write_blk(info, *blk, buf);
             if (ret < 0)
-                quota_error(dquot->dq_sb, "Can't write quota "
-                            "tree block %u", blk);
+                quota_error(dquot->dq_sb,
+                            "Can't write quota tree block %u",
+                            *blk);
         }
     }
 out_buf:
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..eacb166fb259 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
     struct inode * inode = new_inode(sb);
 
     if (inode) {
+        inode->i_ino = get_next_ino();
         inode_init_owner(inode, dir, mode);
         inode->i_mapping->a_ops = &ramfs_aops;
         inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
     return err;
 }
 
-int ramfs_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
-    return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+    return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
 
-static int rootfs_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
-    return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
-                        mnt);
+    return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
 }
 
 static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type ramfs_fs_type = {
     .name = "ramfs",
-    .get_sb = ramfs_get_sb,
+    .mount = ramfs_mount,
     .kill_sb = ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
     .name = "rootfs",
-    .get_sb = rootfs_get_sb,
+    .mount = rootfs_mount,
     .kill_sb = kill_litter_super,
 };
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 74e36586e4d3..5520f8ad5504 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
-#include <linux/smp_lock.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/module.h>
@@ -31,6 +30,11 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static inline int unsigned_offsets(struct file *file)
+{
+    return file->f_mode & FMODE_UNSIGNED_OFFSET;
+}
+
 /**
  * generic_file_llseek_unlocked - lockless generic llseek implementation
  * @file: file structure to seek on
@@ -62,7 +66,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
         break;
     }
 
-    if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+    if (offset < 0 && !unsigned_offsets(file))
+        return -EINVAL;
+    if (offset > inode->i_sb->s_maxbytes)
         return -EINVAL;
 
     /* Special lock needed here? */
@@ -124,7 +130,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
     loff_t retval;
 
-    lock_kernel();
+    mutex_lock(&file->f_dentry->d_inode->i_mutex);
     switch (origin) {
         case SEEK_END:
             offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
             offset += file->f_pos;
     }
     retval = -EINVAL;
-    if (offset >= 0) {
+    if (offset >= 0 || unsigned_offsets(file)) {
         if (offset != file->f_pos) {
             file->f_pos = offset;
             file->f_version = 0;
@@ -145,7 +151,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
         retval = offset;
     }
 out:
-    unlock_kernel();
+    mutex_unlock(&file->f_dentry->d_inode->i_mutex);
     return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -156,7 +162,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 
     fn = no_llseek;
     if (file->f_mode & FMODE_LSEEK) {
-        fn = default_llseek;
         if (file->f_op && file->f_op->llseek)
             fn = file->f_op->llseek;
     }
@@ -222,13 +227,12 @@ bad:
 }
 #endif
 
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
  */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
     struct inode *inode;
@@ -239,8 +243,15 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
     if (unlikely((ssize_t) count < 0))
         return retval;
     pos = *ppos;
-    if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-        return retval;
+    if (unlikely(pos < 0)) {
+        if (!unsigned_offsets(file))
+            return retval;
+        if (count >= -pos) /* both values are in 0..LLONG_MAX */
+            return -EOVERFLOW;
+    } else if (unlikely((loff_t) (pos + count) < 0)) {
+        if (!unsigned_offsets(file))
+            return retval;
+    }
 
     if (unlikely(inode->i_flock && mandatory_lock(inode))) {
         retval = locks_mandatory_area(
@@ -565,65 +576,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
                               unsigned long nr_segs, unsigned long fast_segs,
                               struct iovec *fast_pointer,
                               struct iovec **ret_pointer)
- {
+{
     unsigned long seg;
     ssize_t ret;
     struct iovec *iov = fast_pointer;
 
     /*
     * SuS says "The readv() function *may* fail if the iovcnt argument
     * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
     * traditionally returned zero for zero segments, so...
     */
     if (nr_segs == 0) {
         ret = 0;
         goto out;
     }
 
     /*
     * First get the "struct iovec" from user memory and
     * verify all the pointers
     */
     if (nr_segs > UIO_MAXIOV) {
         ret = -EINVAL;
         goto out;
     }
     if (nr_segs > fast_segs) {
         iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
         if (iov == NULL) {
             ret = -ENOMEM;
             goto out;
         }
     }
     if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
         ret = -EFAULT;
         goto out;
     }
 
     /*
     * According to the Single Unix Specification we should return EINVAL
     * if an element length is < 0 when cast to ssize_t or if the
     * total length would overflow the ssize_t return value of the
     * system call.
-    */
+    *
+    * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+    * overflow case.
+    */
     ret = 0;
     for (seg = 0; seg < nr_segs; seg++) {
         void __user *buf = iov[seg].iov_base;
         ssize_t len = (ssize_t)iov[seg].iov_len;
 
         /* see if we we're about to use an invalid len or if
         * it's about to overflow ssize_t */
-        if (len < 0 || (ret + len < ret)) {
+        if (len < 0) {
             ret = -EINVAL;
             goto out;
         }
         if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
             ret = -EFAULT;
             goto out;
+        }
+        if (len > MAX_RW_COUNT - ret) {
+            len = MAX_RW_COUNT - ret;
+            iov[seg].iov_len = len;
         }
-
         ret += len;
     }
 out:
     *ret_pointer = iov;
     return ret;
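
Aside: rw_copy_check_uvector() now clamps instead of failing: once the
running total reaches MAX_RW_COUNT (INT_MAX rounded down to a page
boundary, per the #define moved out of this file above), the offending
segment is shortened and later segments are clamped to zero, so one
readv()/writev() is silently truncated rather than rejected. A worked
sketch of the capping loop (cap_iov_total is a hypothetical name):

    static ssize_t cap_iov_total(struct iovec *iov, unsigned long nr_segs)
    {
        ssize_t ret = 0;
        unsigned long seg;

        for (seg = 0; seg < nr_segs; seg++) {
            ssize_t len = (ssize_t)iov[seg].iov_len;

            if (len < 0)
                return -EINVAL;
            if (len > MAX_RW_COUNT - ret) {
                /* e.g. three 1 GiB segments: the third is cut short */
                len = MAX_RW_COUNT - ret;
                iov[seg].iov_len = len;
            }
            ret += len;
        }
        return ret;    /* never exceeds MAX_RW_COUNT */
    }
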
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f9..7cd46666ba2c 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
 
       In general, ReiserFS is as fast as ext2, but is very efficient with
       large directories and small files. Additional patches are needed
-      for NFS and quotas, please see <http://www.namesys.com/> for links.
+      for NFS and quotas, please see
+      <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
 
       It is more easily extended to have features currently found in
       database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
       plugins consistent with our motto ``It takes more than a license to
       make source code open.''
 
-      Read <http://www.namesys.com/> to learn more about reiserfs.
+      Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
+      to learn more about reiserfs.
 
       Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
 
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e5..e2f7a264e3ff 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
 [END LICENSING]
 
 Reiserfs is a file system based on balanced tree algorithms, which is
-described at http://devlinux.com/namesys.
+described at https://reiser4.wiki.kernel.org/index.php/Main_Page
 
 Stop reading here. Go there, then return.
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
     barrier_done = reiserfs_commit_for_inode(inode);
     reiserfs_write_unlock(inode->i_sb);
     if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-            BLKDEV_IFL_WAIT);
+        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
     if (barrier_done < 0)
         return barrier_done;
     return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..0bae036831e2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
@@ -22,8 +21,6 @@
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 void reiserfs_evict_inode(struct inode *inode)
 {
@@ -165,7 +162,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it. So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers. Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +436,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 
 /* special version of get_block that is only used by grab_tail_page right
-** now. It is sent to block_prepare_write, and when you try to get a
+** now. It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer. block_prepare_write expects to
+** -ENOENT instead of a valid buffer. __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page. Where it does not produce a valid page for holes, or past the
 ** end of the file. This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -558,11 +555,12 @@ static int convert_tail_for_hole(struct inode *inode,
 	**
 	** We must fix the tail page for writing because it might have buffers
 	** that are mapped, but have a block number of 0. This indicates tail
-	** data that has been read directly into the page, and block_prepare_write
-	** won't trigger a get_block in this case.
+	** data that has been read directly into the page, and
+	** __block_write_begin won't trigger a get_block in this case.
 	*/
 	fix_tail_page_for_writing(tail_page);
-	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+	retval = __reiserfs_write_begin(tail_page, tail_start,
+					tail_end - tail_start);
 	if (retval)
 		goto unlock;
 
@@ -2033,7 +2031,7 @@ static int grab_tail_page(struct inode *inode,
 	/* start within the page of the last block in the file */
 	start = (offset / blocksize) * blocksize;
 
-	error = block_prepare_write(page, start, offset,
+	error = __block_write_begin(page, start, offset - start,
 				    reiserfs_get_block_create_0);
 	if (error)
 		goto unlock;
@@ -2438,7 +2436,7 @@ static int reiserfs_write_full_page(struct page *page,
 	/* from this point on, we know the buffer is mapped to a
 	 * real block and not a direct item
 	 */
-	if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+	if (wbc->sync_mode != WB_SYNC_NONE) {
 		lock_buffer(bh);
 	} else {
 		if (!trylock_buffer(bh)) {
@@ -2628,8 +2626,7 @@ static int reiserfs_write_begin(struct file *file,
 	return ret;
 }
 
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to)
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
@@ -2650,7 +2647,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 		th->t_refcount++;
 	}
 
-	ret = block_prepare_write(page, from, to, reiserfs_get_block);
+	ret = __block_write_begin(page, from, len, reiserfs_get_block);
 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th = current->journal_info;
 		/* this gets a little ugly. If reiserfs_get_block returned an
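
A minimal sketch of the (from, to) -> (pos, len) argument conversion this patch applies when replacing block_prepare_write() with __block_write_begin(); example_prepare() is hypothetical. Only the offset of pos within the passed page is used, so an in-page offset works:

	#include <linux/buffer_head.h>

	static int example_prepare(struct page *page, unsigned from, unsigned to,
				   get_block_t *get_block)
	{
		/* old convention: block_prepare_write(page, from, to, get_block) */
		return __block_write_begin(page, from, to - from, get_block);
	}
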
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..79265fdc317a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
 #include <linux/time.h>
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/compat.h>
 
 /*
@@ -160,8 +159,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -186,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 		return 0;
 	}
 
-	/* we need to make sure nobody is changing the file size beneath
-	** us
-	*/
-	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
 	depth = reiserfs_write_lock_once(inode->i_sb);
 
+	/* we need to make sure nobody is changing the file size beneath us */
+	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+
 	write_from = inode->i_size & (blocksize - 1);
 	/* if we are on a block boundary, we are already unpacked. */
 	if (write_from == 0) {
@@ -200,7 +196,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	}
 
 	/* we unpack by finding the page with the tail, and calling
-	** reiserfs_prepare_write on that page. This will force a
+	** __reiserfs_write_begin on that page. This will force a
 	** reiserfs_get_block to unpack the tail for us.
 	*/
 	index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +206,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	if (!page) {
 		goto out;
 	}
-	retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+	retval = __reiserfs_write_begin(page, write_from, 0);
 	if (retval)
 		goto out_unlock;
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..3eea859e6990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
@@ -138,13 +137,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
 	return 0;
 }
 
-static void disable_barrier(struct super_block *s)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-	printk("reiserfs: disabling flush barriers on %s\n",
-	       reiserfs_bdevname(s));
-}
-
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 							 *sb)
 {
@@ -677,30 +669,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
 	submit_bh(WRITE, bh);
 }
 
-static int submit_barrier_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	return submit_bh(WRITE_BARRIER, bh);
-}
-
-static void check_barrier_completion(struct super_block *s,
-				     struct buffer_head *bh)
-{
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		disable_barrier(s);
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
-		reiserfs_write_unlock(s);
-		sync_dirty_buffer(bh);
-		reiserfs_write_lock(s);
-	}
-}
-
 #define CHUNK_SIZE 32
 struct buffer_chunk {
 	struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +977,6 @@ static int flush_commit_list(struct super_block *s,
 	struct buffer_head *tbh = NULL;
 	unsigned int trans_id = jl->j_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int barrier = 0;
 	int retval = 0;
 	int write_len;
 
@@ -1094,24 +1061,6 @@ static int flush_commit_list(struct super_block *s,
 	}
 	atomic_dec(&journal->j_async_throttle);
 
-	/* We're skipping the commit if there's an error */
-	if (retval || reiserfs_is_journal_aborted(journal))
-		barrier = 0;
-
-	/* wait on everything written so far before writing the commit
-	 * if we are in barrier mode, send the commit down now
-	 */
-	barrier = reiserfs_barrier_flush(s);
-	if (barrier) {
-		int ret;
-		lock_buffer(jl->j_commit_bh);
-		ret = submit_barrier_buffer(jl->j_commit_bh);
-		if (ret == -EOPNOTSUPP) {
-			set_buffer_uptodate(jl->j_commit_bh);
-			disable_barrier(s);
-			barrier = 0;
-		}
-	}
 	for (i = 0; i < (jl->j_len + 1); i++) {
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1092,22 @@ static int flush_commit_list(struct super_block *s,
 
 	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
 
-	if (!barrier) {
-		/* If there was a write error in the journal - we can't commit
-		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propagating the write error out to
-		 * the file system. */
-		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-			if (buffer_dirty(jl->j_commit_bh))
-				BUG();
-			mark_buffer_dirty(jl->j_commit_bh) ;
-			reiserfs_write_unlock(s);
-			sync_dirty_buffer(jl->j_commit_bh) ;
-			reiserfs_write_lock(s);
-		}
-	} else {
+	/* If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system. */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
 		reiserfs_write_unlock(s);
-		wait_on_buffer(jl->j_commit_bh);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
 		reiserfs_write_lock(s);
 	}
 
-	check_barrier_completion(s, jl->j_commit_bh);
-
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
 	 * up propagating the write error out to the filesystem. */
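
A minimal sketch of the commit-block idiom introduced above: an explicit flush+FUA write via __sync_dirty_buffer() replaces the old WRITE_BARRIER submission and its -EOPNOTSUPP fallback machinery; example_write_commit_block() is hypothetical, not part of this patch:

	#include <linux/buffer_head.h>
	#include <linux/fs.h>

	static void example_write_commit_block(struct buffer_head *bh, int barrier)
	{
		mark_buffer_dirty(bh);
		if (barrier)
			/* drain the device cache, then force the write through it */
			__sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
		else
			sync_dirty_buffer(bh);
	}
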
@@ -1319,26 +1263,15 @@ static int _update_journal_header_block(struct super_block *sb,
 		jh->j_first_unflushed_offset = cpu_to_le32(offset);
 		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
 
-		if (reiserfs_barrier_flush(sb)) {
-			int ret;
-			lock_buffer(journal->j_header_bh);
-			ret = submit_barrier_buffer(journal->j_header_bh);
-			if (ret == -EOPNOTSUPP) {
-				set_buffer_uptodate(journal->j_header_bh);
-				disable_barrier(sb);
-				goto sync;
-			}
-			reiserfs_write_unlock(sb);
-			wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-			check_barrier_completion(sb, journal->j_header_bh);
-		} else {
-		      sync:
-			set_buffer_dirty(journal->j_header_bh);
-			reiserfs_write_unlock(sb);
-			sync_dirty_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-		}
+		set_buffer_dirty(journal->j_header_bh);
+		reiserfs_write_unlock(sb);
+
+		if (reiserfs_barrier_flush(sb))
+			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(journal->j_header_bh);
+
+		reiserfs_write_lock(sb);
 		if (!buffer_uptodate(journal->j_header_bh)) {
 			reiserfs_warning(sb, "journal-837",
 					 "IO error during journal replay");
@@ -2618,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
 	result = 0;
 
 	if (journal->j_dev_bd != NULL) {
-		if (journal->j_dev_bd->bd_dev != super->s_dev)
-			bd_release(journal->j_dev_bd);
 		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
@@ -2637,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
@@ -2651,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
 
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
-		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		if (jdev == super->s_dev)
+			blkdev_mode &= ~FMODE_EXCL;
+		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+						      journal);
 		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
@@ -2660,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
 					  "cannot init journal device '%s': %i",
 					  __bdevname(jdev, b), result);
 			return result;
-		} else if (jdev != super->s_dev) {
-			result = bd_claim(journal->j_dev_bd, journal);
-			if (result) {
-				blkdev_put(journal->j_dev_bd, blkdev_mode);
-				return result;
-			}
-
+		} else if (jdev != super->s_dev)
 			set_blocksize(journal->j_dev_bd, super->s_blocksize);
-		}
 
 		return 0;
 	}
 
 	journal->j_dev_mode = blkdev_mode;
-	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
-						blkdev_mode, journal);
+	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
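
A minimal sketch of the exclusive-open idiom the journal code switches to here: FMODE_EXCL plus a holder cookie replaces open_by_devnum()/open_bdev_exclusive() with a separate bd_claim(); example_open_log_dev() is hypothetical:

	#include <linux/blkdev.h>

	static struct block_device *example_open_log_dev(dev_t dev, void *holder)
	{
		const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;

		/* release later with blkdev_put(bdev, mode), same mode bits,
		 * which also drops the exclusive claim held by "holder" */
		return blkdev_get_by_dev(dev, mode, holder);
	}
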
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode->i_ctime = CURRENT_TIME_SEC;
 	reiserfs_update_sd(&th, inode);
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 	reiserfs_write_unlock(dir->i_sb);
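
A minimal sketch of the link(2) idiom after i_count became private to the VFS: ihold() takes the reference that d_instantiate() hands to the new dentry; example_link_finish() is hypothetical:

	#include <linux/fs.h>

	static void example_link_finish(struct dentry *dentry, struct inode *inode)
	{
		ihold(inode);		/* was: atomic_inc(&inode->i_count) */
		d_instantiate(dentry, inode);
	}
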
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f538515..45de98b59466 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l
 	va_list args;
 	int mode, first, last;
 
-	va_start(args, bh);
-
 	if (!bh) {
 		printk("print_block: buffer is NULL\n");
 		return;
 	}
 
+	va_start(args, bh);
+
 	mode = va_arg(args, int);
 	first = va_arg(args, int);
 	last = va_arg(args, int);
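
A minimal sketch of the varargs rule behind this fix: every va_start() must be paired with a va_end() on every return path, so the cheap NULL check moves ahead of va_start(); example_print() is hypothetical:

	#include <stdarg.h>

	static void example_print(void *obj, ...)
	{
		va_list args;

		if (!obj)
			return;		/* nothing started, nothing to clean up */

		va_start(args, obj);
		/* ... va_arg() consumption goes here ... */
		va_end(args);
	}
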
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e15ff612002d..0aab04f46827 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -530,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void reiserfs_destroy_inode(struct inode *inode)
+static void reiserfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
+static void reiserfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, reiserfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
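
A minimal sketch of the RCU-delayed inode free adopted here for RCU path walk: the inode memory must survive a grace period after ->destroy_inode(), and i_dentry is reinitialized because it shares storage with i_rcu in this era's struct inode. The example_* names are hypothetical, and for simplicity the cache is assumed to hold bare struct inode objects (real filesystems free the containing fs-specific struct):

	#include <linux/fs.h>
	#include <linux/slab.h>

	static struct kmem_cache *example_inode_cachep;

	static void example_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);	/* unions with i_rcu */
		kmem_cache_free(example_inode_cachep, inode);
	}

	static void example_destroy_inode(struct inode *inode)
	{
		/* free only after all RCU-walk readers are done */
		call_rcu(&inode->i_rcu, example_i_callback);
	}
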
@@ -626,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *);
+static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -2042,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name)
+			     struct path *path)
 {
 	int err;
-	struct path path;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
+	if (path->mnt->mnt_sb != sb) {
 		err = -EXDEV;
 		goto out;
 	}
-	inode = path.dentry->d_inode;
+	inode = path->dentry->d_inode;
 	/* We must not pack tails for quota files on reiserfs for quota IO to work */
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
@@ -2076,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	/* Journaling quota? */
 	if (REISERFS_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			reiserfs_warning(sb, "super-6521",
 				 "Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
@@ -2095,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on(sb, type, format_id, path);
 out:
-	path_put(&path);
 	return err;
 }
 
@@ -2213,12 +2214,11 @@ out:
 
 #endif
 
-static int get_super_block(struct file_system_type *fs_type,
+static struct dentry *get_super_block(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
-			   void *data, struct vfsmount *mnt)
+			   void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
 }
 
 static int __init init_reiserfs_fs(void)
@@ -2253,7 +2253,7 @@ static void __exit exit_reiserfs_fs(void)
 struct file_system_type reiserfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "reiserfs",
-	.get_sb = get_super_block,
+	.mount = get_super_block,
 	.kill_sb = reiserfs_kill_sb,
 	.fs_flags = FS_REQUIRES_DEV,
 };
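
A minimal sketch of the .get_sb -> .mount conversion applied above: the new hook returns the root dentry instead of filling in a vfsmount. The example_* names are hypothetical:

	#include <linux/fs.h>
	#include <linux/module.h>

	static int example_fill_super(struct super_block *sb, void *data, int silent);

	static struct dentry *example_mount(struct file_system_type *fs_type,
					    int flags, const char *dev_name,
					    void *data)
	{
		return mount_bdev(fs_type, flags, dev_name, data, example_fill_super);
	}

	static struct file_system_type example_fs_type = {
		.owner    = THIS_MODULE,
		.name     = "example",
		.mount    = example_mount,
		.kill_sb  = kill_block_super,
		.fs_flags = FS_REQUIRES_DEV,
	};
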
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..3cfb2e933644 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 static void update_ctime(struct inode *inode)
 {
 	struct timespec now = current_fs_time(inode->i_sb);
-	if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+	if (inode_unhashed(inode) || !inode->i_nlink ||
 	    timespec_equal(&inode->i_ctime, &now))
 		return;
 
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			rxh->h_hash = cpu_to_le32(xahash);
 		}
 
-		err = reiserfs_prepare_write(NULL, page, page_offset,
-					    page_offset + chunk + skip);
+		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
 		if (!err) {
 			if (buffer)
 				memcpy(data + skip, buffer + buffer_pos, chunk);
@@ -873,11 +870,14 @@ out:
 	return err;
 }
 
-static int reiserfs_check_acl(struct inode *inode, int mask)
+static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int error = -EAGAIN; /* do regular unix permission checks by default */
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
 
 	if (acl) {
@@ -954,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
 	return 0;
 }
 
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 	/*
 	 * We don't do permission checks on the internal objects.
 	 * Permissions are determined by the "owning" object.
@@ -968,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask)
 	 * Stat data v1 doesn't support ACLs.
 	 */
 	if (get_inode_sd_version(inode) != STAT_DATA_V1)
-		return generic_permission(inode, mask, reiserfs_check_acl);
+		return generic_permission(inode, mask, flags,
+					  reiserfs_check_acl);
 #endif
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, flags, NULL);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
 	return -EPERM;
 }
 
@@ -993,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			      strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		dentry->d_op = &xattr_lookup_poison_ops;
+		d_set_d_op(dentry, &xattr_lookup_poison_ops);
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
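
A minimal sketch of the RCU-walk contract added above: ->permission() and ACL checkers now receive a flags argument, and anything that might sleep must return -ECHILD under IPERM_FLAG_RCU so the VFS retries the lookup in ref-walk mode; example_permission() is hypothetical:

	#include <linux/fs.h>

	static int example_permission(struct inode *inode, int mask, unsigned int flags)
	{
		if (flags & IPERM_FLAG_RCU)
			return -ECHILD;	/* may not block under rcu_read_lock() */

		return generic_permission(inode, mask, flags, NULL);
	}
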
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a28..90d2fcb67a31 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
 		struct reiserfs_transaction_handle th;
 		size_t size = reiserfs_xattr_nblocks(inode,
 				reiserfs_acl_size(clone->a_count));
-		reiserfs_write_lock(inode->i_sb);
+		int depth;
+
+		depth = reiserfs_write_lock_once(inode->i_sb);
 		error = journal_begin(&th, inode->i_sb, size * 2);
 		if (!error) {
 			int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
 			if (error2)
 				error = error2;
 		}
-		reiserfs_write_unlock(inode->i_sb);
+		reiserfs_write_unlock_once(inode->i_sb, depth);
 	}
 	posix_acl_release(clone);
 	return error;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d213546894..2305e3121cb1 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
 static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= romfs_readdir,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations romfs_dir_inode_operations = {
@@ -399,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
 /*
  * return a spent inode to the slab cache
  */
-static void romfs_destroy_inode(struct inode *inode)
+static void romfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
+static void romfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, romfs_i_callback);
+}
+
 /*
  * get filesystem statistics
  */
@@ -551,20 +559,19 @@ error_rsb:
 /*
  * get a superblock for mounting
  */
-static int romfs_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
 {
-	int ret = -EINVAL;
+	struct dentry *ret = ERR_PTR(-EINVAL);
 
 #ifdef CONFIG_ROMFS_ON_MTD
-	ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
-			 mnt);
+	ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-	if (ret == -EINVAL)
-		ret = get_sb_bdev(fs_type, flags, dev_name, data,
-				  romfs_fill_super, mnt);
+	if (ret == ERR_PTR(-EINVAL))
+		ret = mount_bdev(fs_type, flags, dev_name, data,
+				 romfs_fill_super);
 #endif
 	return ret;
 }
@@ -591,7 +598,7 @@ static void romfs_kill_sb(struct super_block *sb)
 static struct file_system_type romfs_fs_type = {
 	.owner		= THIS_MODULE,
	.name		= "romfs",
-	.get_sb		= romfs_get_sb,
+	.mount		= romfs_mount,
 	.kill_sb	= romfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..e56560d2b08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
 	return slack;
 }
 
-static long estimate_accuracy(struct timespec *tv)
+long select_estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
 		rts.tv_sec = rts.tv_nsec = 0;
 
 	if (timeval) {
+		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+			memset(&rtv, 0, sizeof(rtv));
 		rtv.tv_sec = rts.tv_sec;
 		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
 
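
A minimal sketch of the information-leak fix above: on ABIs where struct timeval carries padding, the padding must be zeroed before the structure is copied out, or uninitialized kernel stack bytes reach user space; example_put_tv() is hypothetical:

	#include <linux/string.h>
	#include <linux/time.h>
	#include <linux/uaccess.h>

	static int example_put_tv(struct timeval __user *p, const struct timespec *ts)
	{
		struct timeval tv;

		if (sizeof(tv) > sizeof(tv.tv_sec) + sizeof(tv.tv_usec))
			memset(&tv, 0, sizeof(tv));	/* clear any padding */
		tv.tv_sec = ts->tv_sec;
		tv.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
		return copy_to_user(p, &tv, sizeof(tv)) ? -EFAULT : 0;
	}
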
@@ -417,7 +419,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	retval = 0;
 	for (;;) {
@@ -769,7 +771,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	for (;;) {
 		struct poll_list *walk;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3c..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
  */
 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	size_t copied = 0;
 	loff_t pos;
 	size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
  */
 loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
  */
 int seq_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	kfree(m->buf);
 	kfree(m);
 	return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 	if (size) {
 		char *p;
 
-		spin_lock(&dcache_lock);
 		p = __d_path(path, root, buf, size);
-		spin_unlock(&dcache_lock);
 		res = PTR_ERR(p);
 		if (!IS_ERR(p)) {
 			char *end = mangle_path(buf, p, esc);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #ifdef __ARCH_SI_TRAPNO
 		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
 #endif
+#ifdef BUS_MCEERR_AO
+		/*
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
 		break;
 	case __SI_CHLD:
 		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -206,6 +216,7 @@ static const struct file_operations signalfd_fops = {
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
+	.llseek		= noop_llseek,
 };
 
 SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
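
A minimal sketch of why .llseek lines are being added across this series: with the BKL removed from the llseek default path, every file_operations is expected to pick a seek behavior explicitly, and a non-seekable fd such as signalfd chooses noop_llseek(), which succeeds without moving f_pos; example_fops/example_read are hypothetical:

	#include <linux/fs.h>

	static ssize_t example_read(struct file *f, char __user *b, size_t n, loff_t *pos);

	static const struct file_operations example_fops = {
		.read	= example_read,
		.llseek	= noop_llseek,	/* explicit: this fd is not seekable */
	};
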
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2e..000000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-config SMB_FS
-	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-	depends on INET
-	select NLS
-	help
-	  SMB (Server Message Block) is the protocol Windows for Workgroups
-	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-	  files and printers over local networks. Saying Y here allows you to
-	  mount their file systems (often called "shares" in this context) and
-	  access them just like any other Unix directory. Currently, this
-	  works only if the Windows machines use TCP/IP as the underlying
-	  transport protocol, and not NetBEUI. For details, read
-	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>.
-
-	  Note: if you just want your box to act as an SMB *server* and make
-	  files and printing services available to Windows clients (which need
-	  to have a TCP/IP stack), you don't need to say Y here; you can use
-	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-	  for that.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile the SMB support as a module, choose M here:
-	  the module will be called smbfs. Most people say N, however.
-
-config SMB_NLS_DEFAULT
-	bool "Use a default NLS"
-	depends on SMB_FS
-	help
-	  Enabling this will make smbfs use nls translations by default. You
-	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-	  settings and you need to give the default nls for the SMB server as
-	  CONFIG_SMB_NLS_REMOTE.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
-config SMB_NLS_REMOTE
-	string "Default Remote NLS Option"
-	depends on SMB_NLS_DEFAULT
-	default "cp437"
-	help
-	  This setting allows you to specify a default value for which
-	  codepage the server uses. If this field is left blank no
-	  translations will be done by default. The local codepage/charset
-	  default to CONFIG_NLS_DEFAULT.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c3..000000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Makefile for the linux smb-filesystem routines.
-#
-
-obj-$(CONFIG_SMB_FS) += smbfs.o
-
-smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
-		symlink.o smbiod.o request.o
-
-# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
-# SMBFS_PARANOIA should normally be enabled.
-
-EXTRA_CFLAGS += -DSMBFS_PARANOIA
-#EXTRA_CFLAGS += -DSMBFS_DEBUG
-#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
-#EXTRA_CFLAGS += -Werror
-
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e344..000000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * cache.c
- *
- * Copyright (C) 1997 by Bill Hawes
- *
- * Routines to support directory cacheing using the page cache.
- * This cache code is almost directly taken from ncpfs.
- *
- * Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smb_fs.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-
-#include <asm/page.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-/*
- * Force the next attempt to use the cache to be a timeout.
- * If we can't find the page that's fine, it will cause a refresh.
- */
-void
-smb_invalid_dir_cache(struct inode * dir)
-{
-	struct smb_sb_info *server = server_from_inode(dir);
-	union smb_dir_cache *cache = NULL;
-	struct page *page = NULL;
-
-	page = grab_cache_page(&dir->i_data, 0);
-	if (!page)
-		goto out;
-
-	if (!PageUptodate(page))
-		goto out_unlock;
-
-	cache = kmap(page);
-	cache->head.time = jiffies - SMB_MAX_AGE(server);
-
-	kunmap(page);
-	SetPageUptodate(page);
-out_unlock:
-	unlock_page(page);
-	page_cache_release(page);
-out:
-	return;
-}
-
-/*
- * Mark all dentries for 'parent' as invalid, forcing them to be re-read
- */
-void
-smb_invalidate_dircache_entries(struct dentry *parent)
-{
-	struct smb_sb_info *server = server_from_dentry(parent);
-	struct list_head *next;
-	struct dentry *dentry;
-
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_u.d_child);
-		dentry->d_fsdata = NULL;
-		smb_age_dentry(server, dentry);
-		next = next->next;
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * dget, but require that fpos and parent matches what the dentry contains.
- * dentry is not known to be a valid pointer at entry.
- */
-struct dentry *
-smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
-	struct dentry *dent = dentry;
-	struct list_head *next;
-
-	if (d_validate(dent, parent)) {
-		if (dent->d_name.len <= SMB_MAXNAMELEN &&
-		    (unsigned long)dent->d_fsdata == fpos) {
-			if (!dent->d_inode) {
-				dput(dent);
-				dent = NULL;
-			}
-			return dent;
-		}
-		dput(dent);
-	}
-
-	/* If a pointer is invalid, we search the dentry. */
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_u.d_child);
-		if ((unsigned long)dent->d_fsdata == fpos) {
-			if (dent->d_inode)
-				dget_locked(dent);
-			else
-				dent = NULL;
-			goto out_unlock;
-		}
-		next = next->next;
-	}
-	dent = NULL;
-out_unlock:
-	spin_unlock(&dcache_lock);
-	return dent;
-}
-
-
-/*
- * Create dentry/inode for this file and add it to the dircache.
- */
-int
-smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	       struct smb_cache_control *ctrl, struct qstr *qname,
-	       struct smb_fattr *entry)
-{
-	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
-	struct smb_cache_control ctl = *ctrl;
-	int valid = 0;
-	int hashed = 0;
-	ino_t ino = 0;
-
-	qname->hash = full_name_hash(qname->name, qname->len);
-
-	if (dentry->d_op && dentry->d_op->d_hash)
-		if (dentry->d_op->d_hash(dentry, qname) != 0)
-			goto end_advance;
-
-	newdent = d_lookup(dentry, qname);
-
-	if (!newdent) {
-		newdent = d_alloc(dentry, qname);
-		if (!newdent)
-			goto end_advance;
-	} else {
-		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname->name,
-		       newdent->d_name.len);
-	}
-
-	if (!newdent->d_inode) {
-		smb_renew_times(newdent);
-		entry->f_ino = iunique(inode->i_sb, 2);
-		newino = smb_iget(inode->i_sb, entry);
-		if (newino) {
-			smb_new_dentry(newdent);
-			d_instantiate(newdent, newino);
-			if (!hashed)
-				d_rehash(newdent);
-		}
-	} else
-		smb_set_inode_attr(newdent->d_inode, entry);
-
-	if (newdent->d_inode) {
-		ino = newdent->d_inode->i_ino;
-		newdent->d_fsdata = (void *) ctl.fpos;
-		smb_new_dentry(newdent);
-	}
-
-	if (ctl.idx >= SMB_DIRCACHE_SIZE) {
-		if (ctl.page) {
-			kunmap(ctl.page);
-			SetPageUptodate(ctl.page);
-			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
-		}
-		ctl.cache = NULL;
-		ctl.idx -= SMB_DIRCACHE_SIZE;
-		ctl.ofs += 1;
-		ctl.page = grab_cache_page(&inode->i_data, ctl.ofs);
-		if (ctl.page)
-			ctl.cache = kmap(ctl.page);
-	}
-	if (ctl.cache) {
-		ctl.cache->dentry[ctl.idx] = newdent;
-		valid = 1;
-	}
-	dput(newdent);
-
-end_advance:
-	if (!valid)
-		ctl.valid = 0;
-	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-		if (!ino)
-			ino = find_inode_number(dentry, qname);
-		if (!ino)
-			ino = iunique(inode->i_sb, 2);
-		ctl.filled = filldir(dirent, qname->name, qname->len,
-				     filp->f_pos, ino, DT_UNKNOWN);
-		if (!ctl.filled)
-			filp->f_pos += 1;
-	}
-	ctl.fpos += 1;
-	ctl.idx += 1;
-	*ctrl = ctl;
-	return (ctl.valid || !ctl.filled);
-}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f36..000000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
1/*
2 * dir.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/errno.h>
12#include <linux/kernel.h>
13#include <linux/smp_lock.h>
14#include <linux/ctype.h>
15#include <linux/net.h>
16#include <linux/sched.h>
17
18#include <linux/smb_fs.h>
19#include <linux/smb_mount.h>
20#include <linux/smbno.h>
21
22#include "smb_debug.h"
23#include "proto.h"
24
25static int smb_readdir(struct file *, void *, filldir_t);
26static int smb_dir_open(struct inode *, struct file *);
27
28static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
29static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
30static int smb_mkdir(struct inode *, struct dentry *, int);
31static int smb_rmdir(struct inode *, struct dentry *);
32static int smb_unlink(struct inode *, struct dentry *);
33static int smb_rename(struct inode *, struct dentry *,
34 struct inode *, struct dentry *);
35static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
36static int smb_link(struct dentry *, struct inode *, struct dentry *);
37
38const struct file_operations smb_dir_operations =
39{
40 .llseek = generic_file_llseek,
41 .read = generic_read_dir,
42 .readdir = smb_readdir,
43 .unlocked_ioctl = smb_ioctl,
44 .open = smb_dir_open,
45};
46
47const struct inode_operations smb_dir_inode_operations =
48{
49 .create = smb_create,
50 .lookup = smb_lookup,
51 .unlink = smb_unlink,
52 .mkdir = smb_mkdir,
53 .rmdir = smb_rmdir,
54 .rename = smb_rename,
55 .getattr = smb_getattr,
56 .setattr = smb_notify_change,
57};
58
59const struct inode_operations smb_dir_inode_operations_unix =
60{
61 .create = smb_create,
62 .lookup = smb_lookup,
63 .unlink = smb_unlink,
64 .mkdir = smb_mkdir,
65 .rmdir = smb_rmdir,
66 .rename = smb_rename,
67 .getattr = smb_getattr,
68 .setattr = smb_notify_change,
69 .symlink = smb_symlink,
70 .mknod = smb_make_node,
71 .link = smb_link,
72};
73
74/*
75 * Read a directory, using filldir to fill the dirent memory.
76 * smb_proc_readdir does the actual reading from the smb server.
77 *
78 * The cache code is almost directly taken from ncpfs
79 */
80static int
81smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
82{
83 struct dentry *dentry = filp->f_path.dentry;
84 struct inode *dir = dentry->d_inode;
85 struct smb_sb_info *server = server_from_dentry(dentry);
86 union smb_dir_cache *cache = NULL;
87 struct smb_cache_control ctl;
88 struct page *page = NULL;
89 int result;
90
91 ctl.page = NULL;
92 ctl.cache = NULL;
93
94 VERBOSE("reading %s/%s, f_pos=%d\n",
95 DENTRY_PATH(dentry), (int) filp->f_pos);
96
97 result = 0;
98
99 lock_kernel();
100
101 switch ((unsigned int) filp->f_pos) {
102 case 0:
103 if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
104 goto out;
105 filp->f_pos = 1;
106 /* fallthrough */
107 case 1:
108 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
109 goto out;
110 filp->f_pos = 2;
111 }
112
113 /*
114 * Make sure our inode is up-to-date.
115 */
116 result = smb_revalidate_inode(dentry);
117 if (result)
118 goto out;
119
120
121 page = grab_cache_page(&dir->i_data, 0);
122 if (!page)
123 goto read_really;
124
125 ctl.cache = cache = kmap(page);
126 ctl.head = cache->head;
127
128 if (!PageUptodate(page) || !ctl.head.eof) {
129 VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
130 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
131 goto init_cache;
132 }
133
134 if (filp->f_pos == 2) {
135 if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
136 goto init_cache;
137
138 /*
139 * N.B. ncpfs checks mtime of dentry too here, we don't.
140 * 1. common smb servers do not update mtime on dir changes
141 * 2. it requires an extra smb request
142 * (revalidate has the same timeout as ctl.head.time)
143 *
144 * Instead smbfs invalidates its own cache on local changes
145 * and remote changes are not seen until timeout.
146 */
147 }
148
149 if (filp->f_pos > ctl.head.end)
150 goto finished;
151
152 ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
153 ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE;
154 ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE;
155
156 for (;;) {
157 if (ctl.ofs != 0) {
158 ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
159 if (!ctl.page)
160 goto invalid_cache;
161 ctl.cache = kmap(ctl.page);
162 if (!PageUptodate(ctl.page))
163 goto invalid_cache;
164 }
165 while (ctl.idx < SMB_DIRCACHE_SIZE) {
166 struct dentry *dent;
167 int res;
168
169 dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
170 dentry, filp->f_pos);
171 if (!dent)
172 goto invalid_cache;
173
174 res = filldir(dirent, dent->d_name.name,
175 dent->d_name.len, filp->f_pos,
176 dent->d_inode->i_ino, DT_UNKNOWN);
177 dput(dent);
178 if (res)
179 goto finished;
180 filp->f_pos += 1;
181 ctl.idx += 1;
182 if (filp->f_pos > ctl.head.end)
183 goto finished;
184 }
185 if (ctl.page) {
186 kunmap(ctl.page);
187 SetPageUptodate(ctl.page);
188 unlock_page(ctl.page);
189 page_cache_release(ctl.page);
190 ctl.page = NULL;
191 }
192 ctl.idx = 0;
193 ctl.ofs += 1;
194 }
195invalid_cache:
196 if (ctl.page) {
197 kunmap(ctl.page);
198 unlock_page(ctl.page);
199 page_cache_release(ctl.page);
200 ctl.page = NULL;
201 }
202 ctl.cache = cache;
203init_cache:
204 smb_invalidate_dircache_entries(dentry);
205 ctl.head.time = jiffies;
206 ctl.head.eof = 0;
207 ctl.fpos = 2;
208 ctl.ofs = 0;
209 ctl.idx = SMB_DIRCACHE_START;
210 ctl.filled = 0;
211 ctl.valid = 1;
212read_really:
213 result = server->ops->readdir(filp, dirent, filldir, &ctl);
214 if (result == -ERESTARTSYS && page)
215 ClearPageUptodate(page);
216 if (ctl.idx == -1)
217 goto invalid_cache; /* retry */
218 ctl.head.end = ctl.fpos - 1;
219 ctl.head.eof = ctl.valid;
220finished:
221 if (page) {
222 cache->head = ctl.head;
223 kunmap(page);
224 if (result != -ERESTARTSYS)
225 SetPageUptodate(page);
226 unlock_page(page);
227 page_cache_release(page);
228 }
229 if (ctl.page) {
230 kunmap(ctl.page);
231 SetPageUptodate(ctl.page);
232 unlock_page(ctl.page);
233 page_cache_release(ctl.page);
234 }
235out:
236 unlock_kernel();
237 return result;
238}
239
240static int
241smb_dir_open(struct inode *dir, struct file *file)
242{
243 struct dentry *dentry = file->f_path.dentry;
244 struct smb_sb_info *server;
245 int error = 0;
246
247 VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
248 file->f_path.dentry->d_name.name);
249
250 /*
251 * Directory timestamps in the core protocol aren't updated
252 * when a file is added, so we give them a very short TTL.
253 */
254 lock_kernel();
255 server = server_from_dentry(dentry);
256 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
257 unsigned long age = jiffies - SMB_I(dir)->oldmtime;
258 if (age > 2*HZ)
259 smb_invalid_dir_cache(dir);
260 }
261
262 /*
263 * Note: in order to allow the smbmount process to open the
264 * mount point, we only revalidate if the connection is valid or
265 * if the process is trying to access something other than the root.
266 */
267 if (server->state == CONN_VALID || !IS_ROOT(dentry))
268 error = smb_revalidate_inode(dentry);
269 unlock_kernel();
270 return error;
271}
272
273/*
274 * Dentry operations routines
275 */
276static int smb_lookup_validate(struct dentry *, struct nameidata *);
277static int smb_hash_dentry(struct dentry *, struct qstr *);
278static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
279static int smb_delete_dentry(struct dentry *);
280
281static const struct dentry_operations smbfs_dentry_operations =
282{
283 .d_revalidate = smb_lookup_validate,
284 .d_hash = smb_hash_dentry,
285 .d_compare = smb_compare_dentry,
286 .d_delete = smb_delete_dentry,
287};
288
289static const struct dentry_operations smbfs_dentry_operations_case =
290{
291 .d_revalidate = smb_lookup_validate,
292 .d_delete = smb_delete_dentry,
293};
294
295
296/*
297 * This is the callback when the dcache has a lookup hit.
298 */
299static int
300smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
301{
302 struct smb_sb_info *server = server_from_dentry(dentry);
303 struct inode * inode = dentry->d_inode;
304 unsigned long age = jiffies - dentry->d_time;
305 int valid;
306
307 /*
308 * The default validation is based on dentry age:
309 * we believe in dentries for a few seconds. (But each
310 * successful server lookup renews the timestamp.)
311 */
312 valid = (age <= SMB_MAX_AGE(server));
313#ifdef SMBFS_DEBUG_VERBOSE
314 if (!valid)
315 VERBOSE("%s/%s not valid, age=%lu\n",
316 DENTRY_PATH(dentry), age);
317#endif
318
319 if (inode) {
320 lock_kernel();
321 if (is_bad_inode(inode)) {
322 PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
323 valid = 0;
324 } else if (!valid)
325 valid = (smb_revalidate_inode(dentry) == 0);
326 unlock_kernel();
327 } else {
328 /*
329 * What should we do for negative dentries?
330 */
331 }
332 return valid;
333}
334
335static int
336smb_hash_dentry(struct dentry *dir, struct qstr *this)
337{
338 unsigned long hash;
339 int i;
340
341 hash = init_name_hash();
342 for (i=0; i < this->len ; i++)
343 hash = partial_name_hash(tolower(this->name[i]), hash);
344 this->hash = end_name_hash(hash);
345
346 return 0;
347}
348
349static int
350smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
351{
352 int i, result = 1;
353
354 if (a->len != b->len)
355 goto out;
356 for (i=0; i < a->len; i++) {
357 if (tolower(a->name[i]) != tolower(b->name[i]))
358 goto out;
359 }
360 result = 0;
361out:
362 return result;
363}
364
365/*
366 * This is the callback from dput() when d_count is going to 0.
367 * We use this to unhash dentries with bad inodes.
368 */
369static int
370smb_delete_dentry(struct dentry * dentry)
371{
372 if (dentry->d_inode) {
373 if (is_bad_inode(dentry->d_inode)) {
374 PARANOIA("bad inode, unhashing %s/%s\n",
375 DENTRY_PATH(dentry));
376 return 1;
377 }
378 } else {
379 /* N.B. Unhash negative dentries? */
380 }
381 return 0;
382}
383
384/*
385 * Initialize a new dentry
386 */
387void
388smb_new_dentry(struct dentry *dentry)
389{
390 struct smb_sb_info *server = server_from_dentry(dentry);
391
392 if (server->mnt->flags & SMB_MOUNT_CASE)
393 dentry->d_op = &smbfs_dentry_operations_case;
394 else
395 dentry->d_op = &smbfs_dentry_operations;
396 dentry->d_time = jiffies;
397}
398
399
400/*
401 * Whenever a lookup succeeds, we know the parent directories
402 * are all valid, so we want to update the dentry timestamps.
403 * N.B. Move this to dcache?
404 */
405void
406smb_renew_times(struct dentry * dentry)
407{
408 dget(dentry);
409 spin_lock(&dentry->d_lock);
410 for (;;) {
411 struct dentry *parent;
412
413 dentry->d_time = jiffies;
414 if (IS_ROOT(dentry))
415 break;
416 parent = dentry->d_parent;
417 dget(parent);
418 spin_unlock(&dentry->d_lock);
419 dput(dentry);
420 dentry = parent;
421 spin_lock(&dentry->d_lock);
422 }
423 spin_unlock(&dentry->d_lock);
424 dput(dentry);
425}
426
427static struct dentry *
428smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
429{
430 struct smb_fattr finfo;
431 struct inode *inode;
432 int error;
433 struct smb_sb_info *server;
434
435 error = -ENAMETOOLONG;
436 if (dentry->d_name.len > SMB_MAXNAMELEN)
437 goto out;
438
439 /* Do not allow lookup of names with backslashes in */
440 error = -EINVAL;
441 if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
442 goto out;
443
444 lock_kernel();
445 error = smb_proc_getattr(dentry, &finfo);
446#ifdef SMBFS_PARANOIA
447 if (error && error != -ENOENT)
448 PARANOIA("find %s/%s failed, error=%d\n",
449 DENTRY_PATH(dentry), error);
450#endif
451
452 inode = NULL;
453 if (error == -ENOENT)
454 goto add_entry;
455 if (!error) {
456 error = -EACCES;
457 finfo.f_ino = iunique(dentry->d_sb, 2);
458 inode = smb_iget(dir->i_sb, &finfo);
459 if (inode) {
460 add_entry:
461 server = server_from_dentry(dentry);
462 if (server->mnt->flags & SMB_MOUNT_CASE)
463 dentry->d_op = &smbfs_dentry_operations_case;
464 else
465 dentry->d_op = &smbfs_dentry_operations;
466
467 d_add(dentry, inode);
468 smb_renew_times(dentry);
469 error = 0;
470 }
471 }
472 unlock_kernel();
473out:
474 return ERR_PTR(error);
475}
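/*
 * Editorial note (not part of the original patch): the add_entry label
 * above is reached two ways.  On success, d_add() attaches the freshly
 * built inode; on -ENOENT the code jumps there with inode still NULL,
 * so d_add(dentry, NULL) caches a negative dentry and error is reset
 * to 0.  Later lookups of the missing name can then fail fast from the
 * dcache, without another round trip to the server, until the dentry
 * ages past SMB_MAX_AGE() and is revalidated.
 */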
476
477/*
478 * This code is common to all routines creating a new inode.
479 */
480static int
481smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
482{
483 struct smb_sb_info *server = server_from_dentry(dentry);
484 struct inode *inode;
485 int error;
486 struct smb_fattr fattr;
487
488 VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
489
490 error = smb_proc_getattr(dentry, &fattr);
491 if (error)
492 goto out_close;
493
494 smb_renew_times(dentry);
495 fattr.f_ino = iunique(dentry->d_sb, 2);
496 inode = smb_iget(dentry->d_sb, &fattr);
497 if (!inode)
498 goto out_no_inode;
499
500 if (have_id) {
501 struct smb_inode_info *ei = SMB_I(inode);
502 ei->fileid = fileid;
503 ei->access = SMB_O_RDWR;
504 ei->open = server->generation;
505 }
506 d_instantiate(dentry, inode);
507out:
508 return error;
509
510out_no_inode:
511 error = -EACCES;
512out_close:
513 if (have_id) {
514 PARANOIA("%s/%s failed, error=%d, closing %u\n",
515 DENTRY_PATH(dentry), error, fileid);
516 smb_close_fileid(dentry, fileid);
517 }
518 goto out;
519}
520
521/* N.B. How should the mode argument be used? */
522static int
523smb_create(struct inode *dir, struct dentry *dentry, int mode,
524 struct nameidata *nd)
525{
526 struct smb_sb_info *server = server_from_dentry(dentry);
527 __u16 fileid;
528 int error;
529 struct iattr attr;
530
531 VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
532
533 lock_kernel();
534 smb_invalid_dir_cache(dir);
535 error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
536 if (!error) {
537 if (server->opt.capabilities & SMB_CAP_UNIX) {
538 /* Set attributes for new file */
539 attr.ia_valid = ATTR_MODE;
540 attr.ia_mode = mode;
541 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
542 }
543 error = smb_instantiate(dentry, fileid, 1);
544 } else {
545 PARANOIA("%s/%s failed, error=%d\n",
546 DENTRY_PATH(dentry), error);
547 }
548 unlock_kernel();
549 return error;
550}
551
552/* N.B. How should the mode argument be used? */
553static int
554smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
555{
556 struct smb_sb_info *server = server_from_dentry(dentry);
557 int error;
558 struct iattr attr;
559
560 lock_kernel();
561 smb_invalid_dir_cache(dir);
562 error = smb_proc_mkdir(dentry);
563 if (!error) {
564 if (server->opt.capabilities & SMB_CAP_UNIX) {
565 /* Set attributes for new directory */
566 attr.ia_valid = ATTR_MODE;
567 attr.ia_mode = mode;
568 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
569 }
570 error = smb_instantiate(dentry, 0, 0);
571 }
572 unlock_kernel();
573 return error;
574}
575
576static int
577smb_rmdir(struct inode *dir, struct dentry *dentry)
578{
579 struct inode *inode = dentry->d_inode;
580 int error;
581
582 /*
583 * Close the directory if it's open.
584 */
585 lock_kernel();
586 smb_close(inode);
587
588 /*
 589 * Check that nobody else is using the directory.
590 */
591 error = -EBUSY;
592 if (!d_unhashed(dentry))
593 goto out;
594
595 smb_invalid_dir_cache(dir);
596 error = smb_proc_rmdir(dentry);
597
598out:
599 unlock_kernel();
600 return error;
601}
602
603static int
604smb_unlink(struct inode *dir, struct dentry *dentry)
605{
606 int error;
607
608 /*
609 * Close the file if it's open.
610 */
611 lock_kernel();
612 smb_close(dentry->d_inode);
613
614 smb_invalid_dir_cache(dir);
615 error = smb_proc_unlink(dentry);
616 if (!error)
617 smb_renew_times(dentry);
618 unlock_kernel();
619 return error;
620}
621
622static int
623smb_rename(struct inode *old_dir, struct dentry *old_dentry,
624 struct inode *new_dir, struct dentry *new_dentry)
625{
626 int error;
627
628 /*
629 * Close any open files, and check whether to delete the
630 * target before attempting the rename.
631 */
632 lock_kernel();
633 if (old_dentry->d_inode)
634 smb_close(old_dentry->d_inode);
635 if (new_dentry->d_inode) {
636 smb_close(new_dentry->d_inode);
637 error = smb_proc_unlink(new_dentry);
638 if (error) {
639 VERBOSE("unlink %s/%s, error=%d\n",
640 DENTRY_PATH(new_dentry), error);
641 goto out;
642 }
643 /* FIXME */
644 d_delete(new_dentry);
645 }
646
647 smb_invalid_dir_cache(old_dir);
648 smb_invalid_dir_cache(new_dir);
649 error = smb_proc_mv(old_dentry, new_dentry);
650 if (!error) {
651 smb_renew_times(old_dentry);
652 smb_renew_times(new_dentry);
653 }
654out:
655 unlock_kernel();
656 return error;
657}
658
659/*
660 * FIXME: samba servers won't let you create device nodes unless uid/gid
661 * matches the connection credentials (and we don't know which those are ...)
662 */
663static int
664smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
665{
666 int error;
667 struct iattr attr;
668
669 attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
670 attr.ia_mode = mode;
671 current_euid_egid(&attr.ia_uid, &attr.ia_gid);
672
673 if (!new_valid_dev(dev))
674 return -EINVAL;
675
676 smb_invalid_dir_cache(dir);
677 error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
678 if (!error) {
679 error = smb_instantiate(dentry, 0, 0);
680 }
681 return error;
682}
683
684/*
685 * dentry = existing file
686 * new_dentry = new file
687 */
688static int
689smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
690{
691 int error;
692
693 DEBUG1("smb_link old=%s/%s new=%s/%s\n",
694 DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
695 smb_invalid_dir_cache(dir);
696 error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
697 if (!error) {
698 smb_renew_times(dentry);
699 error = smb_instantiate(new_dentry, 0, 0);
700 }
701 return error;
702}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94bb..000000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
1/*
2 * file.c
3 *
4 * Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/fcntl.h>
14#include <linux/stat.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/net.h>
19#include <linux/aio.h>
20
21#include <asm/uaccess.h>
22#include <asm/system.h>
23
24#include <linux/smbno.h>
25#include <linux/smb_fs.h>
26
27#include "smb_debug.h"
28#include "proto.h"
29
30static int
31smb_fsync(struct file *file, int datasync)
32{
33 struct dentry *dentry = file->f_path.dentry;
34 struct smb_sb_info *server = server_from_dentry(dentry);
35 int result;
36
37 VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
38
39 /*
40 * The VFS will writepage() all dirty pages for us, but we
41 * should send a SMBflush to the server, letting it know that
42 * we want things synchronized with actual storage.
43 *
44 * Note: this function requires all pages to have been written already
45 * (should be ok with writepage_sync)
46 */
47 result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
48 return result;
49}
50
51/*
52 * Read a page synchronously.
53 */
54static int
55smb_readpage_sync(struct dentry *dentry, struct page *page)
56{
57 char *buffer = kmap(page);
58 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
59 struct smb_sb_info *server = server_from_dentry(dentry);
60 unsigned int rsize = smb_get_rsize(server);
61 int count = PAGE_SIZE;
62 int result;
63
64 VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
65 DENTRY_PATH(dentry), count, offset, rsize);
66
67 result = smb_open(dentry, SMB_O_RDONLY);
68 if (result < 0)
69 goto io_error;
70
71 do {
72 if (count < rsize)
73 rsize = count;
74
75 result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
76 if (result < 0)
77 goto io_error;
78
79 count -= result;
80 offset += result;
81 buffer += result;
82 dentry->d_inode->i_atime =
83 current_fs_time(dentry->d_inode->i_sb);
84 if (result < rsize)
85 break;
86 } while (count);
87
88 memset(buffer, 0, count);
89 flush_dcache_page(page);
90 SetPageUptodate(page);
91 result = 0;
92
93io_error:
94 kunmap(page);
95 unlock_page(page);
96 return result;
97}
98
99/*
100 * We are called with the page locked and we unlock it when done.
101 */
102static int
103smb_readpage(struct file *file, struct page *page)
104{
105 int error;
106 struct dentry *dentry = file->f_path.dentry;
107
108 page_cache_get(page);
109 error = smb_readpage_sync(dentry, page);
110 page_cache_release(page);
111 return error;
112}
113
114/*
115 * Write a page synchronously.
116 * Offset is the data offset within the page.
117 */
118static int
119smb_writepage_sync(struct inode *inode, struct page *page,
120 unsigned long pageoffset, unsigned int count)
121{
122 loff_t offset;
123 char *buffer = kmap(page) + pageoffset;
124 struct smb_sb_info *server = server_from_inode(inode);
125 unsigned int wsize = smb_get_wsize(server);
126 int ret = 0;
127
128 offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
129 VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
130 inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
131
132 do {
133 int write_ret;
134
135 if (count < wsize)
136 wsize = count;
137
138 write_ret = server->ops->write(inode, offset, wsize, buffer);
139 if (write_ret < 0) {
140 PARANOIA("failed write, wsize=%d, write_ret=%d\n",
141 wsize, write_ret);
142 ret = write_ret;
143 break;
144 }
145 /* N.B. what if result < wsize?? */
146#ifdef SMBFS_PARANOIA
147 if (write_ret < wsize)
148 PARANOIA("short write, wsize=%d, write_ret=%d\n",
149 wsize, write_ret);
150#endif
151 buffer += wsize;
152 offset += wsize;
153 count -= wsize;
154 /*
155 * Update the inode now rather than waiting for a refresh.
156 */
157 inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
158 SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
159 if (offset > inode->i_size)
160 inode->i_size = offset;
161 } while (count);
162
163 kunmap(page);
164 return ret;
165}
166
167/*
168 * Write a page to the server. This will be used for NFS swapping only
169 * (for now), and we currently do this synchronously only.
170 *
171 * We are called with the page locked and we unlock it when done.
172 */
173static int
174smb_writepage(struct page *page, struct writeback_control *wbc)
175{
176 struct address_space *mapping = page->mapping;
177 struct inode *inode;
178 unsigned long end_index;
179 unsigned offset = PAGE_CACHE_SIZE;
180 int err;
181
182 BUG_ON(!mapping);
183 inode = mapping->host;
184 BUG_ON(!inode);
185
186 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
187
188 /* easy case */
189 if (page->index < end_index)
190 goto do_it;
191 /* things got complicated... */
192 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
193 /* OK, are we completely out? */
194 if (page->index >= end_index+1 || !offset)
195 return 0; /* truncated - don't care */
196do_it:
197 page_cache_get(page);
198 err = smb_writepage_sync(inode, page, 0, offset);
199 SetPageUptodate(page);
200 unlock_page(page);
201 page_cache_release(page);
202 return err;
203}
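/*
 * Editorial note (not part of the original patch): a worked example of
 * the end-of-file handling above, assuming PAGE_CACHE_SIZE is 4096 and
 * i_size is 5000 (values chosen for illustration):
 *
 *   end_index = 5000 >> 12 = 1
 *   page 0:  0 < 1, the "easy case", write the full 4096 bytes
 *   page 1:  offset = 5000 & 4095 = 904, write only those 904 bytes
 *   page 2:  2 >= 1 + 1 and past EOF, return 0 (truncated, don't care)
 */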
204
205static int
206smb_updatepage(struct file *file, struct page *page, unsigned long offset,
207 unsigned int count)
208{
209 struct dentry *dentry = file->f_path.dentry;
210
211 DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
212 ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
213
214 return smb_writepage_sync(dentry->d_inode, page, offset, count);
215}
216
217static ssize_t
218smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
219 unsigned long nr_segs, loff_t pos)
220{
221 struct file * file = iocb->ki_filp;
222 struct dentry * dentry = file->f_path.dentry;
223 ssize_t status;
224
225 VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
226 (unsigned long) iocb->ki_left, (unsigned long) pos);
227
228 status = smb_revalidate_inode(dentry);
229 if (status) {
230 PARANOIA("%s/%s validation failed, error=%Zd\n",
231 DENTRY_PATH(dentry), status);
232 goto out;
233 }
234
235 VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
236 (long)dentry->d_inode->i_size,
237 dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
238
239 status = generic_file_aio_read(iocb, iov, nr_segs, pos);
240out:
241 return status;
242}
243
244static int
245smb_file_mmap(struct file * file, struct vm_area_struct * vma)
246{
247 struct dentry * dentry = file->f_path.dentry;
248 int status;
249
250 VERBOSE("file %s/%s, address %lu - %lu\n",
251 DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
252
253 status = smb_revalidate_inode(dentry);
254 if (status) {
255 PARANOIA("%s/%s validation failed, error=%d\n",
256 DENTRY_PATH(dentry), status);
257 goto out;
258 }
259 status = generic_file_mmap(file, vma);
260out:
261 return status;
262}
263
264static ssize_t
265smb_file_splice_read(struct file *file, loff_t *ppos,
266 struct pipe_inode_info *pipe, size_t count,
267 unsigned int flags)
268{
269 struct dentry *dentry = file->f_path.dentry;
270 ssize_t status;
271
272 VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
273 DENTRY_PATH(dentry), *ppos, count);
274
275 status = smb_revalidate_inode(dentry);
276 if (status) {
277 PARANOIA("%s/%s validation failed, error=%Zd\n",
278 DENTRY_PATH(dentry), status);
279 goto out;
280 }
281 status = generic_file_splice_read(file, ppos, pipe, count, flags);
282out:
283 return status;
284}
285
286/*
287 * This does the "real" work of the write. The generic routine has
 288 * allocated the page, locked it, and done all the page alignment
 289 * calculations. Now we should just copy the data from user
 290 * space and write it back to the real medium.
 291 *
 292 * If the writer ends up delaying the write, it needs to
 293 * increment the page use counts until it is done with the page.
294 */
295static int smb_write_begin(struct file *file, struct address_space *mapping,
296 loff_t pos, unsigned len, unsigned flags,
297 struct page **pagep, void **fsdata)
298{
299 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
300 *pagep = grab_cache_page_write_begin(mapping, index, flags);
301 if (!*pagep)
302 return -ENOMEM;
303 return 0;
304}
305
306static int smb_write_end(struct file *file, struct address_space *mapping,
307 loff_t pos, unsigned len, unsigned copied,
308 struct page *page, void *fsdata)
309{
310 int status;
311 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
312
313 lock_kernel();
314 status = smb_updatepage(file, page, offset, copied);
315 unlock_kernel();
316
317 if (!status) {
318 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
319 SetPageUptodate(page);
320 status = copied;
321 }
322
323 unlock_page(page);
324 page_cache_release(page);
325
326 return status;
327}
328
329const struct address_space_operations smb_file_aops = {
330 .readpage = smb_readpage,
331 .writepage = smb_writepage,
332 .write_begin = smb_write_begin,
333 .write_end = smb_write_end,
334};
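/*
 * Editorial note (not part of the original patch): with these aops the
 * buffered write path is driven entirely by generic code.
 * generic_file_aio_write() calls smb_write_begin() to pin and lock the
 * page, copies the user data into it, then calls smb_write_end(),
 * which pushes the bytes to the server synchronously via
 * smb_updatepage() and smb_writepage_sync().  There is no delayed
 * writeback on this path; every write_end is a wire round trip.
 */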
335
336/*
337 * Write to a file (through the page cache).
338 */
339static ssize_t
340smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
341 unsigned long nr_segs, loff_t pos)
342{
343 struct file * file = iocb->ki_filp;
344 struct dentry * dentry = file->f_path.dentry;
345 ssize_t result;
346
347 VERBOSE("file %s/%s, count=%lu@%lu\n",
348 DENTRY_PATH(dentry),
349 (unsigned long) iocb->ki_left, (unsigned long) pos);
350
351 result = smb_revalidate_inode(dentry);
352 if (result) {
353 PARANOIA("%s/%s validation failed, error=%Zd\n",
354 DENTRY_PATH(dentry), result);
355 goto out;
356 }
357
358 result = smb_open(dentry, SMB_O_WRONLY);
359 if (result)
360 goto out;
361
362 if (iocb->ki_left > 0) {
363 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
364 VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
365 (long) file->f_pos, (long) dentry->d_inode->i_size,
366 dentry->d_inode->i_mtime.tv_sec,
367 dentry->d_inode->i_atime.tv_sec);
368 }
369out:
370 return result;
371}
372
373static int
374smb_file_open(struct inode *inode, struct file * file)
375{
376 int result;
377 struct dentry *dentry = file->f_path.dentry;
378 int smb_mode = (file->f_mode & O_ACCMODE) - 1;
379
380 lock_kernel();
381 result = smb_open(dentry, smb_mode);
382 if (result)
383 goto out;
384 SMB_I(inode)->openers++;
385out:
386 unlock_kernel();
387 return result;
388}
389
390static int
391smb_file_release(struct inode *inode, struct file * file)
392{
393 lock_kernel();
394 if (!--SMB_I(inode)->openers) {
395 /* We must flush any dirty pages now as we won't be able to
396 write anything after close. mmap can trigger this.
397 "openers" should perhaps include mmap'ers ... */
398 filemap_write_and_wait(inode->i_mapping);
399 smb_close(inode);
400 }
401 unlock_kernel();
402 return 0;
403}
404
405/*
406 * Check whether the required access is compatible with
407 * an inode's permission. SMB doesn't recognize superuser
408 * privileges, so we need our own check for this.
409 */
410static int
411smb_file_permission(struct inode *inode, int mask)
412{
413 int mode = inode->i_mode;
414 int error = 0;
415
416 VERBOSE("mode=%x, mask=%x\n", mode, mask);
417
418 /* Look at user permissions */
419 mode >>= 6;
420 if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
421 error = -EACCES;
422 return error;
423}
424
425static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
426{
427 loff_t ret;
428 lock_kernel();
429 ret = generic_file_llseek_unlocked(file, offset, origin);
430 unlock_kernel();
431 return ret;
432}
433
434const struct file_operations smb_file_operations =
435{
436 .llseek = smb_remote_llseek,
437 .read = do_sync_read,
438 .aio_read = smb_file_aio_read,
439 .write = do_sync_write,
440 .aio_write = smb_file_aio_write,
441 .unlocked_ioctl = smb_ioctl,
442 .mmap = smb_file_mmap,
443 .open = smb_file_open,
444 .release = smb_file_release,
445 .fsync = smb_fsync,
446 .splice_read = smb_file_splice_read,
447};
448
449const struct inode_operations smb_file_inode_operations =
450{
451 .permission = smb_file_permission,
452 .getattr = smb_getattr,
453 .setattr = smb_notify_change,
454};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab1..000000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
1/*
2 * getopt.c
3 */
4
5#include <linux/kernel.h>
6#include <linux/string.h>
7#include <linux/net.h>
8
9#include "getopt.h"
10
11/**
12 * smb_getopt - option parser
13 * @caller: name of the caller, for error messages
14 * @options: the options string
15 * @opts: an array of &struct option entries controlling parser operations
16 * @optopt: output; will contain the current option
17 * @optarg: output; will contain the value (if one exists)
18 * @flag: output; may be NULL; should point to a long for or'ing flags
19 * @value: output; may be NULL; will be overwritten with the integer value
20 * of the current argument.
21 *
 22 * Helper to parse options in the format used by mount ("a=b,c=d,e,f").
23 * Returns opts->val if a matching entry in the 'opts' array is found,
24 * 0 when no more tokens are found, -1 if an error is encountered.
25 */
26int smb_getopt(char *caller, char **options, struct option *opts,
27 char **optopt, char **optarg, unsigned long *flag,
28 unsigned long *value)
29{
30 char *token;
31 char *val;
32 int i;
33
34 do {
35 if ((token = strsep(options, ",")) == NULL)
36 return 0;
37 } while (*token == '\0');
38 *optopt = token;
39
40 *optarg = NULL;
41 if ((val = strchr (token, '=')) != NULL) {
42 *val++ = 0;
43 if (value)
44 *value = simple_strtoul(val, NULL, 0);
45 *optarg = val;
46 }
47
48 for (i = 0; opts[i].name != NULL; i++) {
49 if (!strcmp(opts[i].name, token)) {
50 if (!opts[i].flag && (!val || !*val)) {
51 printk("%s: the %s option requires an argument\n",
52 caller, token);
53 return -1;
54 }
55
56 if (flag && opts[i].flag)
57 *flag |= opts[i].flag;
58
59 return opts[i].val;
60 }
61 }
62 printk("%s: Unrecognized mount option %s\n", caller, token);
63 return -1;
64}
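/*
 * Editorial note (sketch, not part of the original patch): a hedged
 * usage example for smb_getopt().  The demo_opts table, flag bit and
 * input string are invented for illustration; struct option comes from
 * getopt.h below, and the calling convention (loop while the return
 * value is positive, flag options or'ed into *flag, "name=value"
 * options parsed into *optarg/*value) is the one implemented above and
 * relied on by parse_options() in inode.c.
 */
static struct option demo_opts[] = {
	{ "case", 0x0008, 1   },	/* flag option: no argument needed */
	{ "uid",  0,      'u' },	/* value option: requires "uid=N" */
	{ NULL,   0,      0   }
};

static int demo_parse(char *options)	/* e.g. "uid=500,case" */
{
	char *optopt, *optarg;
	unsigned long flags = 0, value = 0;
	unsigned long uid = 0;
	int c;

	while ((c = smb_getopt("demo", &options, demo_opts,
			       &optopt, &optarg, &flags, &value)) > 0) {
		if (c == 'u')
			uid = value;	/* 500 after "uid=500" */
	}
	/* c == 0: all tokens consumed; c == -1: error already logged. */
	/* flags now has 0x0008 set iff "case" appeared in the string. */
	return c;
}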
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c46..000000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef _LINUX_GETOPT_H
2#define _LINUX_GETOPT_H
3
4struct option {
5 const char *name;
6 unsigned long flag;
7 int val;
8};
9
10extern int smb_getopt(char *caller, char **options, struct option *opts,
11 char **optopt, char **optarg, unsigned long *flag,
12 unsigned long *value);
13
14#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 450c91941988..000000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,839 +0,0 @@
1/*
2 * inode.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/module.h>
11#include <linux/time.h>
12#include <linux/kernel.h>
13#include <linux/mm.h>
14#include <linux/string.h>
15#include <linux/stat.h>
16#include <linux/errno.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/file.h>
20#include <linux/dcache.h>
21#include <linux/smp_lock.h>
22#include <linux/nls.h>
23#include <linux/seq_file.h>
24#include <linux/mount.h>
25#include <linux/net.h>
26#include <linux/vfs.h>
27#include <linux/highuid.h>
28#include <linux/sched.h>
29#include <linux/smb_fs.h>
30#include <linux/smbno.h>
31#include <linux/smb_mount.h>
32
33#include <asm/system.h>
34#include <asm/uaccess.h>
35
36#include "smb_debug.h"
37#include "getopt.h"
38#include "proto.h"
39
40/* Always pick a default string */
41#ifdef CONFIG_SMB_NLS_REMOTE
42#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
43#else
44#define SMB_NLS_REMOTE ""
45#endif
46
47#define SMB_TTL_DEFAULT 1000
48
49static void smb_evict_inode(struct inode *);
50static void smb_put_super(struct super_block *);
51static int smb_statfs(struct dentry *, struct kstatfs *);
52static int smb_show_options(struct seq_file *, struct vfsmount *);
53
54static struct kmem_cache *smb_inode_cachep;
55
56static struct inode *smb_alloc_inode(struct super_block *sb)
57{
58 struct smb_inode_info *ei;
59 ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
60 if (!ei)
61 return NULL;
62 return &ei->vfs_inode;
63}
64
65static void smb_destroy_inode(struct inode *inode)
66{
67 kmem_cache_free(smb_inode_cachep, SMB_I(inode));
68}
69
70static void init_once(void *foo)
71{
72 struct smb_inode_info *ei = (struct smb_inode_info *) foo;
73
74 inode_init_once(&ei->vfs_inode);
75}
76
77static int init_inodecache(void)
78{
79 smb_inode_cachep = kmem_cache_create("smb_inode_cache",
80 sizeof(struct smb_inode_info),
81 0, (SLAB_RECLAIM_ACCOUNT|
82 SLAB_MEM_SPREAD),
83 init_once);
84 if (smb_inode_cachep == NULL)
85 return -ENOMEM;
86 return 0;
87}
88
89static void destroy_inodecache(void)
90{
91 kmem_cache_destroy(smb_inode_cachep);
92}
93
94static int smb_remount(struct super_block *sb, int *flags, char *data)
95{
96 *flags |= MS_NODIRATIME;
97 return 0;
98}
99
100static const struct super_operations smb_sops =
101{
102 .alloc_inode = smb_alloc_inode,
103 .destroy_inode = smb_destroy_inode,
104 .drop_inode = generic_delete_inode,
105 .evict_inode = smb_evict_inode,
106 .put_super = smb_put_super,
107 .statfs = smb_statfs,
108 .show_options = smb_show_options,
109 .remount_fs = smb_remount,
110};
111
112
113/* We are always generating a new inode here */
114struct inode *
115smb_iget(struct super_block *sb, struct smb_fattr *fattr)
116{
117 struct smb_sb_info *server = SMB_SB(sb);
118 struct inode *result;
119
120 DEBUG1("smb_iget: %p\n", fattr);
121
122 result = new_inode(sb);
123 if (!result)
124 return result;
125 result->i_ino = fattr->f_ino;
126 SMB_I(result)->open = 0;
127 SMB_I(result)->fileid = 0;
128 SMB_I(result)->access = 0;
129 SMB_I(result)->flags = 0;
130 SMB_I(result)->closed = 0;
131 SMB_I(result)->openers = 0;
132 smb_set_inode_attr(result, fattr);
133 if (S_ISREG(result->i_mode)) {
134 result->i_op = &smb_file_inode_operations;
135 result->i_fop = &smb_file_operations;
136 result->i_data.a_ops = &smb_file_aops;
137 } else if (S_ISDIR(result->i_mode)) {
138 if (server->opt.capabilities & SMB_CAP_UNIX)
139 result->i_op = &smb_dir_inode_operations_unix;
140 else
141 result->i_op = &smb_dir_inode_operations;
142 result->i_fop = &smb_dir_operations;
143 } else if (S_ISLNK(result->i_mode)) {
144 result->i_op = &smb_link_inode_operations;
145 } else {
146 init_special_inode(result, result->i_mode, fattr->f_rdev);
147 }
148 insert_inode_hash(result);
149 return result;
150}
151
152/*
153 * Copy the inode data to a smb_fattr structure.
154 */
155void
156smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
157{
158 memset(fattr, 0, sizeof(struct smb_fattr));
159 fattr->f_mode = inode->i_mode;
160 fattr->f_nlink = inode->i_nlink;
161 fattr->f_ino = inode->i_ino;
162 fattr->f_uid = inode->i_uid;
163 fattr->f_gid = inode->i_gid;
164 fattr->f_size = inode->i_size;
165 fattr->f_mtime = inode->i_mtime;
166 fattr->f_ctime = inode->i_ctime;
167 fattr->f_atime = inode->i_atime;
168 fattr->f_blocks = inode->i_blocks;
169
170 fattr->attr = SMB_I(inode)->attr;
171 /*
172 * Keep the attributes in sync with the inode permissions.
173 */
174 if (fattr->f_mode & S_IWUSR)
175 fattr->attr &= ~aRONLY;
176 else
177 fattr->attr |= aRONLY;
178}
179
180/*
181 * Update the inode, possibly causing it to invalidate its pages if mtime/size
182 * is different from last time.
183 */
184void
185smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
186{
187 struct smb_inode_info *ei = SMB_I(inode);
188
189 /*
190 * A size change should have a different mtime, or same mtime
191 * but different size.
192 */
193 time_t last_time = inode->i_mtime.tv_sec;
194 loff_t last_sz = inode->i_size;
195
196 inode->i_mode = fattr->f_mode;
197 inode->i_nlink = fattr->f_nlink;
198 inode->i_uid = fattr->f_uid;
199 inode->i_gid = fattr->f_gid;
200 inode->i_ctime = fattr->f_ctime;
201 inode->i_blocks = fattr->f_blocks;
202 inode->i_size = fattr->f_size;
203 inode->i_mtime = fattr->f_mtime;
204 inode->i_atime = fattr->f_atime;
205 ei->attr = fattr->attr;
206
207 /*
208 * Update the "last time refreshed" field for revalidation.
209 */
210 ei->oldmtime = jiffies;
211
212 if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
213 VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
214 inode->i_ino,
215 (long) last_time, (long) inode->i_mtime.tv_sec,
216 (long) last_sz, (long) inode->i_size);
217
218 if (!S_ISDIR(inode->i_mode))
219 invalidate_remote_inode(inode);
220 }
221}
222
223/*
224 * This is called if the connection has gone bad ...
225 * try to kill off all the current inodes.
226 */
227void
228smb_invalidate_inodes(struct smb_sb_info *server)
229{
230 VERBOSE("\n");
231 shrink_dcache_sb(SB_of(server));
232 invalidate_inodes(SB_of(server));
233}
234
235/*
236 * This is called to update the inode attributes after
237 * we've made changes to a file or directory.
238 */
239static int
240smb_refresh_inode(struct dentry *dentry)
241{
242 struct inode *inode = dentry->d_inode;
243 int error;
244 struct smb_fattr fattr;
245
246 error = smb_proc_getattr(dentry, &fattr);
247 if (!error) {
248 smb_renew_times(dentry);
249 /*
250 * Check whether the type part of the mode changed,
251 * and don't update the attributes if it did.
252 *
253 * And don't dick with the root inode
254 */
255 if (inode->i_ino == 2)
256 return error;
257 if (S_ISLNK(inode->i_mode))
258 return error; /* VFS will deal with it */
259
260 if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
261 smb_set_inode_attr(inode, &fattr);
262 } else {
263 /*
264 * Big trouble! The inode has become a new object,
265 * so any operations attempted on it are invalid.
266 *
267 * To limit damage, mark the inode as bad so that
268 * subsequent lookup validations will fail.
269 */
270 PARANOIA("%s/%s changed mode, %07o to %07o\n",
271 DENTRY_PATH(dentry),
272 inode->i_mode, fattr.f_mode);
273
274 fattr.f_mode = inode->i_mode; /* save mode */
275 make_bad_inode(inode);
276 inode->i_mode = fattr.f_mode; /* restore mode */
277 /*
278 * No need to worry about unhashing the dentry: the
279 * lookup validation will see that the inode is bad.
280 * But we do want to invalidate the caches ...
281 */
282 if (!S_ISDIR(inode->i_mode))
283 invalidate_remote_inode(inode);
284 else
285 smb_invalid_dir_cache(inode);
286 error = -EIO;
287 }
288 }
289 return error;
290}
291
292/*
293 * This is called when we want to check whether the inode
294 * has changed on the server. If it has changed, we must
295 * invalidate our local caches.
296 */
297int
298smb_revalidate_inode(struct dentry *dentry)
299{
300 struct smb_sb_info *s = server_from_dentry(dentry);
301 struct inode *inode = dentry->d_inode;
302 int error = 0;
303
304 DEBUG1("smb_revalidate_inode\n");
305 lock_kernel();
306
307 /*
308 * Check whether we've recently refreshed the inode.
309 */
310 if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
311 VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
312 inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
313 goto out;
314 }
315
316 error = smb_refresh_inode(dentry);
317out:
318 unlock_kernel();
319 return error;
320}
321
322/*
323 * This routine is called when i_nlink == 0 and i_count goes to 0.
324 * All blocking cleanup operations need to go here to avoid races.
325 */
326static void
327smb_evict_inode(struct inode *ino)
328{
329 DEBUG1("ino=%ld\n", ino->i_ino);
330 truncate_inode_pages(&ino->i_data, 0);
331 end_writeback(ino);
332 lock_kernel();
333 if (smb_close(ino))
334 PARANOIA("could not close inode %ld\n", ino->i_ino);
335 unlock_kernel();
336}
337
338static struct option opts[] = {
339 { "version", 0, 'v' },
340 { "win95", SMB_MOUNT_WIN95, 1 },
341 { "oldattr", SMB_MOUNT_OLDATTR, 1 },
342 { "dirattr", SMB_MOUNT_DIRATTR, 1 },
343 { "case", SMB_MOUNT_CASE, 1 },
344 { "uid", 0, 'u' },
345 { "gid", 0, 'g' },
346 { "file_mode", 0, 'f' },
347 { "dir_mode", 0, 'd' },
348 { "iocharset", 0, 'i' },
349 { "codepage", 0, 'c' },
350 { "ttl", 0, 't' },
351 { NULL, 0, 0}
352};
353
354static int
355parse_options(struct smb_mount_data_kernel *mnt, char *options)
356{
357 int c;
358 unsigned long flags;
359 unsigned long value;
360 char *optarg;
361 char *optopt;
362
363 flags = 0;
364 while ( (c = smb_getopt("smbfs", &options, opts,
365 &optopt, &optarg, &flags, &value)) > 0) {
366
367 VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
368 switch (c) {
369 case 1:
370 /* got a "flag" option */
371 break;
372 case 'v':
373 if (value != SMB_MOUNT_VERSION) {
374 printk ("smbfs: Bad mount version %ld, expected %d\n",
375 value, SMB_MOUNT_VERSION);
376 return 0;
377 }
378 mnt->version = value;
379 break;
380 case 'u':
381 mnt->uid = value;
382 flags |= SMB_MOUNT_UID;
383 break;
384 case 'g':
385 mnt->gid = value;
386 flags |= SMB_MOUNT_GID;
387 break;
388 case 'f':
389 mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
390 flags |= SMB_MOUNT_FMODE;
391 break;
392 case 'd':
393 mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
394 flags |= SMB_MOUNT_DMODE;
395 break;
396 case 'i':
397 strlcpy(mnt->codepage.local_name, optarg,
398 SMB_NLS_MAXNAMELEN);
399 break;
400 case 'c':
401 strlcpy(mnt->codepage.remote_name, optarg,
402 SMB_NLS_MAXNAMELEN);
403 break;
404 case 't':
405 mnt->ttl = value;
406 break;
407 default:
408 printk ("smbfs: Unrecognized mount option %s\n",
409 optopt);
410 return -1;
411 }
412 }
413 mnt->flags = flags;
414 return c;
415}
416
417/*
418 * smb_show_options() is for displaying mount options in /proc/mounts.
419 * It tries to avoid showing settings that were not changed from their
420 * defaults.
421 */
422static int
423smb_show_options(struct seq_file *s, struct vfsmount *m)
424{
425 struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
426 int i;
427
428 for (i = 0; opts[i].name != NULL; i++)
429 if (mnt->flags & opts[i].flag)
430 seq_printf(s, ",%s", opts[i].name);
431
432 if (mnt->flags & SMB_MOUNT_UID)
433 seq_printf(s, ",uid=%d", mnt->uid);
434 if (mnt->flags & SMB_MOUNT_GID)
435 seq_printf(s, ",gid=%d", mnt->gid);
436 if (mnt->mounted_uid != 0)
437 seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
438
439 /*
440 * Defaults for file_mode and dir_mode are unknown to us; they
441 * depend on the current umask of the user doing the mount.
442 */
443 if (mnt->flags & SMB_MOUNT_FMODE)
444 seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
445 if (mnt->flags & SMB_MOUNT_DMODE)
446 seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
447
448 if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
449 seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
450 if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
451 seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
452
453 if (mnt->ttl != SMB_TTL_DEFAULT)
454 seq_printf(s, ",ttl=%d", mnt->ttl);
455
456 return 0;
457}
458
459static void
460smb_unload_nls(struct smb_sb_info *server)
461{
462 unload_nls(server->remote_nls);
463 unload_nls(server->local_nls);
464}
465
466static void
467smb_put_super(struct super_block *sb)
468{
469 struct smb_sb_info *server = SMB_SB(sb);
470
471 lock_kernel();
472
473 smb_lock_server(server);
474 server->state = CONN_INVALID;
475 smbiod_unregister_server(server);
476
477 smb_close_socket(server);
478
479 if (server->conn_pid)
480 kill_pid(server->conn_pid, SIGTERM, 1);
481
482 bdi_destroy(&server->bdi);
483 kfree(server->ops);
484 smb_unload_nls(server);
485 sb->s_fs_info = NULL;
486 smb_unlock_server(server);
487 put_pid(server->conn_pid);
488 kfree(server);
489
490 unlock_kernel();
491}
492
493static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
494{
495 struct smb_sb_info *server;
496 struct smb_mount_data_kernel *mnt;
497 struct smb_mount_data *oldmnt;
498 struct inode *root_inode;
499 struct smb_fattr root;
500 int ver;
501 void *mem;
502 static int warn_count;
503
504 if (warn_count < 5) {
505 warn_count++;
506 printk(KERN_EMERG "smbfs is deprecated and will be removed"
507 " from the 2.6.27 kernel. Please migrate to cifs\n");
508 }
509
510 if (!raw_data)
511 goto out_no_data;
512
513 oldmnt = (struct smb_mount_data *) raw_data;
514 ver = oldmnt->version;
515 if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
516 goto out_wrong_data;
517
518 sb->s_flags |= MS_NODIRATIME;
519 sb->s_blocksize = 1024; /* Eh... Is this correct? */
520 sb->s_blocksize_bits = 10;
521 sb->s_magic = SMB_SUPER_MAGIC;
522 sb->s_op = &smb_sops;
523 sb->s_time_gran = 100;
524
525 server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
526 if (!server)
527 goto out_no_server;
528 sb->s_fs_info = server;
529
530 if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
531 goto out_bdi;
532
533 sb->s_bdi = &server->bdi;
534
535 server->super_block = sb;
536 server->mnt = NULL;
537 server->sock_file = NULL;
538 init_waitqueue_head(&server->conn_wq);
539 init_MUTEX(&server->sem);
540 INIT_LIST_HEAD(&server->entry);
541 INIT_LIST_HEAD(&server->xmitq);
542 INIT_LIST_HEAD(&server->recvq);
543 server->conn_error = 0;
544 server->conn_pid = NULL;
545 server->state = CONN_INVALID; /* no connection yet */
546 server->generation = 0;
547
548 /* Allocate the global temp buffer and some superblock helper structs */
549 /* FIXME: move these to the smb_sb_info struct */
550 VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
551 sizeof(struct smb_mount_data_kernel));
552 mem = kmalloc(sizeof(struct smb_ops) +
553 sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
554 if (!mem)
555 goto out_no_mem;
556
557 server->ops = mem;
558 smb_install_null_ops(server->ops);
559 server->mnt = mem + sizeof(struct smb_ops);
560
561 /* Setup NLS stuff */
562 server->remote_nls = NULL;
563 server->local_nls = NULL;
564
565 mnt = server->mnt;
566
567 memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
568 strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
569 SMB_NLS_MAXNAMELEN);
570 strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
571 SMB_NLS_MAXNAMELEN);
572
573 mnt->ttl = SMB_TTL_DEFAULT;
574 if (ver == SMB_MOUNT_OLDVERSION) {
575 mnt->version = oldmnt->version;
576
577 SET_UID(mnt->uid, oldmnt->uid);
578 SET_GID(mnt->gid, oldmnt->gid);
579
580 mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
581 mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
582
583 mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
584 SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
585 } else {
586 mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
587 S_IROTH | S_IXOTH | S_IFREG;
588 mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
589 S_IROTH | S_IXOTH | S_IFDIR;
590 if (parse_options(mnt, raw_data))
591 goto out_bad_option;
592 }
593 mnt->mounted_uid = current_uid();
594 smb_setcodepage(server, &mnt->codepage);
595
596 /*
597 * Display the enabled options
598 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
599 */
600 if (mnt->flags & SMB_MOUNT_OLDATTR)
601 printk("SMBFS: Using core getattr (Win 95 speedup)\n");
602 else if (mnt->flags & SMB_MOUNT_DIRATTR)
603 printk("SMBFS: Using dir ff getattr\n");
604
605 if (smbiod_register_server(server) < 0) {
606 printk(KERN_ERR "smbfs: failed to start smbiod\n");
607 goto out_no_smbiod;
608 }
609
610 /*
611 * Keep the super block locked while we get the root inode.
612 */
613 smb_init_root_dirent(server, &root, sb);
614 root_inode = smb_iget(sb, &root);
615 if (!root_inode)
616 goto out_no_root;
617
618 sb->s_root = d_alloc_root(root_inode);
619 if (!sb->s_root)
620 goto out_no_root;
621
622 smb_new_dentry(sb->s_root);
623
624 return 0;
625
626out_no_root:
627 iput(root_inode);
628out_no_smbiod:
629 smb_unload_nls(server);
630out_bad_option:
631 kfree(mem);
632out_no_mem:
633 bdi_destroy(&server->bdi);
634out_bdi:
635 if (!server->mnt)
636 printk(KERN_ERR "smb_fill_super: allocation failure\n");
637 sb->s_fs_info = NULL;
638 kfree(server);
639 goto out_fail;
640out_wrong_data:
641 printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
642 goto out_fail;
643out_no_data:
644 printk(KERN_ERR "smb_fill_super: missing data argument\n");
645out_fail:
646 return -EINVAL;
647out_no_server:
648 printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
649 return -ENOMEM;
650}
651
652static int
653smb_statfs(struct dentry *dentry, struct kstatfs *buf)
654{
655 int result;
656
657 lock_kernel();
658
659 result = smb_proc_dskattr(dentry, buf);
660
661 unlock_kernel();
662
663 buf->f_type = SMB_SUPER_MAGIC;
664 buf->f_namelen = SMB_MAXPATHLEN;
665 return result;
666}
667
668int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
669{
670 int err = smb_revalidate_inode(dentry);
671 if (!err)
672 generic_fillattr(dentry->d_inode, stat);
673 return err;
674}
675
676int
677smb_notify_change(struct dentry *dentry, struct iattr *attr)
678{
679 struct inode *inode = dentry->d_inode;
680 struct smb_sb_info *server = server_from_dentry(dentry);
681 unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
682 int error, changed, refresh = 0;
683 struct smb_fattr fattr;
684
685 lock_kernel();
686
687 error = smb_revalidate_inode(dentry);
688 if (error)
689 goto out;
690
691 if ((error = inode_change_ok(inode, attr)) < 0)
692 goto out;
693
694 error = -EPERM;
695 if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
696 goto out;
697
 698	if ((attr->ia_valid & ATTR_GID) && (attr->ia_gid != server->mnt->gid))
699 goto out;
700
701 if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
702 goto out;
703
704 if ((attr->ia_valid & ATTR_SIZE) != 0) {
705 VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
706 DENTRY_PATH(dentry),
707 (long) inode->i_size, (long) attr->ia_size);
708
709 filemap_write_and_wait(inode->i_mapping);
710
711 error = smb_open(dentry, O_WRONLY);
712 if (error)
713 goto out;
714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error)
716 goto out;
717 truncate_setsize(inode, attr->ia_size);
718 refresh = 1;
719 }
720
721 if (server->opt.capabilities & SMB_CAP_UNIX) {
722 /* For now we don't want to set the size with setattr_unix */
723 attr->ia_valid &= ~ATTR_SIZE;
724 /* FIXME: only call if we actually want to set something? */
725 error = smb_proc_setattr_unix(dentry, attr, 0, 0);
726 if (!error)
727 refresh = 1;
728
729 goto out;
730 }
731
732 /*
733 * Initialize the fattr and check for changed fields.
734 * Note: CTIME under SMB is creation time rather than
735 * change time, so we don't attempt to change it.
736 */
737 smb_get_inode_attr(inode, &fattr);
738
739 changed = 0;
740 if ((attr->ia_valid & ATTR_MTIME) != 0) {
741 fattr.f_mtime = attr->ia_mtime;
742 changed = 1;
743 }
744 if ((attr->ia_valid & ATTR_ATIME) != 0) {
745 fattr.f_atime = attr->ia_atime;
746 /* Earlier protocols don't have an access time */
747 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
748 changed = 1;
749 }
750 if (changed) {
751 error = smb_proc_settime(dentry, &fattr);
752 if (error)
753 goto out;
754 refresh = 1;
755 }
756
757 /*
758 * Check for mode changes ... we're extremely limited in
759 * what can be set for SMB servers: just the read-only bit.
760 */
761 if ((attr->ia_valid & ATTR_MODE) != 0) {
762 VERBOSE("%s/%s mode change, old=%x, new=%x\n",
763 DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
764 changed = 0;
765 if (attr->ia_mode & S_IWUSR) {
766 if (fattr.attr & aRONLY) {
767 fattr.attr &= ~aRONLY;
768 changed = 1;
769 }
770 } else {
771 if (!(fattr.attr & aRONLY)) {
772 fattr.attr |= aRONLY;
773 changed = 1;
774 }
775 }
776 if (changed) {
777 error = smb_proc_setattr(dentry, &fattr);
778 if (error)
779 goto out;
780 refresh = 1;
781 }
782 }
783 error = 0;
784
785out:
786 if (refresh)
787 smb_refresh_inode(dentry);
788 unlock_kernel();
789 return error;
790}
791
792static int smb_get_sb(struct file_system_type *fs_type,
793 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
794{
795 return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
796}
797
798static struct file_system_type smb_fs_type = {
799 .owner = THIS_MODULE,
800 .name = "smbfs",
801 .get_sb = smb_get_sb,
802 .kill_sb = kill_anon_super,
803 .fs_flags = FS_BINARY_MOUNTDATA,
804};
805
806static int __init init_smb_fs(void)
807{
808 int err;
809 DEBUG1("registering ...\n");
810
811 err = init_inodecache();
812 if (err)
813 goto out_inode;
814 err = smb_init_request_cache();
815 if (err)
816 goto out_request;
817 err = register_filesystem(&smb_fs_type);
818 if (err)
819 goto out;
820 return 0;
821out:
822 smb_destroy_request_cache();
823out_request:
824 destroy_inodecache();
825out_inode:
826 return err;
827}
828
829static void __exit exit_smb_fs(void)
830{
831 DEBUG1("unregistering ...\n");
832 unregister_filesystem(&smb_fs_type);
833 smb_destroy_request_cache();
834 destroy_inodecache();
835}
836
837module_init(init_smb_fs)
838module_exit(exit_smb_fs)
839MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad39..000000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/*
2 * ioctl.c
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/errno.h>
11#include <linux/fs.h>
12#include <linux/ioctl.h>
13#include <linux/time.h>
14#include <linux/mm.h>
15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
17#include <linux/net.h>
18
19#include <linux/smb_fs.h>
20#include <linux/smb_mount.h>
21
22#include <asm/uaccess.h>
23
24#include "proto.h"
25
26long
27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
28{
29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt;
31 int result = -EINVAL;
32
33 lock_kernel();
34 switch (cmd) {
35 uid16_t uid16;
36 uid_t uid32;
37 case SMB_IOC_GETMOUNTUID:
38 SET_UID(uid16, server->mnt->mounted_uid);
39 result = put_user(uid16, (uid16_t __user *) arg);
40 break;
41 case SMB_IOC_GETMOUNTUID32:
42 SET_UID(uid32, server->mnt->mounted_uid);
43 result = put_user(uid32, (uid_t __user *) arg);
44 break;
45
46 case SMB_IOC_NEWCONN:
47 /* arg is smb_conn_opt, or NULL if no connection was made */
48 if (!arg) {
49 result = 0;
50 smb_lock_server(server);
51 server->state = CONN_RETRIED;
52 printk(KERN_ERR "Connection attempt failed! [%d]\n",
53 server->conn_error);
54 smbiod_flush(server);
55 smb_unlock_server(server);
56 break;
57 }
58
59 result = -EFAULT;
60 if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
61 result = smb_newconn(server, &opt);
62 break;
63 default:
64 break;
65 }
66 unlock_kernel();
67
68 return result;
69}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b4..000000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
1/*
2 * proc.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/types.h>
11#include <linux/capability.h>
12#include <linux/errno.h>
13#include <linux/slab.h>
14#include <linux/fs.h>
15#include <linux/file.h>
16#include <linux/stat.h>
17#include <linux/fcntl.h>
18#include <linux/dcache.h>
19#include <linux/nls.h>
20#include <linux/smp_lock.h>
21#include <linux/net.h>
22#include <linux/vfs.h>
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <net/sock.h>
28
29#include <asm/string.h>
30#include <asm/div64.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
 37/* Features. Undefine if they cause problems; this should perhaps be a
38 config option. */
39#define SMBFS_POSIX_UNLINK 1
40
41/* Allow smb_retry to be interrupted. */
42#define SMB_RETRY_INTR
43
44#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN)
45#define SMB_CMD(packet) (*(packet+8))
46#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1))
47
48#define SMB_DIRINFO_SIZE 43
49#define SMB_STATUS_SIZE 21
50
51#define SMB_ST_BLKSIZE (PAGE_SIZE)
52#define SMB_ST_BLKSHIFT (PAGE_SHIFT)
53
54static struct smb_ops smb_ops_core;
55static struct smb_ops smb_ops_os2;
56static struct smb_ops smb_ops_win95;
57static struct smb_ops smb_ops_winNT;
58static struct smb_ops smb_ops_unix;
59static struct smb_ops smb_ops_null;
60
61static void
62smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
63static void
64smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
65static int
66smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
67 struct smb_fattr *fattr);
68static int
69smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
70 struct smb_fattr *fattr);
71static int
72smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
73 u16 attr);
74static int
75smb_proc_setattr_ext(struct smb_sb_info *server,
76 struct inode *inode, struct smb_fattr *fattr);
77static int
78smb_proc_query_cifsunix(struct smb_sb_info *server);
79static void
80install_ops(struct smb_ops *dst, struct smb_ops *src);
81
82
83static void
84str_upper(char *name, int len)
85{
86 while (len--)
87 {
88 if (*name >= 'a' && *name <= 'z')
89 *name -= ('a' - 'A');
90 name++;
91 }
92}
93
94#if 0
95static void
96str_lower(char *name, int len)
97{
98 while (len--)
99 {
100 if (*name >= 'A' && *name <= 'Z')
101 *name += ('a' - 'A');
102 name++;
103 }
104}
105#endif
106
107/* reverse a string inline. This is used by the dircache walking routines */
108static void reverse_string(char *buf, int len)
109{
110 char c;
111 char *end = buf+len-1;
112
113 while(buf < end) {
114 c = *buf;
115 *(buf++) = *end;
116 *(end--) = c;
117 }
118}
119
120/* no conversion, just a wrapper for memcpy. */
121static int convert_memcpy(unsigned char *output, int olen,
122 const unsigned char *input, int ilen,
123 struct nls_table *nls_from,
124 struct nls_table *nls_to)
125{
126 if (olen < ilen)
127 return -ENAMETOOLONG;
128 memcpy(output, input, ilen);
129 return ilen;
130}
131
132static inline int write_char(unsigned char ch, char *output, int olen)
133{
134 if (olen < 4)
135 return -ENAMETOOLONG;
136 sprintf(output, ":x%02x", ch);
137 return 4;
138}
139
140static inline int write_unichar(wchar_t ch, char *output, int olen)
141{
142 if (olen < 5)
143 return -ENAMETOOLONG;
144 sprintf(output, ":%04x", ch);
145 return 5;
146}
147
148/* convert from one "codepage" to another (possibly being utf8). */
149static int convert_cp(unsigned char *output, int olen,
150 const unsigned char *input, int ilen,
151 struct nls_table *nls_from,
152 struct nls_table *nls_to)
153{
154 int len = 0;
155 int n;
156 wchar_t ch;
157
158 while (ilen > 0) {
159 /* convert by changing to unicode and back to the new cp */
160 n = nls_from->char2uni(input, ilen, &ch);
161 if (n == -EINVAL) {
162 ilen--;
163 n = write_char(*input++, output, olen);
164 if (n < 0)
165 goto fail;
166 output += n;
167 olen -= n;
168 len += n;
169 continue;
170 } else if (n < 0)
171 goto fail;
172 input += n;
173 ilen -= n;
174
175 n = nls_to->uni2char(ch, output, olen);
176 if (n == -EINVAL)
177 n = write_unichar(ch, output, olen);
178 if (n < 0)
179 goto fail;
180 output += n;
181 olen -= n;
182
183 len += n;
184 }
185 return len;
186fail:
187 return n;
188}
189
190/* ----------------------------------------------------------- */
191
192/*
193 * nls_unicode
194 *
195 * This encodes/decodes little endian unicode format
196 */
197
198static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
199{
200 if (boundlen < 2)
201 return -EINVAL;
202 *out++ = uni & 0xff;
203 *out++ = uni >> 8;
204 return 2;
205}
206
207static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
208{
209 if (boundlen < 2)
210 return -EINVAL;
211 *uni = (rawstring[1] << 8) | rawstring[0];
212 return 2;
213}
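/*
 * Editorial note (not part of the original patch): this pair implements
 * the UCS-2 little-endian encoding SMB uses for Unicode strings on the
 * wire, low byte first.  Examples: 'A' (U+0041) becomes the bytes
 * 41 00, and U+263A becomes 3a 26; char2uni() reverses the same
 * two-byte transform.  Code points outside the Basic Multilingual
 * Plane are not representable in this table.
 */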
214
215static struct nls_table unicode_table = {
216 .charset = "unicode",
217 .uni2char = uni2char,
218 .char2uni = char2uni,
219};
220
221/* ----------------------------------------------------------- */
222
223static int setcodepage(struct nls_table **p, char *name)
224{
225 struct nls_table *nls;
226
227 if (!name || !*name) {
228 nls = NULL;
229 } else if ( (nls = load_nls(name)) == NULL) {
230 printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
231 return -EINVAL;
232 }
233
234 /* if already set, unload the previous one. */
235 if (*p && *p != &unicode_table)
236 unload_nls(*p);
237 *p = nls;
238
239 return 0;
240}
241
242/* Handles all changes to codepage settings. */
243int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
244{
245 int n = 0;
246
247 smb_lock_server(server);
248
249 /* Don't load any nls_* at all, if no remote is requested */
250 if (!*cp->remote_name)
251 goto out;
252
253 /* local */
254 n = setcodepage(&server->local_nls, cp->local_name);
255 if (n != 0)
256 goto out;
257
258 /* remote */
259 if (!strcmp(cp->remote_name, "unicode")) {
260 server->remote_nls = &unicode_table;
261 } else {
262 n = setcodepage(&server->remote_nls, cp->remote_name);
263 if (n != 0)
264 setcodepage(&server->local_nls, NULL);
265 }
266
267out:
268 if (server->local_nls != NULL && server->remote_nls != NULL)
269 server->ops->convert = convert_cp;
270 else
271 server->ops->convert = convert_memcpy;
272
273 smb_unlock_server(server);
274 return n;
275}
276
277
278/*****************************************************************************/
279/* */
280/* Encoding/Decoding section */
281/* */
282/*****************************************************************************/
283
284static __u8 *
285smb_encode_smb_length(__u8 * p, __u32 len)
286{
287 *p = 0;
288 *(p+1) = 0;
289 *(p+2) = (len & 0xFF00) >> 8;
290 *(p+3) = (len & 0xFF);
291 if (len > 0xFFFF)
292 {
293 *(p+1) = 1;
294 }
295 return p + 4;
296}
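/*
 * Editorial note (not part of the original patch): this writes the
 * four-byte NetBIOS session header in front of each SMB: byte 0 is the
 * message type (0), byte 1 carries bit 16 of the length, and bytes 2-3
 * hold the low 16 bits, most significant byte first.  Worked example
 * with an arbitrary length: len = 0x1abcd encodes as 00 01 ab cd, and
 * smb_len() further down recovers 0x1abcd from the same four bytes.
 */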
297
298/*
299 * smb_build_path: build the path to entry and name storing it in buf.
300 * The path returned will have the trailing '\0'.
301 */
302static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
303 int maxlen,
304 struct dentry *entry, struct qstr *name)
305{
306 unsigned char *path = buf;
307 int len;
308 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
309
310 if (maxlen < (2<<unicode))
311 return -ENAMETOOLONG;
312
313 if (maxlen > SMB_MAXPATHLEN + 1)
314 maxlen = SMB_MAXPATHLEN + 1;
315
316 if (entry == NULL)
317 goto test_name_and_out;
318
319 /*
 320 * If IS_ROOT, we don't have to walk at all.
321 */
322 if (IS_ROOT(entry) && !name) {
323 *path++ = '\\';
324 if (unicode) *path++ = '\0';
325 *path++ = '\0';
326 if (unicode) *path++ = '\0';
327 return path-buf;
328 }
329
330 /*
331 * Build the path string walking the tree backward from end to ROOT
332 * and store it in reversed order [see reverse_string()]
333 */
334 dget(entry);
335 spin_lock(&entry->d_lock);
336 while (!IS_ROOT(entry)) {
337 struct dentry *parent;
338
339 if (maxlen < (3<<unicode)) {
340 spin_unlock(&entry->d_lock);
341 dput(entry);
342 return -ENAMETOOLONG;
343 }
344
345 len = server->ops->convert(path, maxlen-2,
346 entry->d_name.name, entry->d_name.len,
347 server->local_nls, server->remote_nls);
348 if (len < 0) {
349 spin_unlock(&entry->d_lock);
350 dput(entry);
351 return len;
352 }
353 reverse_string(path, len);
354 path += len;
355 if (unicode) {
356 /* Note: reverse order */
357 *path++ = '\0';
358 maxlen--;
359 }
360 *path++ = '\\';
361 maxlen -= len+1;
362
363 parent = entry->d_parent;
364 dget(parent);
365 spin_unlock(&entry->d_lock);
366 dput(entry);
367 entry = parent;
368 spin_lock(&entry->d_lock);
369 }
370 spin_unlock(&entry->d_lock);
371 dput(entry);
372 reverse_string(buf, path-buf);
373
374 /* maxlen has space for at least one char */
375test_name_and_out:
376 if (name) {
377 if (maxlen < (3<<unicode))
378 return -ENAMETOOLONG;
379 *path++ = '\\';
380 if (unicode) {
381 *path++ = '\0';
382 maxlen--;
383 }
384 len = server->ops->convert(path, maxlen-2,
385 name->name, name->len,
386 server->local_nls, server->remote_nls);
387 if (len < 0)
388 return len;
389 path += len;
390 maxlen -= len+1;
391 }
392 /* maxlen has space for at least one char */
393 *path++ = '\0';
394 if (unicode) *path++ = '\0';
395 return path-buf;
396}
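/*
 * Editorial note (not part of the original patch): a worked example of
 * the reversed-order construction above, for a dentry "bc" under
 * directory "a" (names chosen for illustration).  Walking child to
 * root, each component is converted, reversed in place, and followed
 * by a backslash:
 *
 *   after "bc":  buf = "cb\"
 *   after "a":   buf = "cb\a\"
 *
 * One final reverse_string() over the whole buffer then yields
 * "\a\bc", the server-side path, before the terminating NUL (doubled
 * in the unicode case) is appended.
 */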
397
398static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
399 struct dentry *dir, struct qstr *name)
400{
401 int result;
402
403 result = smb_build_path(server, buf, maxlen, dir, name);
404 if (result < 0)
405 goto out;
406 if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
407 str_upper(buf, result);
408out:
409 return result;
410}
411
412/* encode_path for non-trans2 request SMBs */
413static int smb_simple_encode_path(struct smb_request *req, char **p,
414 struct dentry * entry, struct qstr * name)
415{
416 struct smb_sb_info *server = req->rq_server;
417 char *s = *p;
418 int res;
419 int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
420 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
421
422 if (!maxlen)
423 return -ENAMETOOLONG;
424 *s++ = 4; /* ASCII data format */
425
426 /*
 427	 * SMB Unicode strings must be 16-bit aligned relative to the start of
 428	 * the packet. If they are not, they must be padded with a 0.
429 */
430 if (unicode) {
431 int align = s - (char *)req->rq_buffer;
432 if (!(align & 1)) {
433 *s++ = '\0';
434 maxlen--;
435 }
436 }
437
438 res = smb_encode_path(server, s, maxlen-1, entry, name);
439 if (res < 0)
440 return res;
441 *p = s + res;
442 return 0;
443}
444
445/* The following are taken directly from msdos-fs */
446
447/* Linear day numbers of the respective 1sts in non-leap years. */
448
449static int day_n[] =
450{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
451 /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
452
453
454static time_t
455utc2local(struct smb_sb_info *server, time_t time)
456{
457 return time - server->opt.serverzone*60;
458}
459
460static time_t
461local2utc(struct smb_sb_info *server, time_t time)
462{
463 return time + server->opt.serverzone*60;
464}
465
466/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
467
468static time_t
469date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
470{
471 int month, year;
472 time_t secs;
473
 474 /* Subtract first and mask afterwards; otherwise the month
 475 underflows when date == 0. */
476 month = ((date >> 5) - 1) & 15;
477 year = date >> 9;
 478 secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + 3600 * (time >> 11)
 479 + 86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365
 480 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653);
 481 /* days since 1.1.1970, plus the leap day of 1980 */
482 return local2utc(server, secs);
483}
484
485
486/* Convert linear UNIX date to a MS-DOS time/date pair. */
487
488static void
489date_unix2dos(struct smb_sb_info *server,
490 int unix_date, __u16 *date, __u16 *time)
491{
492 int day, year, nl_day, month;
493
494 unix_date = utc2local(server, unix_date);
495 if (unix_date < 315532800)
496 unix_date = 315532800;
497
498 *time = (unix_date % 60) / 2 +
499 (((unix_date / 60) % 60) << 5) +
500 (((unix_date / 3600) % 24) << 11);
501
502 day = unix_date / 86400 - 3652;
503 year = day / 365;
504 if ((year + 3) / 4 + 365 * year > day)
505 year--;
506 day -= (year + 3) / 4 + 365 * year;
507 if (day == 59 && !(year & 3)) {
508 nl_day = day;
509 month = 2;
510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day)
514 break;
515 }
516 *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
517}
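
/*
 * Round-trip sketch (illustration only, assuming serverzone == 0 so
 * that utc2local()/local2utc() are identities): 1 Jan 1980 00:00:00 is
 * DOS date 0x0021 (day 1, month 1, year 0 since 1980), time 0x0000,
 * which is 315532800 in Unix time.
 */
#if 0
static void smb_dos_date_example(struct smb_sb_info *server)
{
	__u16 date, time;
	time_t t;

	t = date_dos2unix(server, 0x0021, 0x0000);
	/* expect t == 315532800 when server->opt.serverzone == 0 */
	date_unix2dos(server, t, &date, &time);
	/* expect date == 0x0021 and time == 0x0000 again */
}
#endif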
518
519/* The following are taken from fs/ntfs/util.c */
520
521#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
522
523/*
524 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
525 * into Unix UTC (based 1970-01-01, in seconds).
526 */
527static struct timespec
528smb_ntutc2unixutc(u64 ntutc)
529{
530 struct timespec ts;
531 /* FIXME: what about the timezone difference? */
532 /* Subtract the NTFS time offset, then convert to 1s intervals. */
533 u64 t = ntutc - NTFS_TIME_OFFSET;
534 ts.tv_nsec = do_div(t, 10000000) * 100;
535 ts.tv_sec = t;
536 return ts;
537}
538
539/* Convert the Unix UTC into NT time */
540static u64
541smb_unixutc2ntutc(struct timespec ts)
542{
543 /* Note: timezone conversion is probably wrong. */
544 /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
545 return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
546}
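
/*
 * Sketch (illustration only): NTFS_TIME_OFFSET is 1970-01-01 expressed
 * on the NT scale (369 years plus 89 leap days, in 100ns units), so it
 * must map to the Unix epoch and round-trip unchanged.
 */
#if 0
static void smb_nttime_example(void)
{
	struct timespec ts = smb_ntutc2unixutc(NTFS_TIME_OFFSET);
	u64 nt = smb_unixutc2ntutc(ts);
	/* expect ts.tv_sec == 0, ts.tv_nsec == 0, nt == NTFS_TIME_OFFSET */
}
#endif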
547
548#define MAX_FILE_MODE 6
549static mode_t file_mode[] = {
550 S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
551};
552
553static int smb_filetype_to_mode(u32 filetype)
554{
555 if (filetype > MAX_FILE_MODE) {
556 PARANOIA("Filetype out of range: %d\n", filetype);
557 return S_IFREG;
558 }
559 return file_mode[filetype];
560}
561
562static u32 smb_filetype_from_mode(int mode)
563{
564 if (S_ISREG(mode))
565 return UNIX_TYPE_FILE;
566 if (S_ISDIR(mode))
567 return UNIX_TYPE_DIR;
568 if (S_ISLNK(mode))
569 return UNIX_TYPE_SYMLINK;
570 if (S_ISCHR(mode))
571 return UNIX_TYPE_CHARDEV;
572 if (S_ISBLK(mode))
573 return UNIX_TYPE_BLKDEV;
574 if (S_ISFIFO(mode))
575 return UNIX_TYPE_FIFO;
576 if (S_ISSOCK(mode))
577 return UNIX_TYPE_SOCKET;
578 return UNIX_TYPE_UNKNOWN;
579}
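
/*
 * The two helpers above are inverses over the known types, assuming
 * the UNIX_TYPE_* constants follow the order of file_mode[] (i.e.
 * UNIX_TYPE_FILE == 0, UNIX_TYPE_DIR == 1, and so on). Sketch:
 */
#if 0
static void smb_filetype_example(void)
{
	u32 t = smb_filetype_from_mode(S_IFDIR | 0755);	/* UNIX_TYPE_DIR */
	mode_t m = smb_filetype_to_mode(t);		/* back to S_IFDIR */
}
#endif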
580
581
582/*****************************************************************************/
583/* */
584/* Support section. */
585/* */
586/*****************************************************************************/
587
588__u32
589smb_len(__u8 * p)
590{
591 return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
592}
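
/*
 * Example (illustration only): the NetBIOS session header keeps a
 * 17-bit payload length in the low bit of byte 1 plus bytes 2-3,
 * big-endian.
 */
#if 0
static void smb_len_example(void)
{
	__u8 hdr[4] = { 0x00, 0x01, 0x00, 0x2c };
	/* smb_len(hdr) == 0x1002c == 65580 */
}
#endif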
593
594static __u16
595smb_bcc(__u8 * packet)
596{
597 int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
598 return WVAL(packet, pos);
599}
600
 601 /* smb_valid_packet: check whether the packet fulfills the basic
 602 requirements of an SMB packet */
603
604static int
605smb_valid_packet(__u8 * packet)
606{
607 return (packet[4] == 0xff
608 && packet[5] == 'S'
609 && packet[6] == 'M'
610 && packet[7] == 'B'
611 && (smb_len(packet) + 4 == SMB_HEADER_LEN
612 + SMB_WCT(packet) * 2 + smb_bcc(packet)));
613}
614
615/* smb_verify: We check if we got the answer we expected, and if we
616 got enough data. If bcc == -1, we don't care. */
617
618static int
619smb_verify(__u8 * packet, int command, int wct, int bcc)
620{
621 if (SMB_CMD(packet) != command)
622 goto bad_command;
623 if (SMB_WCT(packet) < wct)
624 goto bad_wct;
625 if (bcc != -1 && smb_bcc(packet) < bcc)
626 goto bad_bcc;
627 return 0;
628
629bad_command:
630 printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
631 command, SMB_CMD(packet));
632 goto fail;
633bad_wct:
634 printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
635 command, wct, SMB_WCT(packet));
636 goto fail;
637bad_bcc:
638 printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
639 command, bcc, smb_bcc(packet));
640fail:
641 return -EIO;
642}
643
644/*
 645 * Returns the maximum read or write size for the "payload", such that
 646 * the whole packet fits within the negotiated max_xmit size.
647 *
648 * N.B. Since this value is usually computed before locking the server,
649 * the server's packet size must never be decreased!
650 */
651static inline int
652smb_get_xmitsize(struct smb_sb_info *server, int overhead)
653{
654 return server->opt.max_xmit - overhead;
655}
656
657/*
658 * Calculate the maximum read size
659 */
660int
661smb_get_rsize(struct smb_sb_info *server)
662{
663 /* readX has 12 parameters, read has 5 */
664 int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
665 int size = smb_get_xmitsize(server, overhead);
666
667 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
668
669 return size;
670}
671
672/*
673 * Calculate the maximum write size
674 */
675int
676smb_get_wsize(struct smb_sb_info *server)
677{
678 /* writeX has 14 parameters, write has 5 */
679 int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
680 int size = smb_get_xmitsize(server, overhead);
681
682 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
683
684 return size;
685}
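
/*
 * Arithmetic sketch (assuming SMB_HEADER_LEN is 37): readX overhead is
 * 37 + 12*2 + 2 + 1 + 2 = 66 bytes, so a negotiated max_xmit of 4356
 * gives an rsize of 4290; writeX carries two more parameter words,
 * giving a wsize of 4286.
 */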
686
687/*
688 * Convert SMB error codes to -E... errno values.
689 */
690int
691smb_errno(struct smb_request *req)
692{
693 int errcls = req->rq_rcls;
694 int error = req->rq_err;
695 char *class = "Unknown";
696
697 VERBOSE("errcls %d code %d from command 0x%x\n",
698 errcls, error, SMB_CMD(req->rq_header));
699
700 if (errcls == ERRDOS) {
701 switch (error) {
702 case ERRbadfunc:
703 return -EINVAL;
704 case ERRbadfile:
705 case ERRbadpath:
706 return -ENOENT;
707 case ERRnofids:
708 return -EMFILE;
709 case ERRnoaccess:
710 return -EACCES;
711 case ERRbadfid:
712 return -EBADF;
713 case ERRbadmcb:
714 return -EREMOTEIO;
715 case ERRnomem:
716 return -ENOMEM;
717 case ERRbadmem:
718 return -EFAULT;
719 case ERRbadenv:
720 case ERRbadformat:
721 return -EREMOTEIO;
722 case ERRbadaccess:
723 return -EACCES;
724 case ERRbaddata:
725 return -E2BIG;
726 case ERRbaddrive:
727 return -ENXIO;
728 case ERRremcd:
729 return -EREMOTEIO;
730 case ERRdiffdevice:
731 return -EXDEV;
732 case ERRnofiles:
733 return -ENOENT;
734 case ERRbadshare:
735 return -ETXTBSY;
736 case ERRlock:
737 return -EDEADLK;
738 case ERRfilexists:
739 return -EEXIST;
740 case ERROR_INVALID_PARAMETER:
741 return -EINVAL;
742 case ERROR_DISK_FULL:
743 return -ENOSPC;
744 case ERROR_INVALID_NAME:
745 return -ENOENT;
746 case ERROR_DIR_NOT_EMPTY:
747 return -ENOTEMPTY;
748 case ERROR_NOT_LOCKED:
749 return -ENOLCK;
750 case ERROR_ALREADY_EXISTS:
751 return -EEXIST;
752 default:
753 class = "ERRDOS";
754 goto err_unknown;
755 }
756 } else if (errcls == ERRSRV) {
757 switch (error) {
758 /* N.B. This is wrong ... EIO ? */
759 case ERRerror:
760 return -ENFILE;
761 case ERRbadpw:
762 return -EINVAL;
763 case ERRbadtype:
764 case ERRtimeout:
765 return -EIO;
766 case ERRaccess:
767 return -EACCES;
768 /*
769 * This is a fatal error, as it means the "tree ID"
770 * for this connection is no longer valid. We map
771 * to a special error code and get a new connection.
772 */
773 case ERRinvnid:
774 return -EBADSLT;
775 default:
776 class = "ERRSRV";
777 goto err_unknown;
778 }
779 } else if (errcls == ERRHRD) {
780 switch (error) {
781 case ERRnowrite:
782 return -EROFS;
783 case ERRbadunit:
784 return -ENODEV;
785 case ERRnotready:
786 return -EUCLEAN;
787 case ERRbadcmd:
788 case ERRdata:
789 return -EIO;
790 case ERRbadreq:
791 return -ERANGE;
792 case ERRbadshare:
793 return -ETXTBSY;
794 case ERRlock:
795 return -EDEADLK;
796 case ERRdiskfull:
797 return -ENOSPC;
798 default:
799 class = "ERRHRD";
800 goto err_unknown;
801 }
802 } else if (errcls == ERRCMD) {
803 class = "ERRCMD";
804 } else if (errcls == SUCCESS) {
805 return 0; /* This is the only valid 0 return */
806 }
807
808err_unknown:
809 printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
810 class, error, SMB_CMD(req->rq_header));
811 return -EIO;
812}
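
/*
 * Example: an ERRDOS/ERRbadfile reply maps to -ENOENT above, while
 * ERRSRV/ERRinvnid maps to the special -EBADSLT so the caller can set
 * up a new connection.
 */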
813
814/* smb_request_ok: We expect the server to be locked. Then we do the
815 request and check the answer completely. When smb_request_ok
816 returns 0, you can be quite sure that everything went well. When
 817 the answer is < 0, the returned number is a valid unix errno. */
818
819static int
820smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
821{
822 int result;
823
824 req->rq_resp_wct = wct;
825 req->rq_resp_bcc = bcc;
826
827 result = smb_add_request(req);
828 if (result != 0) {
829 DEBUG1("smb_request failed\n");
830 goto out;
831 }
832
 833 if (!smb_valid_packet(req->rq_header)) {
 834 PARANOIA("invalid packet!\n");
 835 result = -EIO;
 836 goto out;
 837 }
838 result = smb_verify(req->rq_header, command, wct, bcc);
839
840out:
841 return result;
842}
843
844/*
845 * This implements the NEWCONN ioctl. It installs the server pid,
846 * sets server->state to CONN_VALID, and wakes up the waiting process.
847 */
848int
849smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
850{
851 struct file *filp;
852 struct sock *sk;
853 int error;
854
855 VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
856
857 smb_lock_server(server);
858
859 /*
860 * Make sure we don't already have a valid connection ...
861 */
862 error = -EINVAL;
863 if (server->state == CONN_VALID)
864 goto out;
865
866 error = -EACCES;
867 if (current_uid() != server->mnt->mounted_uid &&
868 !capable(CAP_SYS_ADMIN))
869 goto out;
870
871 error = -EBADF;
872 filp = fget(opt->fd);
873 if (!filp)
874 goto out;
875 if (!smb_valid_socket(filp->f_path.dentry->d_inode))
876 goto out_putf;
877
878 server->sock_file = filp;
879 server->conn_pid = get_pid(task_pid(current));
880 server->opt = *opt;
881 server->generation += 1;
882 server->state = CONN_VALID;
883 error = 0;
884
885 if (server->conn_error) {
 886 /*
 887 * conn_error is the return code that originally made us drop
 888 * the old connection. This message should sound positive, so
 889 * it doesn't make people ask why smbfs is printing error
 890 * messages ...
 891 */
892 printk(KERN_INFO "SMB connection re-established (%d)\n",
893 server->conn_error);
894 server->conn_error = 0;
895 }
896
897 /*
898 * Store the server in sock user_data (Only used by sunrpc)
899 */
900 sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
901 sk->sk_user_data = server;
902
903 /* chain into the data_ready callback */
904 server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
905
906 /* check if we have an old smbmount that uses seconds for the
907 serverzone */
908 if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
909 server->opt.serverzone /= 60;
910
911 /* now that we have an established connection we can detect the server
912 type and enable bug workarounds */
913 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
914 install_ops(server->ops, &smb_ops_core);
915 else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
916 install_ops(server->ops, &smb_ops_os2);
917 else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
918 (server->opt.max_xmit < 0x1000) &&
919 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
920 /* FIXME: can we kill the WIN95 flag now? */
921 server->mnt->flags |= SMB_MOUNT_WIN95;
922 VERBOSE("detected WIN95 server\n");
923 install_ops(server->ops, &smb_ops_win95);
924 } else {
925 /*
926 * Samba has max_xmit 65535
927 * NT4spX has max_xmit 4536 (or something like that)
928 * win2k has ...
929 */
930 VERBOSE("detected NT1 (Samba, NT4/5) server\n");
931 install_ops(server->ops, &smb_ops_winNT);
932 }
933
934 /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
935 if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
936 server->ops->getattr = smb_proc_getattr_core;
937 } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
938 server->ops->getattr = smb_proc_getattr_ff;
939 }
940
941 /* Decode server capabilities */
942 if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
943 /* Should be ok to set this now, as no one can access the
944 mount until the connection has been established. */
945 SB_of(server)->s_maxbytes = ~0ULL >> 1;
946 VERBOSE("LFS enabled\n");
947 }
948 if (server->opt.capabilities & SMB_CAP_UNICODE) {
949 server->mnt->flags |= SMB_MOUNT_UNICODE;
950 VERBOSE("Unicode enabled\n");
951 } else {
952 server->mnt->flags &= ~SMB_MOUNT_UNICODE;
953 }
954#if 0
955 /* flags we may test for other patches ... */
956 if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
957 VERBOSE("Large reads enabled\n");
958 }
959 if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
960 VERBOSE("Large writes enabled\n");
961 }
962#endif
963 if (server->opt.capabilities & SMB_CAP_UNIX) {
964 struct inode *inode;
965 VERBOSE("Using UNIX CIFS extensions\n");
966 install_ops(server->ops, &smb_ops_unix);
967 inode = SB_of(server)->s_root->d_inode;
968 if (inode)
969 inode->i_op = &smb_dir_inode_operations_unix;
970 }
971
972 VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
973 server->opt.protocol, server->opt.max_xmit,
974 pid_nr(server->conn_pid), server->opt.capabilities);
975
976 /* FIXME: this really should be done by smbmount. */
977 if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
978 server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
979 }
980
981 smb_unlock_server(server);
982 smbiod_wake_up();
983 if (server->opt.capabilities & SMB_CAP_UNIX)
984 smb_proc_query_cifsunix(server);
985
986 server->conn_complete++;
987 wake_up_interruptible_all(&server->conn_wq);
988 return error;
989
990out:
991 smb_unlock_server(server);
992 smbiod_wake_up();
993 return error;
994
995out_putf:
996 fput(filp);
997 goto out;
998}
999
1000/* smb_setup_header: We completely set up the packet. You only have to
1001 insert the command-specific fields */
1002
1003__u8 *
1004smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
1005{
1006 __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
1007 __u8 *p = req->rq_header;
1008 struct smb_sb_info *server = req->rq_server;
1009
1010 p = smb_encode_smb_length(p, xmit_len - 4);
1011
1012 *p++ = 0xff;
1013 *p++ = 'S';
1014 *p++ = 'M';
1015 *p++ = 'B';
1016 *p++ = command;
1017
 1018 memset(p, '\0', 19); /* zero error class/code, flg, flg2 and the reserved bytes */
 1019 p += 19;
 1020 p += 8; /* skip tid, pid, uid and mid */
1021
1022 if (server->opt.protocol > SMB_PROTOCOL_CORE) {
1023 int flags = SMB_FLAGS_CASELESS_PATHNAMES;
1024 int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
1025 SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */
1026
1027 *(req->rq_header + smb_flg) = flags;
1028 if (server->mnt->flags & SMB_MOUNT_UNICODE)
1029 flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
1030 WSET(req->rq_header, smb_flg2, flags2);
1031 }
1032 *p++ = wct; /* wct */
1033 p += 2 * wct;
1034 WSET(p, 0, bcc);
1035
1036 /* Include the header in the data to send */
1037 req->rq_iovlen = 1;
1038 req->rq_iov[0].iov_base = req->rq_header;
1039 req->rq_iov[0].iov_len = xmit_len - bcc;
1040
1041 return req->rq_buffer;
1042}
1043
1044static void
1045smb_setup_bcc(struct smb_request *req, __u8 *p)
1046{
1047 u16 bcc = p - req->rq_buffer;
1048 u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
1049
1050 WSET(pbcc, 0, bcc);
1051
1052 smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN +
1053 2*SMB_WCT(req->rq_header) - 2 + bcc);
1054
1055 /* Include the "bytes" in the data to send */
1056 req->rq_iovlen = 2;
1057 req->rq_iov[1].iov_base = req->rq_buffer;
1058 req->rq_iov[1].iov_len = bcc;
1059}
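
/*
 * Typical use of the two helpers above (sketch only): set up the
 * header with the word count, fill in the vwv words, append the data
 * bytes, then let smb_setup_bcc() patch in the byte count and the
 * NetBIOS length. smb_proc_open() below follows this pattern.
 */
#if 0
static void smb_request_pattern(struct smb_request *req)
{
	__u8 *p = smb_setup_header(req, SMBopen, 2, 0);

	WSET(req->rq_header, smb_vwv0, 0x42);	/* e.g. open r/w */
	/* ... append the "bytes" section at p ... */
	smb_setup_bcc(req, p);
}
#endif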
1060
1061static int
1062smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
1063 __u16 mode, off_t offset)
1064{
1065 int result;
1066 struct smb_request *req;
1067
1068 result = -ENOMEM;
1069 if (! (req = smb_alloc_request(server, 0)))
1070 goto out;
1071
1072 smb_setup_header(req, SMBlseek, 4, 0);
1073 WSET(req->rq_header, smb_vwv0, fileid);
1074 WSET(req->rq_header, smb_vwv1, mode);
1075 DSET(req->rq_header, smb_vwv2, offset);
1076 req->rq_flags |= SMB_REQ_NORETRY;
1077
1078 result = smb_request_ok(req, SMBlseek, 2, 0);
1079 if (result < 0) {
1080 result = 0;
1081 goto out_free;
1082 }
1083
1084 result = DVAL(req->rq_header, smb_vwv0);
1085out_free:
1086 smb_rput(req);
1087out:
1088 return result;
1089}
1090
1091static int
1092smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
1093{
1094 struct inode *ino = dentry->d_inode;
1095 struct smb_inode_info *ei = SMB_I(ino);
1096 int mode, read_write = 0x42, read_only = 0x40;
1097 int res;
1098 char *p;
1099 struct smb_request *req;
1100
1101 /*
1102 * Attempt to open r/w, unless there are no write privileges.
1103 */
1104 mode = read_write;
1105 if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
1106 mode = read_only;
1107#if 0
 1108 /* FIXME: why is this code disabled? Below we make sure a caller
 1109 wanting RO doesn't get RW. smb_revalidate_inode does some
 1110 optimization based on the access mode; tail -f needs it to be correct.
 1111
 1112 We must open r/w since we don't repeat the open when called a second
 1113 time with a different 'wish'. Is that not supported by SMB servers? */
1114 if (!(wish & (O_WRONLY | O_RDWR)))
1115 mode = read_only;
1116#endif
1117
1118 res = -ENOMEM;
1119 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1120 goto out;
1121
1122 retry:
1123 p = smb_setup_header(req, SMBopen, 2, 0);
1124 WSET(req->rq_header, smb_vwv0, mode);
1125 WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
1126 res = smb_simple_encode_path(req, &p, dentry, NULL);
1127 if (res < 0)
1128 goto out_free;
1129 smb_setup_bcc(req, p);
1130
1131 res = smb_request_ok(req, SMBopen, 7, 0);
1132 if (res != 0) {
1133 if (mode == read_write &&
1134 (res == -EACCES || res == -ETXTBSY || res == -EROFS))
1135 {
1136 VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
1137 DENTRY_PATH(dentry), res);
1138 mode = read_only;
1139 req->rq_flags = 0;
1140 goto retry;
1141 }
1142 goto out_free;
1143 }
1144 /* We should now have data in vwv[0..6]. */
1145
1146 ei->fileid = WVAL(req->rq_header, smb_vwv0);
1147 ei->attr = WVAL(req->rq_header, smb_vwv1);
1148 /* smb_vwv2 has mtime */
1149 /* smb_vwv4 has size */
1150 ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
1151 ei->open = server->generation;
1152
1153out_free:
1154 smb_rput(req);
1155out:
1156 return res;
1157}
1158
1159/*
 1160 * Make sure the file is open, and check that the granted access
 1161 * is compatible with the desired access mode.
1162 */
1163int
1164smb_open(struct dentry *dentry, int wish)
1165{
1166 struct inode *inode = dentry->d_inode;
1167 int result;
1168 __u16 access;
1169
1170 result = -ENOENT;
1171 if (!inode) {
1172 printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
1173 DENTRY_PATH(dentry));
1174 goto out;
1175 }
1176
1177 if (!smb_is_open(inode)) {
1178 struct smb_sb_info *server = server_from_inode(inode);
1179 result = 0;
1180 if (!smb_is_open(inode))
1181 result = smb_proc_open(server, dentry, wish);
1182 if (result)
1183 goto out;
1184 /*
1185 * A successful open means the path is still valid ...
1186 */
1187 smb_renew_times(dentry);
1188 }
1189
1190 /*
1191 * Check whether the access is compatible with the desired mode.
1192 */
1193 result = 0;
1194 access = SMB_I(inode)->access;
1195 if (access != wish && access != SMB_O_RDWR) {
1196 PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
1197 DENTRY_PATH(dentry), access, wish);
1198 result = -EACCES;
1199 }
1200out:
1201 return result;
1202}
1203
1204static int
1205smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
1206{
1207 struct smb_request *req;
1208 int result = -ENOMEM;
1209
1210 if (! (req = smb_alloc_request(server, 0)))
1211 goto out;
1212
1213 smb_setup_header(req, SMBclose, 3, 0);
1214 WSET(req->rq_header, smb_vwv0, fileid);
1215 DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
1216 req->rq_flags |= SMB_REQ_NORETRY;
1217 result = smb_request_ok(req, SMBclose, 0, 0);
1218
1219 smb_rput(req);
1220out:
1221 return result;
1222}
1223
1224/*
1225 * Win NT 4.0 has an apparent bug in that it fails to update the
1226 * modify time when writing to a file. As a workaround, we update
1227 * both modify and access time locally, and post the times to the
1228 * server when closing the file.
1229 */
1230static int
1231smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
1232{
1233 struct smb_inode_info *ei = SMB_I(ino);
1234 int result = 0;
1235 if (smb_is_open(ino))
1236 {
1237 /*
1238 * We clear the open flag in advance, in case another
1239 * process observes the value while we block below.
1240 */
1241 ei->open = 0;
1242
1243 /*
1244 * Kludge alert: SMB timestamps are accurate only to
1245 * two seconds ... round the times to avoid needless
1246 * cache invalidations!
1247 */
1248 if (ino->i_mtime.tv_sec & 1) {
1249 ino->i_mtime.tv_sec--;
1250 ino->i_mtime.tv_nsec = 0;
1251 }
1252 if (ino->i_atime.tv_sec & 1) {
1253 ino->i_atime.tv_sec--;
1254 ino->i_atime.tv_nsec = 0;
1255 }
1256 /*
1257 * If the file is open with write permissions,
1258 * update the time stamps to sync mtime and atime.
1259 */
1260 if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
1261 (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
1262 !(ei->access == SMB_O_RDONLY))
1263 {
1264 struct smb_fattr fattr;
1265 smb_get_inode_attr(ino, &fattr);
1266 smb_proc_setattr_ext(server, ino, &fattr);
1267 }
1268
1269 result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
1270 /*
1271 * Force a revalidation after closing ... some servers
1272 * don't post the size until the file has been closed.
1273 */
1274 if (server->opt.protocol < SMB_PROTOCOL_NT1)
1275 ei->oldmtime = 0;
1276 ei->closed = jiffies;
1277 }
1278 return result;
1279}
1280
1281int
1282smb_close(struct inode *ino)
1283{
1284 int result = 0;
1285
1286 if (smb_is_open(ino)) {
1287 struct smb_sb_info *server = server_from_inode(ino);
1288 result = smb_proc_close_inode(server, ino);
1289 }
1290 return result;
1291}
1292
1293/*
1294 * This is used to close a file following a failed instantiate.
1295 * Since we don't have an inode, we can't use any of the above.
1296 */
1297int
1298smb_close_fileid(struct dentry *dentry, __u16 fileid)
1299{
1300 struct smb_sb_info *server = server_from_dentry(dentry);
1301 int result;
1302
1303 result = smb_proc_close(server, fileid, get_seconds());
1304 return result;
1305}
1306
1307/* In smb_proc_read and smb_proc_write we do not retry, because the
1308 file-id would not be valid after a reconnection. */
1309
1310static void
1311smb_proc_read_data(struct smb_request *req)
1312{
1313 req->rq_iov[0].iov_base = req->rq_buffer;
1314 req->rq_iov[0].iov_len = 3;
1315
1316 req->rq_iov[1].iov_base = req->rq_page;
1317 req->rq_iov[1].iov_len = req->rq_rsize;
1318 req->rq_iovlen = 2;
1319
1320 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1321}
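
/*
 * Receive-side note: the SMBread reply carries three format bytes (a
 * data block token plus a 16-bit length) ahead of the payload, so the
 * callback above splits the receive into a 3-byte head landing in
 * rq_buffer and the payload landing directly in the caller's page.
 */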
1322
1323static int
1324smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
1325{
1326 struct smb_sb_info *server = server_from_inode(inode);
1327 __u16 returned_count, data_len;
1328 unsigned char *buf;
1329 int result;
1330 struct smb_request *req;
1331 u8 rbuf[4];
1332
1333 result = -ENOMEM;
1334 if (! (req = smb_alloc_request(server, 0)))
1335 goto out;
1336
1337 smb_setup_header(req, SMBread, 5, 0);
1338 buf = req->rq_header;
1339 WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
1340 WSET(buf, smb_vwv1, count);
1341 DSET(buf, smb_vwv2, offset);
1342 WSET(buf, smb_vwv4, 0);
1343
1344 req->rq_page = data;
1345 req->rq_rsize = count;
1346 req->rq_callback = smb_proc_read_data;
1347 req->rq_buffer = rbuf;
1348 req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
1349
1350 result = smb_request_ok(req, SMBread, 5, -1);
1351 if (result < 0)
1352 goto out_free;
1353 returned_count = WVAL(req->rq_header, smb_vwv0);
1354
1355 data_len = WVAL(rbuf, 1);
1356
1357 if (returned_count != data_len) {
1358 printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
1359 printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
1360 returned_count, data_len);
1361 }
1362 result = data_len;
1363
1364out_free:
1365 smb_rput(req);
1366out:
1367 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1368 inode->i_ino, SMB_I(inode)->fileid, count, result);
1369 return result;
1370}
1371
1372static int
1373smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
1374{
1375 struct smb_sb_info *server = server_from_inode(inode);
1376 int result;
1377 u16 fileid = SMB_I(inode)->fileid;
1378 u8 buf[4];
1379 struct smb_request *req;
1380
1381 result = -ENOMEM;
1382 if (! (req = smb_alloc_request(server, 0)))
1383 goto out;
1384
1385 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1386 inode->i_ino, fileid, count, offset);
1387
1388 smb_setup_header(req, SMBwrite, 5, count + 3);
1389 WSET(req->rq_header, smb_vwv0, fileid);
1390 WSET(req->rq_header, smb_vwv1, count);
1391 DSET(req->rq_header, smb_vwv2, offset);
1392 WSET(req->rq_header, smb_vwv4, 0);
1393
1394 buf[0] = 1;
1395 WSET(buf, 1, count); /* yes, again ... */
1396 req->rq_iov[1].iov_base = buf;
1397 req->rq_iov[1].iov_len = 3;
1398 req->rq_iov[2].iov_base = (char *) data;
1399 req->rq_iov[2].iov_len = count;
1400 req->rq_iovlen = 3;
1401 req->rq_flags |= SMB_REQ_NORETRY;
1402
1403 result = smb_request_ok(req, SMBwrite, 1, 0);
1404 if (result >= 0)
1405 result = WVAL(req->rq_header, smb_vwv0);
1406
1407 smb_rput(req);
1408out:
1409 return result;
1410}
1411
1412/*
1413 * In smb_proc_readX and smb_proc_writeX we do not retry, because the
1414 * file-id would not be valid after a reconnection.
1415 */
1416
1417#define SMB_READX_MAX_PAD 64
1418static void
1419smb_proc_readX_data(struct smb_request *req)
1420{
1421 /* header length, excluding the netbios length (-4) */
1422 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
1423 int data_off = WVAL(req->rq_header, smb_vwv6);
1424
1425 /*
1426 * Some genius made the padding to the data bytes arbitrary.
1427 * So we must first calculate the amount of padding used by the server.
1428 */
1429 data_off -= hdrlen;
1430 if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
1431 PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
1432 PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
1433 req->rq_rlen = req->rq_bufsize + 1;
1434 return;
1435 }
1436 req->rq_iov[0].iov_base = req->rq_buffer;
1437 req->rq_iov[0].iov_len = data_off;
1438
1439 req->rq_iov[1].iov_base = req->rq_page;
1440 req->rq_iov[1].iov_len = req->rq_rsize;
1441 req->rq_iovlen = 2;
1442
1443 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1444}
1445
1446static int
1447smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
1448{
1449 struct smb_sb_info *server = server_from_inode(inode);
1450 unsigned char *buf;
1451 int result;
1452 struct smb_request *req;
1453 static char pad[SMB_READX_MAX_PAD];
1454
1455 result = -ENOMEM;
1456 if (! (req = smb_alloc_request(server, 0)))
1457 goto out;
1458
1459 smb_setup_header(req, SMBreadX, 12, 0);
1460 buf = req->rq_header;
1461 WSET(buf, smb_vwv0, 0x00ff);
1462 WSET(buf, smb_vwv1, 0);
1463 WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
1464 DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */
1465 WSET(buf, smb_vwv5, count);
1466 WSET(buf, smb_vwv6, 0);
1467 DSET(buf, smb_vwv7, 0);
1468 WSET(buf, smb_vwv9, 0);
1469 DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */
1470 WSET(buf, smb_vwv11, 0);
1471
1472 req->rq_page = data;
1473 req->rq_rsize = count;
1474 req->rq_callback = smb_proc_readX_data;
1475 req->rq_buffer = pad;
1476 req->rq_bufsize = SMB_READX_MAX_PAD;
1477 req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
1478
1479 result = smb_request_ok(req, SMBreadX, 12, -1);
1480 if (result < 0)
1481 goto out_free;
1482 result = WVAL(req->rq_header, smb_vwv5);
1483
1484out_free:
1485 smb_rput(req);
1486out:
1487 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1488 inode->i_ino, SMB_I(inode)->fileid, count, result);
1489 return result;
1490}
1491
1492static int
1493smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
1494{
1495 struct smb_sb_info *server = server_from_inode(inode);
1496 int result;
1497 u8 *p;
1498 static u8 pad[4];
1499 struct smb_request *req;
1500
1501 result = -ENOMEM;
1502 if (! (req = smb_alloc_request(server, 0)))
1503 goto out;
1504
1505 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1506 inode->i_ino, SMB_I(inode)->fileid, count, offset);
1507
1508 p = smb_setup_header(req, SMBwriteX, 14, count + 1);
1509 WSET(req->rq_header, smb_vwv0, 0x00ff);
1510 WSET(req->rq_header, smb_vwv1, 0);
1511 WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
1512 DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */
1513 DSET(req->rq_header, smb_vwv5, 0);
1514 WSET(req->rq_header, smb_vwv7, 0); /* write mode */
1515 WSET(req->rq_header, smb_vwv8, 0);
1516 WSET(req->rq_header, smb_vwv9, 0);
1517 WSET(req->rq_header, smb_vwv10, count); /* data length */
1518 WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
1519 DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
1520
1521 req->rq_iov[1].iov_base = pad;
1522 req->rq_iov[1].iov_len = 1;
1523 req->rq_iov[2].iov_base = (char *) data;
1524 req->rq_iov[2].iov_len = count;
1525 req->rq_iovlen = 3;
1526 req->rq_flags |= SMB_REQ_NORETRY;
1527
1528 result = smb_request_ok(req, SMBwriteX, 6, 0);
1529 if (result >= 0)
1530 result = WVAL(req->rq_header, smb_vwv2);
1531
1532 smb_rput(req);
1533out:
1534 return result;
1535}
1536
1537int
1538smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
1539{
1540 struct smb_sb_info *server = server_from_dentry(dentry);
1541 char *p;
1542 int result;
1543 struct smb_request *req;
1544
1545 result = -ENOMEM;
1546 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1547 goto out;
1548
1549 p = smb_setup_header(req, SMBcreate, 3, 0);
1550 WSET(req->rq_header, smb_vwv0, attr);
1551 DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
1552 result = smb_simple_encode_path(req, &p, dentry, NULL);
1553 if (result < 0)
1554 goto out_free;
1555 smb_setup_bcc(req, p);
1556
1557 result = smb_request_ok(req, SMBcreate, 1, 0);
1558 if (result < 0)
1559 goto out_free;
1560
1561 *fileid = WVAL(req->rq_header, smb_vwv0);
1562 result = 0;
1563
1564out_free:
1565 smb_rput(req);
1566out:
1567 return result;
1568}
1569
1570int
1571smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
1572{
1573 struct smb_sb_info *server = server_from_dentry(old_dentry);
1574 char *p;
1575 int result;
1576 struct smb_request *req;
1577
1578 result = -ENOMEM;
1579 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1580 goto out;
1581
1582 p = smb_setup_header(req, SMBmv, 1, 0);
1583 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
1584 result = smb_simple_encode_path(req, &p, old_dentry, NULL);
1585 if (result < 0)
1586 goto out_free;
1587 result = smb_simple_encode_path(req, &p, new_dentry, NULL);
1588 if (result < 0)
1589 goto out_free;
1590 smb_setup_bcc(req, p);
1591
1592 if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
1593 goto out_free;
1594 result = 0;
1595
1596out_free:
1597 smb_rput(req);
1598out:
1599 return result;
1600}
1601
1602/*
1603 * Code common to mkdir and rmdir.
1604 */
1605static int
1606smb_proc_generic_command(struct dentry *dentry, __u8 command)
1607{
1608 struct smb_sb_info *server = server_from_dentry(dentry);
1609 char *p;
1610 int result;
1611 struct smb_request *req;
1612
1613 result = -ENOMEM;
1614 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1615 goto out;
1616
1617 p = smb_setup_header(req, command, 0, 0);
1618 result = smb_simple_encode_path(req, &p, dentry, NULL);
1619 if (result < 0)
1620 goto out_free;
1621 smb_setup_bcc(req, p);
1622
1623 result = smb_request_ok(req, command, 0, 0);
1624 if (result < 0)
1625 goto out_free;
1626 result = 0;
1627
1628out_free:
1629 smb_rput(req);
1630out:
1631 return result;
1632}
1633
1634int
1635smb_proc_mkdir(struct dentry *dentry)
1636{
1637 return smb_proc_generic_command(dentry, SMBmkdir);
1638}
1639
1640int
1641smb_proc_rmdir(struct dentry *dentry)
1642{
1643 return smb_proc_generic_command(dentry, SMBrmdir);
1644}
1645
1646#if SMBFS_POSIX_UNLINK
1647/*
 1648 * Removes the read-only attribute from a file. Used by unlink() to
 1649 * give POSIX semantics.
1650 */
1651static int
1652smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
1653{
1654 int result;
1655 struct smb_fattr fattr;
1656
1657 /* FIXME: cifsUE should allow removing a readonly file. */
1658
1659 /* first get current attribute */
1660 smb_init_dirent(server, &fattr);
1661 result = server->ops->getattr(server, dentry, &fattr);
1662 smb_finish_dirent(server, &fattr);
1663 if (result < 0)
1664 return result;
1665
1666 /* if RONLY attribute is set, remove it */
1667 if (fattr.attr & aRONLY) { /* read only attribute is set */
1668 fattr.attr &= ~aRONLY;
1669 result = smb_proc_setattr_core(server, dentry, fattr.attr);
1670 }
1671 return result;
1672}
1673#endif
1674
1675int
1676smb_proc_unlink(struct dentry *dentry)
1677{
1678 struct smb_sb_info *server = server_from_dentry(dentry);
1679 int flag = 0;
1680 char *p;
1681 int result;
1682 struct smb_request *req;
1683
1684 result = -ENOMEM;
1685 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1686 goto out;
1687
1688 retry:
1689 p = smb_setup_header(req, SMBunlink, 1, 0);
1690 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
1691 result = smb_simple_encode_path(req, &p, dentry, NULL);
1692 if (result < 0)
1693 goto out_free;
1694 smb_setup_bcc(req, p);
1695
1696 if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
1697#if SMBFS_POSIX_UNLINK
1698 if (result == -EACCES && !flag) {
 1699 /* POSIX semantics require the read-only state
 1700 of a file to be ignored in unlink(). In the
 1701 SMB world an unlink() is refused on a
 1702 read-only file. To make things easier for
 1703 unix users we try to override the file's
 1704 permissions if the unlink fails with the
 1705 right error.
 1706 This introduces a race condition that could
 1707 lead to a file being written by someone who
 1708 shouldn't have access, but as far as I can
 1709 tell that is unavoidable */
1710
1711 /* remove RONLY attribute and try again */
1712 result = smb_set_rw(dentry,server);
1713 if (result == 0) {
1714 flag = 1;
1715 req->rq_flags = 0;
1716 goto retry;
1717 }
1718 }
1719#endif
1720 goto out_free;
1721 }
1722 result = 0;
1723
1724out_free:
1725 smb_rput(req);
1726out:
1727 return result;
1728}
1729
1730int
1731smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
1732{
1733 int result;
1734 struct smb_request *req;
1735
1736 result = -ENOMEM;
1737 if (! (req = smb_alloc_request(server, 0)))
1738 goto out;
1739
1740 smb_setup_header(req, SMBflush, 1, 0);
1741 WSET(req->rq_header, smb_vwv0, fileid);
1742 req->rq_flags |= SMB_REQ_NORETRY;
1743 result = smb_request_ok(req, SMBflush, 0, 0);
1744
1745 smb_rput(req);
1746out:
1747 return result;
1748}
1749
1750static int
1751smb_proc_trunc32(struct inode *inode, loff_t length)
1752{
1753 /*
 1754 * Writing 0 bytes is old-SMB magic for truncating files.
 1755 * MAX_NON_LFS should prevent this from being called with too
 1756 * large an offset.
1757 */
1758 return smb_proc_write(inode, length, 0, NULL);
1759}
1760
1761static int
1762smb_proc_trunc64(struct inode *inode, loff_t length)
1763{
1764 struct smb_sb_info *server = server_from_inode(inode);
1765 int result;
1766 char *param;
1767 char *data;
1768 struct smb_request *req;
1769
1770 result = -ENOMEM;
1771 if (! (req = smb_alloc_request(server, 14)))
1772 goto out;
1773
1774 param = req->rq_buffer;
1775 data = req->rq_buffer + 6;
1776
1777 /* FIXME: must we also set allocation size? winNT seems to do that */
1778 WSET(param, 0, SMB_I(inode)->fileid);
1779 WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
1780 WSET(param, 4, 0);
1781 LSET(data, 0, length);
1782
1783 req->rq_trans2_command = TRANSACT2_SETFILEINFO;
1784 req->rq_ldata = 8;
1785 req->rq_data = data;
1786 req->rq_lparm = 6;
1787 req->rq_parm = param;
1788 req->rq_flags |= SMB_REQ_NORETRY;
1789 result = smb_add_request(req);
1790 if (result < 0)
1791 goto out_free;
1792
1793 result = 0;
1794 if (req->rq_rcls != 0)
1795 result = smb_errno(req);
1796
1797out_free:
1798 smb_rput(req);
1799out:
1800 return result;
1801}
1802
1803static int
1804smb_proc_trunc95(struct inode *inode, loff_t length)
1805{
1806 struct smb_sb_info *server = server_from_inode(inode);
1807 int result = smb_proc_trunc32(inode, length);
1808
1809 /*
1810 * win9x doesn't appear to update the size immediately.
1811 * It will return the old file size after the truncate,
1812 * confusing smbfs. So we force an update.
1813 *
1814 * FIXME: is this still necessary?
1815 */
1816 smb_proc_flush(server, SMB_I(inode)->fileid);
1817 return result;
1818}
1819
1820static void
1821smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1822{
1823 memset(fattr, 0, sizeof(*fattr));
1824
1825 fattr->f_nlink = 1;
1826 fattr->f_uid = server->mnt->uid;
1827 fattr->f_gid = server->mnt->gid;
1828 fattr->f_unix = 0;
1829}
1830
1831static void
1832smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1833{
1834 if (fattr->f_unix)
1835 return;
1836
1837 fattr->f_mode = server->mnt->file_mode;
1838 if (fattr->attr & aDIR) {
1839 fattr->f_mode = server->mnt->dir_mode;
1840 fattr->f_size = SMB_ST_BLKSIZE;
1841 }
1842 /* Check the read-only flag */
1843 if (fattr->attr & aRONLY)
1844 fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
1845
1846 /* How many 512 byte blocks do we need for this file? */
1847 fattr->f_blocks = 0;
1848 if (fattr->f_size != 0)
1849 fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
1850 return;
1851}
1852
1853void
1854smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
1855 struct super_block *sb)
1856{
1857 smb_init_dirent(server, fattr);
1858 fattr->attr = aDIR;
1859 fattr->f_ino = 2; /* traditional root inode number */
1860 fattr->f_mtime = current_fs_time(sb);
1861 smb_finish_dirent(server, fattr);
1862}
1863
1864/*
1865 * Decode a dirent for old protocols
1866 *
1867 * qname is filled with the decoded, and possibly translated, name.
1868 * fattr receives decoded attributes
1869 *
1870 * Bugs Noted:
1871 * (1) Pathworks servers may pad the name with extra spaces.
1872 */
1873static char *
1874smb_decode_short_dirent(struct smb_sb_info *server, char *p,
1875 struct qstr *qname, struct smb_fattr *fattr,
1876 unsigned char *name_buf)
1877{
1878 int len;
1879
1880 /*
1881 * SMB doesn't have a concept of inode numbers ...
1882 */
1883 smb_init_dirent(server, fattr);
1884 fattr->f_ino = 0; /* FIXME: do we need this? */
1885
1886 p += SMB_STATUS_SIZE; /* reserved (search_status) */
1887 fattr->attr = *p;
1888 fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
1889 fattr->f_mtime.tv_nsec = 0;
1890 fattr->f_size = DVAL(p, 5);
1891 fattr->f_ctime = fattr->f_mtime;
1892 fattr->f_atime = fattr->f_mtime;
1893 qname->name = p + 9;
1894 len = strnlen(qname->name, 12);
1895
1896 /*
1897 * Trim trailing blanks for Pathworks servers
1898 */
1899 while (len > 2 && qname->name[len-1] == ' ')
1900 len--;
1901
1902 smb_finish_dirent(server, fattr);
1903
1904#if 0
1905 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
1906 allow the flag to be set anyway. It kills const. Remove? */
1907 switch (server->opt.case_handling) {
1908 case SMB_CASE_UPPER:
1909 str_upper(entry->name, len);
1910 break;
1911 case SMB_CASE_LOWER:
1912 str_lower(entry->name, len);
1913 break;
1914 default:
1915 break;
1916 }
1917#endif
1918
1919 qname->len = 0;
1920 len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
1921 qname->name, len,
1922 server->remote_nls, server->local_nls);
1923 if (len > 0) {
1924 qname->len = len;
1925 qname->name = name_buf;
1926 DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
1927 }
1928
1929 return p + 22;
1930}
1931
1932/*
1933 * This routine is used to read in directory entries from the network.
1934 * Note that it is for short directory name seeks, i.e.: protocol <
1935 * SMB_PROTOCOL_LANMAN2
1936 */
1937static int
1938smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
1939 struct smb_cache_control *ctl)
1940{
1941 struct dentry *dir = filp->f_path.dentry;
1942 struct smb_sb_info *server = server_from_dentry(dir);
1943 struct qstr qname;
1944 struct smb_fattr fattr;
1945 char *p;
1946 int result;
1947 int i, first, entries_seen, entries;
1948 int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
1949 __u16 bcc;
1950 __u16 count;
1951 char status[SMB_STATUS_SIZE];
1952 static struct qstr mask = {
1953 .name = "*.*",
1954 .len = 3,
1955 };
1956 unsigned char *last_status;
1957 struct smb_request *req;
1958 unsigned char *name_buf;
1959
1960 VERBOSE("%s/%s\n", DENTRY_PATH(dir));
1961
1962 lock_kernel();
1963
1964 result = -ENOMEM;
1965 if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
1966 goto out;
1967
1968 first = 1;
1969 entries = 0;
1970 entries_seen = 2; /* implicit . and .. */
1971
1972 result = -ENOMEM;
1973 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
1974 goto out_name;
1975
1976 while (1) {
1977 p = smb_setup_header(req, SMBsearch, 2, 0);
1978 WSET(req->rq_header, smb_vwv0, entries_asked);
1979 WSET(req->rq_header, smb_vwv1, aDIR);
1980 if (first == 1) {
1981 result = smb_simple_encode_path(req, &p, dir, &mask);
1982 if (result < 0)
1983 goto out_free;
1984 if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
1985 result = -ENAMETOOLONG;
1986 goto out_free;
1987 }
1988 *p++ = 5;
1989 WSET(p, 0, 0);
1990 p += 2;
1991 first = 0;
1992 } else {
1993 if (p + 5 + SMB_STATUS_SIZE >
1994 (char *)req->rq_buffer + req->rq_bufsize) {
1995 result = -ENAMETOOLONG;
1996 goto out_free;
1997 }
1998
1999 *p++ = 4;
2000 *p++ = 0;
2001 *p++ = 5;
2002 WSET(p, 0, SMB_STATUS_SIZE);
2003 p += 2;
2004 memcpy(p, status, SMB_STATUS_SIZE);
2005 p += SMB_STATUS_SIZE;
2006 }
2007
2008 smb_setup_bcc(req, p);
2009
2010 result = smb_request_ok(req, SMBsearch, 1, -1);
2011 if (result < 0) {
2012 if ((req->rq_rcls == ERRDOS) &&
2013 (req->rq_err == ERRnofiles))
2014 break;
2015 goto out_free;
2016 }
2017 count = WVAL(req->rq_header, smb_vwv0);
2018 if (count <= 0)
2019 break;
2020
2021 result = -EIO;
2022 bcc = smb_bcc(req->rq_header);
2023 if (bcc != count * SMB_DIRINFO_SIZE + 3)
2024 goto out_free;
2025 p = req->rq_buffer + 3;
2026
2027
2028 /* Make sure the response fits in the buffer. Fixed sized
2029 entries means we don't have to check in the decode loop. */
2030
2031 last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
2032
2033 if (last_status + SMB_DIRINFO_SIZE >=
2034 req->rq_buffer + req->rq_bufsize) {
2035 printk(KERN_ERR "smb_proc_readdir_short: "
2036 "last dir entry outside buffer! "
2037 "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status,
2038 req->rq_bufsize, req->rq_buffer);
2039 goto out_free;
2040 }
2041
2042 /* Read the last entry into the status field. */
2043 memcpy(status, last_status, SMB_STATUS_SIZE);
2044
2045
2046 /* Now we are ready to parse smb directory entries. */
2047
2048 for (i = 0; i < count; i++) {
2049 p = smb_decode_short_dirent(server, p,
2050 &qname, &fattr, name_buf);
2051 if (qname.len == 0)
2052 continue;
2053
2054 if (entries_seen == 2 && qname.name[0] == '.') {
2055 if (qname.len == 1)
2056 continue;
2057 if (qname.name[1] == '.' && qname.len == 2)
2058 continue;
2059 }
2060 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2061 &qname, &fattr))
2062 ; /* stop reading? */
2063 entries_seen++;
2064 }
2065 }
2066 result = entries;
2067
2068out_free:
2069 smb_rput(req);
2070out_name:
2071 kfree(name_buf);
2072out:
2073 unlock_kernel();
2074 return result;
2075}
2076
2077static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
2078{
2079 u64 size, disk_bytes;
2080
2081 /* FIXME: verify nls support. all is sent as utf8? */
2082
2083 fattr->f_unix = 1;
2084 fattr->f_mode = 0;
2085
2086 /* FIXME: use the uniqueID from the remote instead? */
2087 /* 0 L file size in bytes */
2088 /* 8 L file size on disk in bytes (block count) */
2089 /* 40 L uid */
2090 /* 48 L gid */
2091 /* 56 W file type */
2092 /* 60 L devmajor */
2093 /* 68 L devminor */
2094 /* 76 L unique ID (inode) */
2095 /* 84 L permissions */
2096 /* 92 L link count */
2097
2098 size = LVAL(p, 0);
2099 disk_bytes = LVAL(p, 8);
2100
2101 /*
2102 * Some samba versions round up on-disk byte usage
2103 * to 1MB boundaries, making it useless. When seeing
2104 * that, use the size instead.
2105 */
2106 if (!(disk_bytes & 0xfffff))
2107 disk_bytes = size+511;
2108
2109 fattr->f_size = size;
2110 fattr->f_blocks = disk_bytes >> 9;
2111 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
2112 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
2113 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
2114
2115 if (server->mnt->flags & SMB_MOUNT_UID)
2116 fattr->f_uid = server->mnt->uid;
2117 else
2118 fattr->f_uid = LVAL(p, 40);
2119
2120 if (server->mnt->flags & SMB_MOUNT_GID)
2121 fattr->f_gid = server->mnt->gid;
2122 else
2123 fattr->f_gid = LVAL(p, 48);
2124
2125 fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
2126
2127 if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
2128 __u64 major = LVAL(p, 60);
2129 __u64 minor = LVAL(p, 68);
2130
2131 fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
2132 if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
2133 MINOR(fattr->f_rdev) != (minor & 0xffffffff))
2134 fattr->f_rdev = 0;
2135 }
2136
2137 fattr->f_mode |= LVAL(p, 84);
2138
2139 if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
2140 (S_ISDIR(fattr->f_mode)) )
2141 fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
2142 else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
2143 !(S_ISDIR(fattr->f_mode)) )
2144 fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
2145 (fattr->f_mode & S_IFMT);
2146
2147}
2148
2149/*
2150 * Interpret a long filename structure using the specified info level:
2151 * level 1 for anything below NT1 protocol
2152 * level 260 for NT1 protocol
2153 *
2154 * qname is filled with the decoded, and possibly translated, name
2155 * fattr receives decoded attributes.
2156 *
2157 * Bugs Noted:
2158 * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
2159 */
2160static char *
2161smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
2162 struct qstr *qname, struct smb_fattr *fattr,
2163 unsigned char *name_buf)
2164{
2165 char *result;
2166 unsigned int len = 0;
2167 int n;
2168 __u16 date, time;
2169 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
2170
2171 /*
2172 * SMB doesn't have a concept of inode numbers ...
2173 */
2174 smb_init_dirent(server, fattr);
2175 fattr->f_ino = 0; /* FIXME: do we need this? */
2176
2177 switch (level) {
2178 case 1:
2179 len = *((unsigned char *) p + 22);
2180 qname->name = p + 23;
2181 result = p + 24 + len;
2182
2183 date = WVAL(p, 0);
2184 time = WVAL(p, 2);
2185 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2186 fattr->f_ctime.tv_nsec = 0;
2187
2188 date = WVAL(p, 4);
2189 time = WVAL(p, 6);
2190 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2191 fattr->f_atime.tv_nsec = 0;
2192
2193 date = WVAL(p, 8);
2194 time = WVAL(p, 10);
2195 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2196 fattr->f_mtime.tv_nsec = 0;
2197 fattr->f_size = DVAL(p, 12);
2198 /* ULONG allocation size */
2199 fattr->attr = WVAL(p, 20);
2200
2201 VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
2202 p, len, len, qname->name);
2203 break;
2204 case 260:
2205 result = p + WVAL(p, 0);
2206 len = DVAL(p, 60);
2207 if (len > 255) len = 255;
2208 /* NT4 null terminates, unless we are using unicode ... */
2209 qname->name = p + 94;
2210 if (!unicode && len && qname->name[len-1] == '\0')
2211 len--;
2212
2213 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
2214 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
2215 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
2216 /* change time (32) */
2217 fattr->f_size = LVAL(p, 40);
2218 /* alloc size (48) */
2219 fattr->attr = DVAL(p, 56);
2220
2221 VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
2222 p, len, len, qname->name);
2223 break;
2224 case SMB_FIND_FILE_UNIX:
2225 result = p + WVAL(p, 0);
2226 qname->name = p + 108;
2227
2228 len = strlen(qname->name);
2229 /* FIXME: should we check the length?? */
2230
2231 p += 8;
2232 smb_decode_unix_basic(fattr, server, p);
2233 VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
2234 p, len, len, qname->name);
2235 break;
2236 default:
2237 PARANOIA("Unknown info level %d\n", level);
2238 result = p + WVAL(p, 0);
2239 goto out;
2240 }
2241
2242 smb_finish_dirent(server, fattr);
2243
2244#if 0
2245 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
2246 allow the flag to be set anyway. Remove? */
2247 switch (server->opt.case_handling) {
2248 case SMB_CASE_UPPER:
2249 str_upper(qname->name, len);
2250 break;
2251 case SMB_CASE_LOWER:
2252 str_lower(qname->name, len);
2253 break;
2254 default:
2255 break;
2256 }
2257#endif
2258
2259 qname->len = 0;
2260 n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
2261 qname->name, len,
2262 server->remote_nls, server->local_nls);
2263 if (n > 0) {
2264 qname->len = n;
2265 qname->name = name_buf;
2266 }
2267
2268out:
2269 return result;
2270}
2271
2272/* findfirst/findnext flags */
2273#define SMB_CLOSE_AFTER_FIRST (1<<0)
2274#define SMB_CLOSE_IF_END (1<<1)
2275#define SMB_REQUIRE_RESUME_KEY (1<<2)
2276#define SMB_CONTINUE_BIT (1<<3)
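
/*
 * Layout sketch of the trans2 parameter blocks built below (byte
 * offsets; W = 16-bit word, D = 32-bit dword):
 *
 *	FINDFIRST			FINDNEXT
 *	 0 W search attributes		 0 W search handle
 *	 2 W max matches		 2 W max matches
 *	 4 W flags (see above)		 4 W info level
 *	 6 W info level			 6 D resume key
 *	 8 D storage type		10 W flags (see above)
 *	12   search mask, NUL-ended	12   resume file name
 */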
2277
2278/*
2279 * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
2280 * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
 2281 * go there for advice.
2282 *
2283 * Bugs Noted:
2284 * (1) When using Info Level 1 Win NT 4.0 truncates directory listings
2285 * for certain patterns of names and/or lengths. The breakage pattern
2286 * is completely reproducible and can be toggled by the creation of a
2287 * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
2288 */
2289static int
2290smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
2291 struct smb_cache_control *ctl)
2292{
2293 struct dentry *dir = filp->f_path.dentry;
2294 struct smb_sb_info *server = server_from_dentry(dir);
2295 struct qstr qname;
2296 struct smb_fattr fattr;
2297
2298 unsigned char *p, *lastname;
2299 char *mask, *param;
2300 __u16 command;
2301 int first, entries_seen;
2302
2303 /* Both NT and OS/2 accept info level 1 (but see note below). */
2304 int info_level = 260;
2305 const int max_matches = 512;
2306
2307 unsigned int ff_searchcount = 0;
2308 unsigned int ff_eos = 0;
2309 unsigned int ff_lastname = 0;
2310 unsigned int ff_dir_handle = 0;
2311 unsigned int loop_count = 0;
2312 unsigned int mask_len, i;
2313 int result;
2314 struct smb_request *req;
2315 unsigned char *name_buf;
2316 static struct qstr star = {
2317 .name = "*",
2318 .len = 1,
2319 };
2320
2321 lock_kernel();
2322
2323 /*
2324 * We always prefer unix style. Use info level 1 for older
2325 * servers that don't do 260.
2326 */
2327 if (server->opt.capabilities & SMB_CAP_UNIX)
2328 info_level = SMB_FIND_FILE_UNIX;
2329 else if (server->opt.protocol < SMB_PROTOCOL_NT1)
2330 info_level = 1;
2331
2332 result = -ENOMEM;
2333 if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
2334 goto out;
2335 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
2336 goto out_name;
2337 param = req->rq_buffer;
2338
2339 /*
2340 * Encode the initial path
2341 */
2342 mask = param + 12;
2343
2344 result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
2345 if (result <= 0)
2346 goto out_free;
2347 mask_len = result - 1; /* mask_len is strlen, not #bytes */
2348 result = 0;
2349 first = 1;
2350 VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
2351
2352 entries_seen = 2;
2353 ff_eos = 0;
2354
2355 while (ff_eos == 0) {
2356 loop_count += 1;
2357 if (loop_count > 10) {
2358 printk(KERN_WARNING "smb_proc_readdir_long: "
2359 "Looping in FIND_NEXT??\n");
2360 result = -EIO;
2361 break;
2362 }
2363
2364 if (first != 0) {
2365 command = TRANSACT2_FINDFIRST;
2366 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2367 WSET(param, 2, max_matches); /* max count */
2368 WSET(param, 4, SMB_CLOSE_IF_END);
2369 WSET(param, 6, info_level);
2370 DSET(param, 8, 0);
2371 } else {
2372 command = TRANSACT2_FINDNEXT;
2373
2374 VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
2375 ff_dir_handle, ff_lastname, mask_len, mask);
2376
2377 WSET(param, 0, ff_dir_handle); /* search handle */
2378 WSET(param, 2, max_matches); /* max count */
2379 WSET(param, 4, info_level);
2380 DSET(param, 6, 0);
2381 WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
2382 }
2383
2384 req->rq_trans2_command = command;
2385 req->rq_ldata = 0;
2386 req->rq_data = NULL;
2387 req->rq_lparm = 12 + mask_len + 1;
2388 req->rq_parm = param;
2389 req->rq_flags = 0;
2390 result = smb_add_request(req);
2391 if (result < 0) {
2392 PARANOIA("error=%d, breaking\n", result);
2393 break;
2394 }
2395
2396 if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
2397 /* a damn Win95 bug - sometimes it clags if you
2398 ask it too fast */
2399 schedule_timeout_interruptible(msecs_to_jiffies(200));
2400 continue;
2401 }
2402
2403 if (req->rq_rcls != 0) {
2404 result = smb_errno(req);
2405 PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
2406 mask, result, req->rq_rcls, req->rq_err);
2407 break;
2408 }
2409
2410 /* parse out some important return info */
2411 if (first != 0) {
2412 ff_dir_handle = WVAL(req->rq_parm, 0);
2413 ff_searchcount = WVAL(req->rq_parm, 2);
2414 ff_eos = WVAL(req->rq_parm, 4);
2415 ff_lastname = WVAL(req->rq_parm, 8);
2416 } else {
2417 ff_searchcount = WVAL(req->rq_parm, 0);
2418 ff_eos = WVAL(req->rq_parm, 2);
2419 ff_lastname = WVAL(req->rq_parm, 6);
2420 }
2421
2422 if (ff_searchcount == 0)
2423 break;
2424
2425 /* Now we are ready to parse smb directory entries. */
2426
2427 /* point to the data bytes */
2428 p = req->rq_data;
2429 for (i = 0; i < ff_searchcount; i++) {
2430 /* make sure we stay within the buffer */
2431 if (p >= req->rq_data + req->rq_ldata) {
2432 printk(KERN_ERR "smb_proc_readdir_long: "
2433 "dirent pointer outside buffer! "
2434 "%p %d@%p\n",
2435 p, req->rq_ldata, req->rq_data);
2436 result = -EIO; /* always a comm. error? */
2437 goto out_free;
2438 }
2439
2440 p = smb_decode_long_dirent(server, p, info_level,
2441 &qname, &fattr, name_buf);
2442
2443 /* ignore . and .. from the server */
2444 if (entries_seen == 2 && qname.name[0] == '.') {
2445 if (qname.len == 1)
2446 continue;
2447 if (qname.name[1] == '.' && qname.len == 2)
2448 continue;
2449 }
2450
2451 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2452 &qname, &fattr))
2453 ; /* stop reading? */
2454 entries_seen++;
2455 }
2456
2457 VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
2458
2459 /*
2460 * We might need the lastname for continuations.
2461 *
2462 * Note that some servers (win95?) point to the filename and
2463 * others (NT4, Samba using NT1) to the dir entry. We assume
2464 * here that those who do not point to a filename do not need
2465 * this info to continue the listing.
2466 *
2467 * OS/2 needs this and talks infolevel 1.
2468 * NetApps want lastname with infolevel 260.
 2469 * win2k wants lastname with infolevel 260, and points to
2470 * the record not to the name.
2471 * Samba+CifsUnixExt doesn't need lastname.
2472 *
2473 * Both are happy if we return the data they point to. So we do.
2474 * (FIXME: above is not true with win2k)
2475 */
2476 mask_len = 0;
2477 if (info_level != SMB_FIND_FILE_UNIX &&
2478 ff_lastname > 0 && ff_lastname < req->rq_ldata) {
2479 lastname = req->rq_data + ff_lastname;
2480
2481 switch (info_level) {
2482 case 260:
2483 mask_len = req->rq_ldata - ff_lastname;
2484 break;
2485 case 1:
2486 /* lastname points to a length byte */
2487 mask_len = *lastname++;
2488 if (ff_lastname + 1 + mask_len > req->rq_ldata)
2489 mask_len = req->rq_ldata - ff_lastname - 1;
2490 break;
2491 }
2492
2493 /*
2494 * Update the mask string for the next message.
2495 */
2496 if (mask_len > 255)
2497 mask_len = 255;
2498 if (mask_len)
2499 strncpy(mask, lastname, mask_len);
2500 }
2501 mask_len = strnlen(mask, mask_len);
2502 VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
2503 mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
2504
2505 first = 0;
2506 loop_count = 0;
2507 }
2508
2509out_free:
2510 smb_rput(req);
2511out_name:
2512 kfree(name_buf);
2513out:
2514 unlock_kernel();
2515 return result;
2516}
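
The loop above drives the TRANS2 FINDFIRST/FINDNEXT continuation: the first
reply hands back a directory handle plus a "lastname" offset, and each later
FINDNEXT resumes from the mask copied out of the previous reply. Below is a
minimal user-space sketch of that resume step; the struct and function names
are illustrative, not the smbfs API.

#include <stdio.h>
#include <string.h>

/*
 * Hypothetical sketch of the FINDFIRST/FINDNEXT resume-mask logic:
 * after each response, copy the server-supplied "lastname" out of the
 * data area into the mask buffer so the next FINDNEXT continues where
 * the previous reply stopped (info level 260 case, where lastname
 * points at the trailing filename).
 */
struct find_reply {
	const char *data;   /* response data area */
	int ldata;          /* length of the data area */
	int lastname;       /* offset of the continuation name, or 0 */
};

static int update_mask(char *mask, size_t masksize, const struct find_reply *r)
{
	int len;

	if (r->lastname <= 0 || r->lastname >= r->ldata)
		return 0;                     /* nothing to resume from */

	len = r->ldata - r->lastname;         /* name runs to end of data */
	if (len > (int)masksize - 1)
		len = (int)masksize - 1;
	memcpy(mask, r->data + r->lastname, len);
	mask[len] = '\0';
	return len;
}

int main(void)
{
	char mask[256] = "*";
	struct find_reply r = { "ignored....zfile.txt", 20, 11 };

	update_mask(mask, sizeof(mask), &r);
	printf("next FINDNEXT mask: %s\n", mask);  /* "zfile.txt" */
	return 0;
}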
2517
2518/*
2519 * This version uses the trans2 TRANSACT2_FINDFIRST message
2520 * to get the attribute data.
2521 *
2522 * Bugs Noted:
2523 */
2524static int
2525smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
2526 struct smb_fattr *fattr)
2527{
2528 char *param, *mask;
2529 __u16 date, time;
2530 int mask_len, result;
2531 struct smb_request *req;
2532
2533 result = -ENOMEM;
2534 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2535 goto out;
2536 param = req->rq_buffer;
2537 mask = param + 12;
2538
2539 mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry, NULL);
2540 if (mask_len < 0) {
2541 result = mask_len;
2542 goto out_free;
2543 }
2544 VERBOSE("name=%s, len=%d\n", mask, mask_len);
2545 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2546 WSET(param, 2, 1); /* max count */
2547 WSET(param, 4, 1); /* close after this call */
2548 WSET(param, 6, 1); /* info_level */
2549 DSET(param, 8, 0);
2550
2551 req->rq_trans2_command = TRANSACT2_FINDFIRST;
2552 req->rq_ldata = 0;
2553 req->rq_data = NULL;
2554 req->rq_lparm = 12 + mask_len;
2555 req->rq_parm = param;
2556 req->rq_flags = 0;
2557 result = smb_add_request(req);
2558 if (result < 0)
2559 goto out_free;
2560 if (req->rq_rcls != 0) {
2561 result = smb_errno(req);
2562#ifdef SMBFS_PARANOIA
2563 if (result != -ENOENT)
2564 PARANOIA("error for %s, rcls=%d, err=%d\n",
2565 mask, req->rq_rcls, req->rq_err);
2566#endif
2567 goto out_free;
2568 }
2569 /* Make sure we got enough data ... */
2570 result = -EINVAL;
2571 if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
2572 PARANOIA("bad result for %s, len=%d, count=%d\n",
2573 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
2574 goto out_free;
2575 }
2576
2577 /*
2578 * Decode the response into the fattr ...
2579 */
2580 date = WVAL(req->rq_data, 0);
2581 time = WVAL(req->rq_data, 2);
2582 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2583 fattr->f_ctime.tv_nsec = 0;
2584
2585 date = WVAL(req->rq_data, 4);
2586 time = WVAL(req->rq_data, 6);
2587 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2588 fattr->f_atime.tv_nsec = 0;
2589
2590 date = WVAL(req->rq_data, 8);
2591 time = WVAL(req->rq_data, 10);
2592 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2593 fattr->f_mtime.tv_nsec = 0;
2594 VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
2595 mask, date, time, fattr->f_mtime.tv_sec);
2596 fattr->f_size = DVAL(req->rq_data, 12);
2597 /* ULONG allocation size */
2598 fattr->attr = WVAL(req->rq_data, 20);
2599 result = 0;
2600
2601out_free:
2602 smb_rput(req);
2603out:
2604 return result;
2605}
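
The getattr paths keep converting 16-bit DOS date/time words through
date_dos2unix(). The bit layout is the standard FAT/SMB encoding; a hedged
sketch of the conversion follows. The real helper also folds in the server's
timezone offset, which this illustration omits.

#include <stdio.h>
#include <time.h>

/*
 * Standard FAT/SMB 16-bit date/time layout:
 *   date: bits 15-9 year-1980, 8-5 month (1-12), 4-0 day (1-31)
 *   time: bits 15-11 hour, 10-5 minute, 4-0 second/2
 * Illustrative only; the kernel's date_dos2unix() also applies the
 * server's UTC offset.
 */
static time_t dos2unix(unsigned short date, unsigned short time)
{
	struct tm tm = {0};

	tm.tm_year = ((date >> 9) & 0x7f) + 80;   /* years since 1900 */
	tm.tm_mon  = ((date >> 5) & 0x0f) - 1;
	tm.tm_mday = date & 0x1f;
	tm.tm_hour = (time >> 11) & 0x1f;
	tm.tm_min  = (time >> 5) & 0x3f;
	tm.tm_sec  = (time & 0x1f) * 2;           /* 2-second resolution */
	tm.tm_isdst = -1;
	return mktime(&tm);                       /* local-time based */
}

int main(void)
{
	/* 2003-09-13 17:18:50 */
	unsigned short d = ((2003 - 1980) << 9) | (9 << 5) | 13;
	unsigned short t = (17 << 11) | (18 << 5) | (50 / 2);
	printf("%s", ctime((time_t[]){ dos2unix(d, t) }));
	return 0;
}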
2606
2607static int
2608smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
2609 struct smb_fattr *fattr)
2610{
2611 int result;
2612 char *p;
2613 struct smb_request *req;
2614
2615 result = -ENOMEM;
2616 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2617 goto out;
2618
2619 p = smb_setup_header(req, SMBgetatr, 0, 0);
2620 result = smb_simple_encode_path(req, &p, dir, NULL);
2621 if (result < 0)
2622 goto out_free;
2623 smb_setup_bcc(req, p);
2624
2625 if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
2626 goto out_free;
2627 fattr->attr = WVAL(req->rq_header, smb_vwv0);
2628 fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
2629 fattr->f_mtime.tv_nsec = 0;
2630 fattr->f_size = DVAL(req->rq_header, smb_vwv3);
2631 fattr->f_ctime = fattr->f_mtime;
2632 fattr->f_atime = fattr->f_mtime;
2633#ifdef SMBFS_DEBUG_TIMESTAMP
2634 printk("getattr_core: %s/%s, mtime=%ld\n",
2635 DENTRY_PATH(dir), fattr->f_mtime);
2636#endif
2637 result = 0;
2638
2639out_free:
2640 smb_rput(req);
2641out:
2642 return result;
2643}
2644
2645/*
2646 * Bugs Noted:
2647 * (1) Win 95 swaps the date and time fields in the standard info level.
2648 */
2649static int
2650smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
2651 struct smb_request *req, int infolevel)
2652{
2653 char *p, *param;
2654 int result;
2655
2656 param = req->rq_buffer;
2657 WSET(param, 0, infolevel);
2658 DSET(param, 2, 0);
2659 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
2660 if (result < 0)
2661 goto out;
2662 p = param + 6 + result;
2663
2664 req->rq_trans2_command = TRANSACT2_QPATHINFO;
2665 req->rq_ldata = 0;
2666 req->rq_data = NULL;
2667 req->rq_lparm = p - param;
2668 req->rq_parm = param;
2669 req->rq_flags = 0;
2670 result = smb_add_request(req);
2671 if (result < 0)
2672 goto out;
2673 if (req->rq_rcls != 0) {
2674 VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
2675 &param[6], result, req->rq_rcls, req->rq_err);
2676 result = smb_errno(req);
2677 goto out;
2678 }
2679 result = -ENOENT;
2680 if (req->rq_ldata < 22) {
2681 PARANOIA("not enough data for %s, len=%d\n",
2682 &param[6], req->rq_ldata);
2683 goto out;
2684 }
2685
2686 result = 0;
2687out:
2688 return result;
2689}
2690
2691static int
2692smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
2693 struct smb_fattr *attr)
2694{
2695 u16 date, time;
2696 int off_date = 0, off_time = 2;
2697 int result;
2698 struct smb_request *req;
2699
2700 result = -ENOMEM;
2701 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2702 goto out;
2703
2704 result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
2705 if (result < 0)
2706 goto out_free;
2707
2708 /*
2709 * Kludge alert: Win 95 swaps the date and time field,
2710 * contrary to the CIFS docs and Win NT practice.
2711 */
2712 if (server->mnt->flags & SMB_MOUNT_WIN95) {
2713 off_date = 2;
2714 off_time = 0;
2715 }
2716 date = WVAL(req->rq_data, off_date);
2717 time = WVAL(req->rq_data, off_time);
2718 attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2719 attr->f_ctime.tv_nsec = 0;
2720
2721 date = WVAL(req->rq_data, 4 + off_date);
2722 time = WVAL(req->rq_data, 4 + off_time);
2723 attr->f_atime.tv_sec = date_dos2unix(server, date, time);
2724 attr->f_atime.tv_nsec = 0;
2725
2726 date = WVAL(req->rq_data, 8 + off_date);
2727 time = WVAL(req->rq_data, 8 + off_time);
2728 attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2729 attr->f_mtime.tv_nsec = 0;
2730#ifdef SMBFS_DEBUG_TIMESTAMP
2731 printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
2732 DENTRY_PATH(dir), date, time, attr->f_mtime);
2733#endif
2734 attr->f_size = DVAL(req->rq_data, 12);
2735 attr->attr = WVAL(req->rq_data, 20);
2736
2737out_free:
2738 smb_rput(req);
2739out:
2740 return result;
2741}
2742
2743static int
2744smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
2745 struct smb_fattr *attr)
2746{
2747 struct smb_request *req;
2748 int result;
2749
2750 result = -ENOMEM;
2751 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2752 goto out;
2753
2754 result = smb_proc_getattr_trans2(server, dir, req,
2755 SMB_QUERY_FILE_ALL_INFO);
2756 if (result < 0)
2757 goto out_free;
2758
2759 attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
2760 attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
2761 attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
2762 /* change (24) */
2763 attr->attr = WVAL(req->rq_data, 32);
2764 /* pad? (34) */
2765 /* allocated size (40) */
2766 attr->f_size = LVAL(req->rq_data, 48);
2767
2768out_free:
2769 smb_rput(req);
2770out:
2771 return result;
2772}
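
The ALL_INFO level returns 64-bit NT timestamps (100-nanosecond ticks since
1601-01-01), which smb_ntutc2unixutc() folds into Unix epoch time. The
arithmetic is a fixed offset and scale; here is an illustrative version using
the standard epoch difference of 11644473600 seconds, not the kernel
implementation itself.

#include <stdio.h>
#include <stdint.h>

/*
 * NT timestamps count 100ns ticks since 1601-01-01; the Unix epoch
 * starts 11644473600 seconds later.  Sketch of what an
 * smb_ntutc2unixutc()-style helper has to compute.
 */
#define NT_TICKS_PER_SEC   10000000ULL
#define NT_UNIX_EPOCH_DIFF 11644473600ULL

struct unix_ts { long long sec; long nsec; };

static struct unix_ts ntutc2unix(uint64_t ntutc)
{
	struct unix_ts ts;

	ts.sec  = (long long)(ntutc / NT_TICKS_PER_SEC) - NT_UNIX_EPOCH_DIFF;
	ts.nsec = (long)(ntutc % NT_TICKS_PER_SEC) * 100;
	return ts;
}

int main(void)
{
	/* 116444736000000000 ticks == 1970-01-01 00:00:00 UTC */
	struct unix_ts ts = ntutc2unix(116444736000000000ULL);
	printf("sec=%lld nsec=%ld\n", ts.sec, ts.nsec);  /* sec=0 nsec=0 */
	return 0;
}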
2773
2774static int
2775smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
2776 struct smb_fattr *attr)
2777{
2778 struct smb_request *req;
2779 int result;
2780
2781 result = -ENOMEM;
2782 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2783 goto out;
2784
2785 result = smb_proc_getattr_trans2(server, dir, req,
2786 SMB_QUERY_FILE_UNIX_BASIC);
2787 if (result < 0)
2788 goto out_free;
2789
2790 smb_decode_unix_basic(attr, server, req->rq_data);
2791
2792out_free:
2793 smb_rput(req);
2794out:
2795 return result;
2796}
2797
2798static int
2799smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
2800 struct smb_fattr *attr)
2801{
2802 struct inode *inode = dir->d_inode;
2803 int result;
2804
2805 /* FIXME: why not use the "all" version? */
2806 result = smb_proc_getattr_trans2_std(server, dir, attr);
2807 if (result < 0)
2808 goto out;
2809
2810 /*
2811 * None of the getattr versions here can make win9x return the right
2812 * filesize if there are changes made to an open file.
2813 * A seek-to-end does return the right size, but we only need to do
2814 * that on files we have written.
2815 */
2816 if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
2817 smb_is_open(inode))
2818 {
2819 __u16 fileid = SMB_I(inode)->fileid;
2820 attr->f_size = smb_proc_seek(server, fileid, 2, 0);
2821 }
2822
2823out:
2824 return result;
2825}
2826
2827static int
2828smb_proc_ops_wait(struct smb_sb_info *server)
2829{
2830 int result;
2831
2832 result = wait_event_interruptible_timeout(server->conn_wq,
2833 server->conn_complete, 30*HZ);
2834
2835 if (!result || signal_pending(current))
2836 return -EIO;
2837
2838 return 0;
2839}
2840
2841static int
2842smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
2843 struct smb_fattr *fattr)
2844{
2845 int result;
2846
2847 if (smb_proc_ops_wait(server) < 0)
2848 return -EIO;
2849
2850 smb_init_dirent(server, fattr);
2851 result = server->ops->getattr(server, dir, fattr);
2852 smb_finish_dirent(server, fattr);
2853
2854 return result;
2855}
2856
2857static int
2858smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
2859 struct smb_cache_control *ctl)
2860{
2861 struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
2862
2863 if (smb_proc_ops_wait(server) < 0)
2864 return -EIO;
2865
2866 return server->ops->readdir(filp, dirent, filldir, ctl);
2867}
2868
2869int
2870smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
2871{
2872 struct smb_sb_info *server = server_from_dentry(dir);
2873 int result;
2874
2875 smb_init_dirent(server, fattr);
2876 result = server->ops->getattr(server, dir, fattr);
2877 smb_finish_dirent(server, fattr);
2878
2879 return result;
2880}
2881
2882
2883/*
2884 * Because of bugs in the core protocol, we use this only to set
2885 * attributes. See smb_proc_settime() below for timestamp handling.
2886 *
2887 * Bugs Noted:
2888 * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
2889 * with an undocumented error (ERRDOS code 50). Setting
2890 * mtime to 0 allows the attributes to be set.
2891 * (2) The extra parameters following the name string aren't
2892 * in the CIFS docs, but seem to be necessary for operation.
2893 */
2894static int
2895smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
2896 __u16 attr)
2897{
2898 char *p;
2899 int result;
2900 struct smb_request *req;
2901
2902 result = -ENOMEM;
2903 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2904 goto out;
2905
2906 p = smb_setup_header(req, SMBsetatr, 8, 0);
2907 WSET(req->rq_header, smb_vwv0, attr);
2908 DSET(req->rq_header, smb_vwv1, 0); /* mtime */
2909 WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
2910 WSET(req->rq_header, smb_vwv4, 0);
2911 WSET(req->rq_header, smb_vwv5, 0);
2912 WSET(req->rq_header, smb_vwv6, 0);
2913 WSET(req->rq_header, smb_vwv7, 0);
2914 result = smb_simple_encode_path(req, &p, dentry, NULL);
2915 if (result < 0)
2916 goto out_free;
2917 if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
2918 result = -ENAMETOOLONG;
2919 goto out_free;
2920 }
2921 *p++ = 4;
2922 *p++ = 0;
2923 smb_setup_bcc(req, p);
2924
2925 result = smb_request_ok(req, SMBsetatr, 0, 0);
2926 if (result < 0)
2927 goto out_free;
2928 result = 0;
2929
2930out_free:
2931 smb_rput(req);
2932out:
2933 return result;
2934}
2935
2936/*
2937 * Because of bugs in the trans2 setattr messages, we must set
2938 * attributes and timestamps separately. The core SMBsetatr
2939 * message seems to be the only reliable way to set attributes.
2940 */
2941int
2942smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
2943{
2944 struct smb_sb_info *server = server_from_dentry(dir);
2945 int result;
2946
2947 VERBOSE("setting %s/%s, open=%d\n",
2948 DENTRY_PATH(dir), smb_is_open(dir->d_inode));
2949 result = smb_proc_setattr_core(server, dir, fattr->attr);
2950 return result;
2951}
2952
2953/*
2954 * Sets the timestamps for a file opened with write permissions.
2955 */
2956static int
2957smb_proc_setattr_ext(struct smb_sb_info *server,
2958 struct inode *inode, struct smb_fattr *fattr)
2959{
2960 __u16 date, time;
2961 int result;
2962 struct smb_request *req;
2963
2964 result = -ENOMEM;
2965 if (! (req = smb_alloc_request(server, 0)))
2966 goto out;
2967
2968 smb_setup_header(req, SMBsetattrE, 7, 0);
2969 WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
2970 /* We don't change the creation time */
2971 WSET(req->rq_header, smb_vwv1, 0);
2972 WSET(req->rq_header, smb_vwv2, 0);
2973 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
2974 WSET(req->rq_header, smb_vwv3, date);
2975 WSET(req->rq_header, smb_vwv4, time);
2976 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
2977 WSET(req->rq_header, smb_vwv5, date);
2978 WSET(req->rq_header, smb_vwv6, time);
2979#ifdef SMBFS_DEBUG_TIMESTAMP
2980 printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
2981 date, time, fattr->f_mtime);
2982#endif
2983
2984 req->rq_flags |= SMB_REQ_NORETRY;
2985 result = smb_request_ok(req, SMBsetattrE, 0, 0);
2986 if (result < 0)
2987 goto out_free;
2988 result = 0;
2989out_free:
2990 smb_rput(req);
2991out:
2992 return result;
2993}
2994
2995/*
2996 * Bugs Noted:
2997 * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
2998 * set the file's attribute flags.
2999 */
3000static int
3001smb_proc_setattr_trans2(struct smb_sb_info *server,
3002 struct dentry *dir, struct smb_fattr *fattr)
3003{
3004 __u16 date, time;
3005 char *p, *param;
3006 int result;
3007 char data[26];
3008 struct smb_request *req;
3009
3010 result = -ENOMEM;
3011 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3012 goto out;
3013 param = req->rq_buffer;
3014
3015 WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */
3016 DSET(param, 2, 0);
3017 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
3018 if (result < 0)
3019 goto out_free;
3020 p = param + 6 + result;
3021
3022 WSET(data, 0, 0); /* creation time */
3023 WSET(data, 2, 0);
3024 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
3025 WSET(data, 4, date);
3026 WSET(data, 6, time);
3027 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
3028 WSET(data, 8, date);
3029 WSET(data, 10, time);
3030#ifdef SMBFS_DEBUG_TIMESTAMP
3031 printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
3032 DENTRY_PATH(dir), date, time, fattr->f_mtime);
3033#endif
3034 DSET(data, 12, 0); /* size */
3035 DSET(data, 16, 0); /* blksize */
3036 WSET(data, 20, 0); /* attr */
3037 DSET(data, 22, 0); /* ULONG EA size */
3038
3039 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3040 req->rq_ldata = 26;
3041 req->rq_data = data;
3042 req->rq_lparm = p - param;
3043 req->rq_parm = param;
3044 req->rq_flags = 0;
3045 result = smb_add_request(req);
3046 if (result < 0)
3047 goto out_free;
3048 result = 0;
3049 if (req->rq_rcls != 0)
3050 result = smb_errno(req);
3051
3052out_free:
3053 smb_rput(req);
3054out:
3055 return result;
3056}
3057
3058/*
3059 * ATTR_MODE 0x001
3060 * ATTR_UID 0x002
3061 * ATTR_GID 0x004
3062 * ATTR_SIZE 0x008
3063 * ATTR_ATIME 0x010
3064 * ATTR_MTIME 0x020
3065 * ATTR_CTIME 0x040
3066 * ATTR_ATIME_SET 0x080
3067 * ATTR_MTIME_SET 0x100
3068 * ATTR_FORCE 0x200
3069 * ATTR_ATTR_FLAG 0x400
3070 *
3071 * major/minor should only be set by mknod.
3072 */
3073int
3074smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
3075 unsigned int major, unsigned int minor)
3076{
3077 struct smb_sb_info *server = server_from_dentry(d);
3078 u64 nttime;
3079 char *p, *param;
3080 int result;
3081 char data[100];
3082 struct smb_request *req;
3083
3084 result = -ENOMEM;
3085 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3086 goto out;
3087 param = req->rq_buffer;
3088
3089 DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
3090
3091 WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
3092 DSET(param, 2, 0);
3093 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3094 if (result < 0)
3095 goto out_free;
3096 p = param + 6 + result;
3097
3098 /* 0 L file size in bytes */
3099 /* 8 L file size on disk in bytes (block count) */
3100 /* 40 L uid */
3101 /* 48 L gid */
3102 /* 56 W file type enum */
3103 /* 60 L devmajor */
3104 /* 68 L devminor */
3105 /* 76 L unique ID (inode) */
3106 /* 84 L permissions */
3107 /* 92 L link count */
3108 LSET(data, 0, SMB_SIZE_NO_CHANGE);
3109 LSET(data, 8, SMB_SIZE_NO_CHANGE);
3110 LSET(data, 16, SMB_TIME_NO_CHANGE);
3111 LSET(data, 24, SMB_TIME_NO_CHANGE);
3112 LSET(data, 32, SMB_TIME_NO_CHANGE);
3113 LSET(data, 40, SMB_UID_NO_CHANGE);
3114 LSET(data, 48, SMB_GID_NO_CHANGE);
3115 DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
3116 LSET(data, 60, major);
3117 LSET(data, 68, minor);
3118 LSET(data, 76, 0);
3119 LSET(data, 84, SMB_MODE_NO_CHANGE);
3120 LSET(data, 92, 0);
3121
3122 if (attr->ia_valid & ATTR_SIZE) {
3123 LSET(data, 0, attr->ia_size);
3124 LSET(data, 8, 0); /* can't set anyway */
3125 }
3126
3127 /*
3128 * FIXME: check that the conversion function is the correct one
3129 *
3130 * we can't set ctime but we might as well pass this to the server
3131 * and let it ignore it.
3132 */
3133 if (attr->ia_valid & ATTR_CTIME) {
3134 nttime = smb_unixutc2ntutc(attr->ia_ctime);
3135 LSET(data, 16, nttime);
3136 }
3137 if (attr->ia_valid & ATTR_ATIME) {
3138 nttime = smb_unixutc2ntutc(attr->ia_atime);
3139 LSET(data, 24, nttime);
3140 }
3141 if (attr->ia_valid & ATTR_MTIME) {
3142 nttime = smb_unixutc2ntutc(attr->ia_mtime);
3143 LSET(data, 32, nttime);
3144 }
3145
3146 if (attr->ia_valid & ATTR_UID) {
3147 LSET(data, 40, attr->ia_uid);
3148 }
3149 if (attr->ia_valid & ATTR_GID) {
3150 LSET(data, 48, attr->ia_gid);
3151 }
3152
3153 if (attr->ia_valid & ATTR_MODE) {
3154 LSET(data, 84, attr->ia_mode);
3155 }
3156
3157 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3158 req->rq_ldata = 100;
3159 req->rq_data = data;
3160 req->rq_lparm = p - param;
3161 req->rq_parm = param;
3162 req->rq_flags = 0;
3163 result = smb_add_request(req);
3164
3165out_free:
3166 smb_rput(req);
3167out:
3168 return result;
3169}
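
The function above serialises a 100-byte SET_FILE_UNIX_BASIC record at fixed
little-endian offsets, filling untouched fields with *_NO_CHANGE sentinels.
The sketch below mirrors that layout with plain byte-store helpers standing in
for the kernel's LSET/DSET macros; treating the sentinels as all-ones values
is an assumption made here, not taken from the source.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Little-endian stores, standing in for LSET (64-bit) / DSET (32-bit). */
static void put_le32(unsigned char *p, uint32_t v)
{
	p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
}

static void put_le64(unsigned char *p, uint64_t v)
{
	put_le32(p, (uint32_t)v);
	put_le32(p + 4, (uint32_t)(v >> 32));
}

#define NO_CHANGE64 0xffffffffffffffffULL   /* assumed sentinel value */

int main(void)
{
	unsigned char data[100];

	memset(data, 0, sizeof(data));          /* zeroes inode/nlink slots */
	put_le64(data,      NO_CHANGE64);       /*  0: file size            */
	put_le64(data + 8,  NO_CHANGE64);       /*  8: size on disk         */
	put_le64(data + 16, NO_CHANGE64);       /* 16: ctime                */
	put_le64(data + 24, NO_CHANGE64);       /* 24: atime                */
	put_le64(data + 32, NO_CHANGE64);       /* 32: mtime                */
	put_le64(data + 40, NO_CHANGE64);       /* 40: uid                  */
	put_le64(data + 48, NO_CHANGE64);       /* 48: gid                  */
	put_le64(data + 84, 0644);              /* 84: permissions          */
	printf("mode low byte at offset 84: %02x\n", data[84]);
	return 0;
}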
3170
3171
3172/*
3173 * Set the modify and access timestamps for a file.
3174 *
3175 * Incredibly enough, in all of SMB there is no message to allow
3176 * setting both attributes and timestamps at once.
3177 *
3178 * Bugs Noted:
3179 * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message
3180 * with info level 1 (INFO_STANDARD).
3181 * (2) Win 95 seems not to support setting directory timestamps.
3182 * (3) Under the core protocol apparently the only way to set the
3183 * timestamp is to open and close the file.
3184 */
3185int
3186smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
3187{
3188 struct smb_sb_info *server = server_from_dentry(dentry);
3189 struct inode *inode = dentry->d_inode;
3190 int result;
3191
3192 VERBOSE("setting %s/%s, open=%d\n",
3193 DENTRY_PATH(dentry), smb_is_open(inode));
3194
3195 /* setting the time on a Win95 server fails (tridge) */
3196 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 &&
3197 !(server->mnt->flags & SMB_MOUNT_WIN95)) {
3198 if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
3199 result = smb_proc_setattr_ext(server, inode, fattr);
3200 else
3201 result = smb_proc_setattr_trans2(server, dentry, fattr);
3202 } else {
3203 /*
3204 * Fail silently on directories ... timestamp can't be set?
3205 */
3206 result = 0;
3207 if (S_ISREG(inode->i_mode)) {
3208 /*
3209 * Set the mtime by opening and closing the file.
3210 * Note that the file is opened read-only, but this
3211 * still allows us to set the date (tridge)
3212 */
3213 result = -EACCES;
3214 if (!smb_is_open(inode))
3215 smb_proc_open(server, dentry, SMB_O_RDONLY);
3216 if (smb_is_open(inode)) {
3217 inode->i_mtime = fattr->f_mtime;
3218 result = smb_proc_close_inode(server, inode);
3219 }
3220 }
3221 }
3222
3223 return result;
3224}
3225
3226int
3227smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
3228{
3229 struct smb_sb_info *server = SMB_SB(dentry->d_sb);
3230 int result;
3231 char *p;
3232 long unit;
3233 struct smb_request *req;
3234
3235 result = -ENOMEM;
3236 if (! (req = smb_alloc_request(server, 0)))
3237 goto out;
3238
3239 smb_setup_header(req, SMBdskattr, 0, 0);
3240 if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
3241 goto out_free;
3242 p = SMB_VWV(req->rq_header);
3243 unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
3244 attr->f_blocks = WVAL(p, 0) * unit;
3245 attr->f_bsize = SMB_ST_BLKSIZE;
3246 attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
3247 result = 0;
3248
3249out_free:
3250 smb_rput(req);
3251out:
3252 return result;
3253}
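
SMBdskattr reports sizes in allocation units, and the code converts
(sectors per unit x bytes per sector) into statfs blocks with
SMB_ST_BLKSHIFT. A worked example of that arithmetic, assuming the usual
SMB_ST_BLKSIZE of 512 (a shift of 9):

#include <stdio.h>

/*
 * Worked example of the SMBdskattr arithmetic, assuming
 * SMB_ST_BLKSIZE = 512 / SMB_ST_BLKSHIFT = 9.  Sample values are
 * invented for illustration.
 */
int main(void)
{
	unsigned total_units = 65535;   /* WVAL(p, 0): total units      */
	unsigned sectors     = 64;      /* WVAL(p, 2): sectors per unit */
	unsigned sectsize    = 512;     /* WVAL(p, 4): bytes per sector */
	unsigned free_units  = 32768;   /* WVAL(p, 6): free units       */

	long unit = (long)(sectors * sectsize) >> 9;  /* 512B blocks/unit */
	printf("f_blocks=%ld f_bfree=%ld (x512 bytes)\n",
	       total_units * unit, free_units * unit);
	return 0;
}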
3254
3255int
3256smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
3257 char *buffer, int len)
3258{
3259 char *p, *param;
3260 int result;
3261 struct smb_request *req;
3262
3263 DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
3264
3265 result = -ENOMEM;
3266 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3267 goto out;
3268 param = req->rq_buffer;
3269
3270 WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
3271 DSET(param, 2, 0);
3272 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3273 if (result < 0)
3274 goto out_free;
3275 p = param + 6 + result;
3276
3277 req->rq_trans2_command = TRANSACT2_QPATHINFO;
3278 req->rq_ldata = 0;
3279 req->rq_data = NULL;
3280 req->rq_lparm = p - param;
3281 req->rq_parm = param;
3282 req->rq_flags = 0;
3283 result = smb_add_request(req);
3284 if (result < 0)
3285 goto out_free;
3286 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3287 &param[6], result, req->rq_rcls, req->rq_err);
3288
3289 /* copy data up to the \0 or buffer length */
3290 result = len;
3291 if (req->rq_ldata < len)
3292 result = req->rq_ldata;
3293 strncpy(buffer, req->rq_data, result);
3294
3295out_free:
3296 smb_rput(req);
3297out:
3298 return result;
3299}
3300
3301
3302/*
3303 * Create a symlink object called dentry which points to oldpath.
3304 * Samba does not permit dangling links but returns a suitable error message.
3305 */
3306int
3307smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
3308 const char *oldpath)
3309{
3310 char *p, *param;
3311 int result;
3312 struct smb_request *req;
3313
3314 result = -ENOMEM;
3315 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3316 goto out;
3317 param = req->rq_buffer;
3318
3319 WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
3320 DSET(param, 2, 0);
3321 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
3322 if (result < 0)
3323 goto out_free;
3324 p = param + 6 + result;
3325
3326 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3327 req->rq_ldata = strlen(oldpath) + 1;
3328 req->rq_data = (char *) oldpath;
3329 req->rq_lparm = p - param;
3330 req->rq_parm = param;
3331 req->rq_flags = 0;
3332 result = smb_add_request(req);
3333 if (result < 0)
3334 goto out_free;
3335
3336 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3337 &param[6], result, req->rq_rcls, req->rq_err);
3338 result = 0;
3339
3340out_free:
3341 smb_rput(req);
3342out:
3343 return result;
3344}
3345
3346/*
3347 * Create a hard link object called new_dentry which points to dentry.
3348 */
3349int
3350smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
3351 struct dentry *new_dentry)
3352{
3353 char *p, *param;
3354 int result;
3355 struct smb_request *req;
3356
3357 result = -ENOMEM;
3358 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3359 goto out;
3360 param = req->rq_buffer;
3361
3362 WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
3363 DSET(param, 2, 0);
3364 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
3365 new_dentry, NULL);
3366 if (result < 0)
3367 goto out_free;
3368 p = param + 6 + result;
3369
3370 /* Grr, pointless separation of parameters and data ... */
3371 req->rq_data = p;
3372 req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
3373 dentry, NULL);
3374
3375 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3376 req->rq_lparm = p - param;
3377 req->rq_parm = param;
3378 req->rq_flags = 0;
3379 result = smb_add_request(req);
3380 if (result < 0)
3381 goto out_free;
3382
3383 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3384 &param[6], result, req->rq_rcls, req->rq_err);
3385 result = 0;
3386
3387out_free:
3388 smb_rput(req);
3389out:
3390 return result;
3391}
3392
3393static int
3394smb_proc_query_cifsunix(struct smb_sb_info *server)
3395{
3396 int result;
3397 int major, minor;
3398 u64 caps;
3399 char param[2];
3400 struct smb_request *req;
3401
3402 result = -ENOMEM;
3403 if (! (req = smb_alloc_request(server, 100)))
3404 goto out;
3405
3406 WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
3407
3408 req->rq_trans2_command = TRANSACT2_QFSINFO;
3409 req->rq_ldata = 0;
3410 req->rq_data = NULL;
3411 req->rq_lparm = 2;
3412 req->rq_parm = param;
3413 req->rq_flags = 0;
3414 result = smb_add_request(req);
3415 if (result < 0)
3416 goto out_free;
3417
3418 if (req->rq_ldata < 12) {
3419 PARANOIA("Not enough data\n");
3420 goto out_free;
3421 }
3422 major = WVAL(req->rq_data, 0);
3423 minor = WVAL(req->rq_data, 2);
3424
3425 DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
3426 major, minor);
3427 /* FIXME: verify that we are ok with this major/minor? */
3428
3429 caps = LVAL(req->rq_data, 4);
3430 DEBUG1("Server capabilities 0x%016llx\n", caps);
3431
3432out_free:
3433 smb_rput(req);
3434out:
3435 return result;
3436}
3437
3438
3439static void
3440install_ops(struct smb_ops *dst, struct smb_ops *src)
3441{
3442 memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
3443}
3444
3445/* < LANMAN2 */
3446static struct smb_ops smb_ops_core =
3447{
3448 .read = smb_proc_read,
3449 .write = smb_proc_write,
3450 .readdir = smb_proc_readdir_short,
3451 .getattr = smb_proc_getattr_core,
3452 .truncate = smb_proc_trunc32,
3453};
3454
3455/* LANMAN2, OS/2, others? */
3456static struct smb_ops smb_ops_os2 =
3457{
3458 .read = smb_proc_read,
3459 .write = smb_proc_write,
3460 .readdir = smb_proc_readdir_long,
3461 .getattr = smb_proc_getattr_trans2_std,
3462 .truncate = smb_proc_trunc32,
3463};
3464
3465/* Win95, and possibly some NetApp versions too */
3466static struct smb_ops smb_ops_win95 =
3467{
3468 .read = smb_proc_read, /* does not support 12word readX */
3469 .write = smb_proc_write,
3470 .readdir = smb_proc_readdir_long,
3471 .getattr = smb_proc_getattr_95,
3472 .truncate = smb_proc_trunc95,
3473};
3474
3475/* Samba, NT4 and NT5 */
3476static struct smb_ops smb_ops_winNT =
3477{
3478 .read = smb_proc_readX,
3479 .write = smb_proc_writeX,
3480 .readdir = smb_proc_readdir_long,
3481 .getattr = smb_proc_getattr_trans2_all,
3482 .truncate = smb_proc_trunc64,
3483};
3484
3485/* Samba w/ unix extensions. Others? */
3486static struct smb_ops smb_ops_unix =
3487{
3488 .read = smb_proc_readX,
3489 .write = smb_proc_writeX,
3490 .readdir = smb_proc_readdir_long,
3491 .getattr = smb_proc_getattr_unix,
3492 /* FIXME: core/ext/time setattr needs to be cleaned up! */
3493 /* .setattr = smb_proc_setattr_unix, */
3494 .truncate = smb_proc_trunc64,
3495};
3496
3497/* Place holder until real ops are in place */
3498static struct smb_ops smb_ops_null =
3499{
3500 .readdir = smb_proc_readdir_null,
3501 .getattr = smb_proc_getattr_null,
3502};
3503
3504void smb_install_null_ops(struct smb_ops *ops)
3505{
3506 install_ops(ops, &smb_ops_null);
3507}
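
Everything above funnels through a per-dialect table of function pointers:
the negotiated protocol selects one of the smb_ops variants and callers
indirect through server->ops, with install_ops() doing a plain memcpy of the
pointer slots. The sketch below shows the same dispatch pattern in miniature;
all names in it are illustrative.

#include <stdio.h>
#include <string.h>

struct ops {
	int (*getattr)(const char *name);
};

static int getattr_core(const char *name)
{ printf("core getattr: %s\n", name); return 0; }

static int getattr_trans2(const char *name)
{ printf("trans2 getattr: %s\n", name); return 0; }

static const struct ops ops_core  = { getattr_core };
static const struct ops ops_winNT = { getattr_trans2 };

int main(void)
{
	struct ops server_ops;
	int protocol_is_nt1 = 1;   /* stand-in for the negotiated level */

	/* "install_ops": copy the chosen table's pointer slots */
	memcpy(&server_ops, protocol_is_nt1 ? &ops_winNT : &ops_core,
	       sizeof(server_ops));
	return server_ops.getattr("/foo");
}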
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e6..000000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003
3 */
4
5struct smb_request;
6struct sock;
7struct statfs;
8
9/* proc.c */
10extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
11extern __u32 smb_len(__u8 *p);
12extern int smb_get_rsize(struct smb_sb_info *server);
13extern int smb_get_wsize(struct smb_sb_info *server);
14extern int smb_errno(struct smb_request *req);
15extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
16extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
17extern int smb_open(struct dentry *dentry, int wish);
18extern int smb_close(struct inode *ino);
19extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
20extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
21extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
22extern int smb_proc_mkdir(struct dentry *dentry);
23extern int smb_proc_rmdir(struct dentry *dentry);
24extern int smb_proc_unlink(struct dentry *dentry);
25extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
26extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
27 struct super_block *sb);
28extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
29extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
30extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
31extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
32extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
33extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
34extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
35extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
36extern void smb_install_null_ops(struct smb_ops *ops);
37/* dir.c */
38extern const struct file_operations smb_dir_operations;
39extern const struct inode_operations smb_dir_inode_operations;
40extern const struct inode_operations smb_dir_inode_operations_unix;
41extern void smb_new_dentry(struct dentry *dentry);
42extern void smb_renew_times(struct dentry *dentry);
43/* cache.c */
44extern void smb_invalid_dir_cache(struct inode *dir);
45extern void smb_invalidate_dircache_entries(struct dentry *parent);
46extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
47extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
48/* sock.c */
49extern void smb_data_ready(struct sock *sk, int len);
50extern int smb_valid_socket(struct inode *inode);
51extern void smb_close_socket(struct smb_sb_info *server);
52extern int smb_recv_available(struct smb_sb_info *server);
53extern int smb_receive_header(struct smb_sb_info *server);
54extern int smb_receive_drop(struct smb_sb_info *server);
55extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
56extern int smb_send_request(struct smb_request *req);
57/* inode.c */
58extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
59extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
60extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
61extern void smb_invalidate_inodes(struct smb_sb_info *server);
62extern int smb_revalidate_inode(struct dentry *dentry);
63extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
64extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
65/* file.c */
66extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */
70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */
72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server);
74extern void smbiod_unregister_server(struct smb_sb_info *server);
75extern void smbiod_flush(struct smb_sb_info *server);
76extern int smbiod_retry(struct smb_sb_info *server);
77/* request.c */
78extern int smb_init_request_cache(void);
79extern void smb_destroy_request_cache(void);
80extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
81extern void smb_rput(struct smb_request *req);
82extern int smb_add_request(struct smb_request *req);
83extern int smb_request_send_server(struct smb_sb_info *server);
84extern int smb_request_recv(struct smb_sb_info *server);
85/* symlink.c */
86extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
87extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e862..000000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
1/*
2 * request.c
3 *
4 * Copyright (C) 2001 by Urban Widmark
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/slab.h>
13#include <linux/net.h>
14#include <linux/sched.h>
15
16#include <linux/smb_fs.h>
17#include <linux/smbno.h>
18#include <linux/smb_mount.h>
19
20#include "smb_debug.h"
21#include "request.h"
22#include "proto.h"
23
24/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */
25#define SMB_SLAB_DEBUG 0
26
27/* cache for request structures */
28static struct kmem_cache *req_cachep;
29
30static int smb_request_send_req(struct smb_request *req);
31
32/*
33 /proc/slabinfo:
34 name, active, num, objsize, active_slabs, num_slabs, #pages
35*/
36
37
38int smb_init_request_cache(void)
39{
40 req_cachep = kmem_cache_create("smb_request",
41 sizeof(struct smb_request), 0,
42 SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
43 NULL);
44 if (req_cachep == NULL)
45 return -ENOMEM;
46
47 return 0;
48}
49
50void smb_destroy_request_cache(void)
51{
52 kmem_cache_destroy(req_cachep);
53}
54
55/*
56 * Allocate and initialise a request structure
57 */
58static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
59 int bufsize)
60{
61 struct smb_request *req;
62 unsigned char *buf = NULL;
63
64 req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
65 VERBOSE("allocating request: %p\n", req);
66 if (!req)
67 goto out;
68
69 if (bufsize > 0) {
70 buf = kmalloc(bufsize, GFP_NOFS);
71 if (!buf) {
72 kmem_cache_free(req_cachep, req);
73 return NULL;
74 }
75 }
76
77 req->rq_buffer = buf;
78 req->rq_bufsize = bufsize;
79 req->rq_server = server;
80 init_waitqueue_head(&req->rq_wait);
81 INIT_LIST_HEAD(&req->rq_queue);
82 atomic_set(&req->rq_count, 1);
83
84out:
85 return req;
86}
87
88struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
89{
90 struct smb_request *req = NULL;
91
92 for (;;) {
93 atomic_inc(&server->nr_requests);
94 if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
95 req = smb_do_alloc_request(server, bufsize);
96 if (req != NULL)
97 break;
98 }
99
100#if 0
101 /*
102 * Try to free up at least one request in order to stay
103 * below the hard limit
104 */
105 if (nfs_try_to_free_pages(server))
106 continue;
107
108 if (fatal_signal_pending(current))
109 return ERR_PTR(-ERESTARTSYS);
110 current->policy = SCHED_YIELD;
111 schedule();
112#else
113 /* FIXME: we want something like nfs does above, but that
114 requires changes to all callers and can wait. */
115 break;
116#endif
117 }
118 return req;
119}
120
121static void smb_free_request(struct smb_request *req)
122{
123 atomic_dec(&req->rq_server->nr_requests);
124 if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
125 kfree(req->rq_buffer);
126 kfree(req->rq_trans2buffer);
127 kmem_cache_free(req_cachep, req);
128}
129
130/*
131 * What prevents an rget from racing with an rput? The count must never drop
132 * to zero while the request is in use. Only rput when it is ok to free it.
133 */
134static void smb_rget(struct smb_request *req)
135{
136 atomic_inc(&req->rq_count);
137}
138void smb_rput(struct smb_request *req)
139{
140 if (atomic_dec_and_test(&req->rq_count)) {
141 list_del_init(&req->rq_queue);
142 smb_free_request(req);
143 }
144}
145
146/* setup to receive the data part of the SMB */
147static int smb_setup_bcc(struct smb_request *req)
148{
149 int result = 0;
150 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
151
152 if (req->rq_rlen > req->rq_bufsize) {
153 PARANOIA("Packet too large %d > %d\n",
154 req->rq_rlen, req->rq_bufsize);
155 return -ENOBUFS;
156 }
157
158 req->rq_iov[0].iov_base = req->rq_buffer;
159 req->rq_iov[0].iov_len = req->rq_rlen;
160 req->rq_iovlen = 1;
161
162 return result;
163}
164
165/*
166 * Prepare a "normal" request structure.
167 */
168static int smb_setup_request(struct smb_request *req)
169{
170 int len = smb_len(req->rq_header) + 4;
171 req->rq_slen = len;
172
173 /* if we expect a data part in the reply we set the iov's to read it */
174 if (req->rq_resp_bcc)
175 req->rq_setup_read = smb_setup_bcc;
176
177 /* This tries to support re-using the same request */
178 req->rq_bytes_sent = 0;
179 req->rq_rcls = 0;
180 req->rq_err = 0;
181 req->rq_errno = 0;
182 req->rq_fragment = 0;
183 kfree(req->rq_trans2buffer);
184 req->rq_trans2buffer = NULL;
185
186 return 0;
187}
188
189/*
190 * Prepare a transaction2 request structure
191 */
192static int smb_setup_trans2request(struct smb_request *req)
193{
194 struct smb_sb_info *server = req->rq_server;
195 int mparam, mdata;
196 static unsigned char padding[4];
197
198 /* I know the following is very ugly, but I want to build the
199 smb packet as efficiently as possible. */
200
201 const int smb_parameters = 15;
202 const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
203 const int oparam = ALIGN(header + 3, sizeof(u32));
204 const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32));
205 const int bcc = (req->rq_data ? odata + req->rq_ldata :
206 oparam + req->rq_lparm) - header;
207
208 if ((bcc + oparam) > server->opt.max_xmit)
209 return -ENOMEM;
210 smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
211
212 /*
213 * max parameters + max data + max setup == bufsize to make NT4 happy
214 * and not abort the transfer or split into multiple responses. It also
215 * makes smbfs happy as handling packets larger than the buffer size
216 * is extra work.
217 *
218 * OS/2 is probably going to hate me for this ...
219 */
220 mparam = SMB_TRANS2_MAX_PARAM;
221 mdata = req->rq_bufsize - mparam;
222
223 mdata = server->opt.max_xmit - mparam - 100;
224 if (mdata < 1024) {
225 mdata = 1024;
226 mparam = 20;
227 }
228
229#if 0
230 /* NT/win2k has ~4k max_xmit, so with this we request more than it wants
231 to return as one SMB. Useful for testing the fragmented trans2
232 handling. */
233 mdata = 8192;
234#endif
235
236 WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
237 WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
238 WSET(req->rq_header, smb_mprcnt, mparam);
239 WSET(req->rq_header, smb_mdrcnt, mdata);
240 WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */
241 WSET(req->rq_header, smb_flags, 0);
242 DSET(req->rq_header, smb_timeout, 0);
243 WSET(req->rq_header, smb_pscnt, req->rq_lparm);
244 WSET(req->rq_header, smb_psoff, oparam - 4);
245 WSET(req->rq_header, smb_dscnt, req->rq_ldata);
246 WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
247 *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */
248 *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */
249 WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
250
251 req->rq_iovlen = 2;
252 req->rq_iov[0].iov_base = (void *) req->rq_header;
253 req->rq_iov[0].iov_len = oparam;
254 req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
255 req->rq_iov[1].iov_len = req->rq_lparm;
256 req->rq_slen = oparam + req->rq_lparm;
257
258 if (req->rq_data) {
259 req->rq_iovlen += 2;
260 req->rq_iov[2].iov_base = padding;
261 req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
262 req->rq_iov[3].iov_base = req->rq_data;
263 req->rq_iov[3].iov_len = req->rq_ldata;
264 req->rq_slen = odata + req->rq_ldata;
265 }
266
267 /* always a data part for trans2 replies */
268 req->rq_setup_read = smb_setup_bcc;
269
270 return 0;
271}
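
smb_setup_trans2request() lays the packet out as fixed header, 4-byte-aligned
parameter area at oparam, 4-byte-aligned data area at odata, with bcc covering
everything past the header. A worked example of the offset arithmetic,
assuming SMB_HEADER_LEN is 37 (the 4-byte NetBIOS length plus a 33-byte SMB
header) and the SETPATHINFO sizes seen earlier in this file:

#include <stdio.h>

#define ALIGN4(x) (((x) + 3) & ~3)

int main(void)
{
	const int smb_parameters = 15;
	const int hdr = 37 + 2 * smb_parameters + 2;   /* 69 */
	int lparm = 14, ldata = 26;      /* e.g. a small SETPATHINFO */

	int oparam = ALIGN4(hdr + 3);                  /* 72 */
	int odata  = ALIGN4(oparam + lparm);           /* 88 */
	int bcc    = odata + ldata - hdr;              /* 45 */

	printf("oparam=%d odata=%d bcc=%d\n", oparam, odata, bcc);
	return 0;
}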
272
273/*
274 * Add a request and tell smbiod to process it
275 */
276int smb_add_request(struct smb_request *req)
277{
278 long timeleft;
279 struct smb_sb_info *server = req->rq_server;
280 int result = 0;
281
282 smb_setup_request(req);
283 if (req->rq_trans2_command) {
284 if (req->rq_buffer == NULL) {
285 PARANOIA("trans2 attempted without response buffer!\n");
286 return -EIO;
287 }
288 result = smb_setup_trans2request(req);
289 }
290 if (result < 0)
291 return result;
292
293#ifdef SMB_DEBUG_PACKET_SIZE
294 add_xmit_stats(req);
295#endif
296
297 /* add 'req' to the queue of requests */
298 if (smb_lock_server_interruptible(server))
299 return -EINTR;
300
301 /*
302 * Try to send the request as the process. If that fails we queue the
303 * request and let smbiod send it later.
304 */
305
306 /* FIXME: each server has a number on the maximum number of parallel
307 requests. 10, 50 or so. We should not allow more requests to be
308 active. */
309 if (server->mid > 0xf000)
310 server->mid = 0;
311 req->rq_mid = server->mid++;
312 WSET(req->rq_header, smb_mid, req->rq_mid);
313
314 result = 0;
315 if (server->state == CONN_VALID) {
316 if (list_empty(&server->xmitq))
317 result = smb_request_send_req(req);
318 if (result < 0) {
319 /* Connection lost? */
320 server->conn_error = result;
321 server->state = CONN_INVALID;
322 }
323 }
324 if (result != 1)
325 list_add_tail(&req->rq_queue, &server->xmitq);
326 smb_rget(req);
327
328 if (server->state != CONN_VALID)
329 smbiod_retry(server);
330
331 smb_unlock_server(server);
332
333 smbiod_wake_up();
334
335 timeleft = wait_event_interruptible_timeout(req->rq_wait,
336 req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
337 if (!timeleft || signal_pending(current)) {
338 /*
339 * On timeout or on interrupt we want to try and remove the
340 * request from the recvq/xmitq.
341 * First check if the request is still part of a queue. (May
342 * have been removed by some error condition)
343 */
344 smb_lock_server(server);
345 if (!list_empty(&req->rq_queue)) {
346 list_del_init(&req->rq_queue);
347 smb_rput(req);
348 }
349 smb_unlock_server(server);
350 }
351
352 if (!timeleft) {
353 PARANOIA("request [%p, mid=%d] timed out!\n",
354 req, req->rq_mid);
355 VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com));
356 VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
357 VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg));
358 VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid));
359 VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid));
360 VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid));
361 VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid));
362 VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct));
363
364 req->rq_rcls = ERRSRV;
365 req->rq_err = ERRtimeout;
366
367 /* Just in case it was "stuck" */
368 smbiod_wake_up();
369 }
370 VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
371
372 if (req->rq_rcls != 0)
373 req->rq_errno = smb_errno(req);
374 if (signal_pending(current))
375 req->rq_errno = -ERESTARTSYS;
376 return req->rq_errno;
377}
378
379/*
380 * Send a request and place it on the recvq if successfully sent.
381 * Must be called with the server lock held.
382 */
383static int smb_request_send_req(struct smb_request *req)
384{
385 struct smb_sb_info *server = req->rq_server;
386 int result;
387
388 if (req->rq_bytes_sent == 0) {
389 WSET(req->rq_header, smb_tid, server->opt.tid);
390 WSET(req->rq_header, smb_pid, 1);
391 WSET(req->rq_header, smb_uid, server->opt.server_uid);
392 }
393
394 result = smb_send_request(req);
395 if (result < 0 && result != -EAGAIN)
396 goto out;
397
398 result = 0;
399 if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
400 goto out;
401
402 list_move_tail(&req->rq_queue, &server->recvq);
403 result = 1;
404out:
405 return result;
406}
407
408/*
409 * Sends one request for this server. (smbiod)
410 * Must be called with the server lock held.
411 * Returns: <0 on error
412 * 0 if no request could be completely sent
413 * 1 if all data for one request was sent
414 */
415int smb_request_send_server(struct smb_sb_info *server)
416{
417 struct list_head *head;
418 struct smb_request *req;
419 int result;
420
421 if (server->state != CONN_VALID)
422 return 0;
423
424 /* dequeue first request, if any */
425 req = NULL;
426 head = server->xmitq.next;
427 if (head != &server->xmitq) {
428 req = list_entry(head, struct smb_request, rq_queue);
429 }
430 if (!req)
431 return 0;
432
433 result = smb_request_send_req(req);
434 if (result < 0) {
435 server->conn_error = result;
436 list_move(&req->rq_queue, &server->xmitq);
437 result = -EIO;
438 goto out;
439 }
440
441out:
442 return result;
443}
444
445/*
446 * Try to find a request matching this "mid". Typically the first entry will
447 * be the matching one.
448 */
449static struct smb_request *find_request(struct smb_sb_info *server, int mid)
450{
451 struct list_head *tmp;
452 struct smb_request *req = NULL;
453
454 list_for_each(tmp, &server->recvq) {
455 req = list_entry(tmp, struct smb_request, rq_queue);
456 if (req->rq_mid == mid) {
457 break;
458 }
459 req = NULL;
460 }
461
462 if (!req) {
463 VERBOSE("received reply with mid %d but no request!\n",
464 WVAL(server->header, smb_mid));
465 server->rstate = SMB_RECV_DROP;
466 }
467
468 return req;
469}
470
471/*
472 * Called when we have read the smb header and believe this is a response.
473 */
474static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
475{
476 int hdrlen, wct;
477
478 memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
479
480 wct = *(req->rq_header + smb_wct);
481 if (wct > 20) {
482 PARANOIA("wct too large, %d > 20\n", wct);
483 server->rstate = SMB_RECV_DROP;
484 return 0;
485 }
486
487 req->rq_resp_wct = wct;
488 hdrlen = SMB_HEADER_LEN + wct*2 + 2;
489 VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct);
490
491 req->rq_bytes_recvd = SMB_HEADER_LEN;
492 req->rq_rlen = hdrlen;
493 req->rq_iov[0].iov_base = req->rq_header;
494 req->rq_iov[0].iov_len = hdrlen;
495 req->rq_iovlen = 1;
496 server->rstate = SMB_RECV_PARAM;
497
498#ifdef SMB_DEBUG_PACKET_SIZE
499 add_recv_stats(smb_len(server->header));
500#endif
501 return 0;
502}
503
504/*
505 * Reads the SMB parameters
506 */
507static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
508{
509 int result;
510
511 result = smb_receive(server, req);
512 if (result < 0)
513 return result;
514 if (req->rq_bytes_recvd < req->rq_rlen)
515 return 0;
516
517 VERBOSE("result: %d smb_bcc: %04x\n", result,
518 WVAL(req->rq_header, SMB_HEADER_LEN +
519 (*(req->rq_header + smb_wct) * 2)));
520
521 result = 0;
522 req->rq_iov[0].iov_base = NULL;
523 req->rq_rlen = 0;
524 if (req->rq_callback)
525 req->rq_callback(req);
526 else if (req->rq_setup_read)
527 result = req->rq_setup_read(req);
528 if (result < 0) {
529 server->rstate = SMB_RECV_DROP;
530 return result;
531 }
532
533 server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
534
535 req->rq_bytes_recvd = 0; // recvd out of the iov
536
537 VERBOSE("rlen: %d\n", req->rq_rlen);
538 if (req->rq_rlen < 0) {
539 PARANOIA("Parameters read beyond end of packet!\n");
540 server->rstate = SMB_RECV_END;
541 return -EIO;
542 }
543 return 0;
544}
545
546/*
547 * Reads the SMB data
548 */
549static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
550{
551 int result;
552
553 result = smb_receive(server, req);
554 if (result < 0)
555 goto out;
556 if (req->rq_bytes_recvd < req->rq_rlen)
557 goto out;
558 server->rstate = SMB_RECV_END;
559out:
560 VERBOSE("result: %d\n", result);
561 return result;
562}
563
564/*
565 * Receive a transaction2 response
566 * Return: 0 if the response has been fully read
567 * 1 if there are further "fragments" to read
568 * <0 if there is an error
569 */
570static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
571{
572 unsigned char *inbuf;
573 unsigned int parm_disp, parm_offset, parm_count, parm_tot;
574 unsigned int data_disp, data_offset, data_count, data_tot;
575 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
576
577 VERBOSE("handling trans2\n");
578
579 inbuf = req->rq_header;
580 data_tot = WVAL(inbuf, smb_tdrcnt);
581 parm_tot = WVAL(inbuf, smb_tprcnt);
582 parm_disp = WVAL(inbuf, smb_prdisp);
583 parm_offset = WVAL(inbuf, smb_proff);
584 parm_count = WVAL(inbuf, smb_prcnt);
585 data_disp = WVAL(inbuf, smb_drdisp);
586 data_offset = WVAL(inbuf, smb_droff);
587 data_count = WVAL(inbuf, smb_drcnt);
588
589 /* Modify offset for the split header/buffer we use */
590 if (data_count || data_offset) {
591 if (unlikely(data_offset < hdrlen))
592 goto out_bad_data;
593 else
594 data_offset -= hdrlen;
595 }
596 if (parm_count || parm_offset) {
597 if (unlikely(parm_offset < hdrlen))
598 goto out_bad_parm;
599 else
600 parm_offset -= hdrlen;
601 }
602
603 if (parm_count == parm_tot && data_count == data_tot) {
604 /*
605 * This packet has all the trans2 data.
606 *
607 * We setup the request so that this will be the common
608 * case. It may be a server error to not return a
609 * response that fits.
610 */
611 VERBOSE("single trans2 response "
612 "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
613 data_count, parm_count,
614 data_offset, parm_offset);
615 req->rq_ldata = data_count;
616 req->rq_lparm = parm_count;
617 req->rq_data = req->rq_buffer + data_offset;
618 req->rq_parm = req->rq_buffer + parm_offset;
619 if (unlikely(parm_offset + parm_count > req->rq_rlen))
620 goto out_bad_parm;
621 if (unlikely(data_offset + data_count > req->rq_rlen))
622 goto out_bad_data;
623 return 0;
624 }
625
626 VERBOSE("multi trans2 response "
627 "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
628 req->rq_fragment,
629 data_count, parm_count,
630 data_offset, parm_offset);
631
632 if (!req->rq_fragment) {
633 int buf_len;
634
635 /* We got the first trans2 fragment */
636 req->rq_fragment = 1;
637 req->rq_total_data = data_tot;
638 req->rq_total_parm = parm_tot;
639 req->rq_ldata = 0;
640 req->rq_lparm = 0;
641
642 buf_len = data_tot + parm_tot;
643 if (buf_len > SMB_MAX_PACKET_SIZE)
644 goto out_too_long;
645
646 req->rq_trans2bufsize = buf_len;
647 req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
648 if (!req->rq_trans2buffer)
649 goto out_no_mem;
650
651 req->rq_parm = req->rq_trans2buffer;
652 req->rq_data = req->rq_trans2buffer + parm_tot;
653 } else if (unlikely(req->rq_total_data < data_tot ||
654 req->rq_total_parm < parm_tot))
655 goto out_data_grew;
656
657 if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
658 parm_offset + parm_count > req->rq_rlen))
659 goto out_bad_parm;
660 if (unlikely(data_disp + data_count > req->rq_total_data ||
661 data_offset + data_count > req->rq_rlen))
662 goto out_bad_data;
663
664 inbuf = req->rq_buffer;
665 memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
666 memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
667
668 req->rq_ldata += data_count;
669 req->rq_lparm += parm_count;
670
671 /*
672 * Check whether we've received all of the data. Note that
673 * we use the packet totals -- total lengths might shrink!
674 */
675 if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
676 req->rq_ldata = data_tot;
677 req->rq_lparm = parm_tot;
678 return 0;
679 }
680 return 1;
681
682out_too_long:
683 printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
684 data_tot, parm_tot);
685 goto out_EIO;
686out_no_mem:
687 printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
688 req->rq_trans2bufsize);
689 req->rq_errno = -ENOMEM;
690 goto out;
691out_data_grew:
692 printk(KERN_ERR "smb_trans2: data/params grew!\n");
693 goto out_EIO;
694out_bad_parm:
695 printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
696 parm_disp, parm_count, parm_tot, parm_offset);
697 goto out_EIO;
698out_bad_data:
699 printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
700 data_disp, data_count, data_tot, data_offset);
701out_EIO:
702 req->rq_errno = -EIO;
703out:
704 return req->rq_errno;
705}
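
smb_recv_trans2() reassembles fragmented responses by allocating a buffer
sized from the advertised totals and copying each fragment in at its
displacement, bounds-checking every copy against both the total and the
received packet length. A stripped-down sketch of that pattern, with
illustrative names rather than the kernel code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct reasm {
	unsigned char *buf;
	unsigned int total;     /* advertised total length  */
	unsigned int received;  /* bytes accumulated so far */
};

/* Returns 0 when complete, 1 when more fragments are expected, -1 on error. */
static int add_fragment(struct reasm *r, const unsigned char *pkt,
			unsigned int pkt_len, unsigned int disp,
			unsigned int off, unsigned int count)
{
	if (disp + count > r->total || off + count > pkt_len)
		return -1;                   /* would overflow: -EIO case */
	memcpy(r->buf + disp, pkt + off, count);
	r->received += count;
	return r->received >= r->total ? 0 : 1;
}

int main(void)
{
	struct reasm r = { malloc(10), 10, 0 };
	int more;

	if (!r.buf)
		return 1;
	more = add_fragment(&r, (const unsigned char *)"hello", 5, 0, 0, 5);
	printf("more=%d\n", more);
	more = add_fragment(&r, (const unsigned char *)"world", 5, 5, 0, 5);
	printf("more=%d buf=%.10s\n", more, (char *)r.buf);
	free(r.buf);
	return 0;
}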
706
707/*
708 * State machine for receiving responses. We handle the fact that we can't
709 * read the full response in one try by having states telling us how much we
710 * have read.
711 *
712 * Must be called with the server lock held (only called from smbiod).
713 *
714 * Return: <0 on error
715 */
716int smb_request_recv(struct smb_sb_info *server)
717{
718 struct smb_request *req = NULL;
719 int result = 0;
720
721 if (smb_recv_available(server) <= 0)
722 return 0;
723
724 VERBOSE("state: %d\n", server->rstate);
725 switch (server->rstate) {
726 case SMB_RECV_DROP:
727 result = smb_receive_drop(server);
728 if (result < 0)
729 break;
730 if (server->rstate == SMB_RECV_DROP)
731 break;
732 server->rstate = SMB_RECV_START;
733 /* fallthrough */
734 case SMB_RECV_START:
735 server->smb_read = 0;
736 server->rstate = SMB_RECV_HEADER;
737 /* fallthrough */
738 case SMB_RECV_HEADER:
739 result = smb_receive_header(server);
740 if (result < 0)
741 break;
742 if (server->rstate == SMB_RECV_HEADER)
743 break;
744 if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
745 server->rstate = SMB_RECV_REQUEST;
746 break;
747 }
748 if (server->rstate != SMB_RECV_HCOMPLETE)
749 break;
750 /* fallthrough */
751 case SMB_RECV_HCOMPLETE:
752 req = find_request(server, WVAL(server->header, smb_mid));
753 if (!req)
754 break;
755 smb_init_request(server, req);
756 req->rq_rcls = *(req->rq_header + smb_rcls);
757 req->rq_err = WVAL(req->rq_header, smb_err);
758 if (server->rstate != SMB_RECV_PARAM)
759 break;
760 /* fallthrough */
761 case SMB_RECV_PARAM:
762 if (!req)
763 req = find_request(server,WVAL(server->header,smb_mid));
764 if (!req)
765 break;
766 result = smb_recv_param(server, req);
767 if (result < 0)
768 break;
769 if (server->rstate != SMB_RECV_DATA)
770 break;
771 /* fallthrough */
772 case SMB_RECV_DATA:
773 if (!req)
774 req = find_request(server,WVAL(server->header,smb_mid));
775 if (!req)
776 break;
777 result = smb_recv_data(server, req);
778 if (result < 0)
779 break;
780 break;
781
782 /* We should never be called with any of these states */
783 case SMB_RECV_END:
784 case SMB_RECV_REQUEST:
785 BUG();
786 }
787
788 if (result < 0) {
789 /* We saw an error */
790 return result;
791 }
792
793 if (server->rstate != SMB_RECV_END)
794 return 0;
795
796 result = 0;
797 if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
798 result = smb_recv_trans2(server, req);
799
800 /*
801 * Response completely read. Drop any extra bytes sent by the server.
802 * (Yes, servers sometimes add extra bytes to responses)
803 */
804 VERBOSE("smb_len: %d smb_read: %d\n",
805 server->smb_len, server->smb_read);
806 if (server->smb_read < server->smb_len)
807 smb_receive_drop(server);
808
809 server->rstate = SMB_RECV_START;
810
811 if (!result) {
812 list_del_init(&req->rq_queue);
813 req->rq_flags |= SMB_REQ_RECEIVED;
814 smb_rput(req);
815 wake_up_interruptible(&req->rq_wait);
816 }
817 return 0;
818}
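
The receiver above is a fall-through state machine: each case consumes
whatever has arrived and drops into the next state once enough bytes are in,
so a clean single-packet reply walks DROP/START -> HEADER -> HCOMPLETE ->
PARAM -> DATA -> END across one or more smbiod invocations. A minimal
illustration of the idiom:

#include <stdio.h>

enum rstate { RECV_START, RECV_HEADER, RECV_PARAM, RECV_DATA, RECV_END };

int main(void)
{
	enum rstate s = RECV_START;

	switch (s) {
	case RECV_START:
		puts("reset counters");
		/* fallthrough */
	case RECV_HEADER:
		puts("read smb header");
		/* fallthrough */
	case RECV_PARAM:
		puts("read parameter words");
		/* fallthrough */
	case RECV_DATA:
		puts("read data area");
		break;
	case RECV_END:
		break;
	}
	return 0;
}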
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c9..000000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
1#include <linux/list.h>
2#include <linux/types.h>
3#include <linux/uio.h>
4#include <linux/wait.h>
5
6struct smb_request {
7 struct list_head rq_queue; /* recvq or xmitq for the server */
8
9 atomic_t rq_count;
10
11 wait_queue_head_t rq_wait;
12 int rq_flags;
13 int rq_mid; /* multiplex ID, set by request.c */
14
15 struct smb_sb_info *rq_server;
16
17 /* header + word count + parameter words + byte count */
18 unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
19
20 int rq_bufsize;
21 unsigned char *rq_buffer;
22
23 /* FIXME: this is not good enough for merging IO requests. */
24 unsigned char *rq_page;
25 int rq_rsize;
26
27 int rq_resp_wct;
28 int rq_resp_bcc;
29
30 int rq_rlen;
31 int rq_bytes_recvd;
32
33 int rq_slen;
34 int rq_bytes_sent;
35
36 int rq_iovlen;
37 struct kvec rq_iov[4];
38
39 int (*rq_setup_read) (struct smb_request *);
40 void (*rq_callback) (struct smb_request *);
41
42 /* ------ trans2 stuff ------ */
43
44 u16 rq_trans2_command; /* 0 if not a trans2 request */
45 unsigned int rq_ldata;
46 unsigned char *rq_data;
47 unsigned int rq_lparm;
48 unsigned char *rq_parm;
49
50 int rq_fragment;
51 u32 rq_total_data;
52 u32 rq_total_parm;
53 int rq_trans2bufsize;
54 unsigned char *rq_trans2buffer;
55
56 /* ------ response ------ */
57
58 unsigned short rq_rcls;
59 unsigned short rq_err;
60 int rq_errno;
61};
62
63#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */
64#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */
65
66#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */
67#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */
68
69#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */
70#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */
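
The rq_iov/rq_iovlen pair above lets one request describe its header and payload as separate buffers that go out in a single send. The same gather-write idea in user space, using writev(); the 4-byte length framing mimics the NetBIOS header and the function name is invented:

#include <sys/uio.h>
#include <unistd.h>

/*
 * Gather-write: ship a 4-byte length header and the payload in one
 * writev() call, with no intermediate copy into a contiguous buffer.
 */
static ssize_t send_framed(int fd, const void *payload, size_t len)
{
	unsigned char hdr[4] = {
		0, (unsigned char)(len >> 16),
		(unsigned char)(len >> 8), (unsigned char)len
	};
	struct iovec iov[2] = {
		{ .iov_base = hdr,             .iov_len = sizeof(hdr) },
		{ .iov_base = (void *)payload, .iov_len = len },
	};

	return writev(fd, iov, 2);
}
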
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd755..000000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * Defines some debug macros for smbfs.
3 */
4
5/* This makes a dentry parent/child name pair. Useful for debugging printk's */
6#define DENTRY_PATH(dentry) \
7 (dentry)->d_parent->d_name.name,(dentry)->d_name.name
8
9/*
10 * Safety checks for conditions that should never happen.
11 * These are normally enabled.
12 */
13#ifdef SMBFS_PARANOIA
14# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
15#else
16# define PARANOIA(f, a...) do { ; } while(0)
17#endif
18
19/* lots of debug messages */
20#ifdef SMBFS_DEBUG_VERBOSE
21# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
22#else
23# define VERBOSE(f, a...) do { ; } while(0)
24#endif
25
26/*
27 * "normal" debug messages, but not with a normal DEBUG define ... way
28 * too common name.
29 */
30#ifdef SMBFS_DEBUG
31#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
32#else
33#define DEBUG1(f, a...) do { ; } while(0)
34#endif
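
All three macro families above compile to empty statements unless the matching SMBFS_* symbol is defined, so the debug call sites cost nothing in normal builds. A generic user-space version of the pattern (MYDEBUG and DPRINT are placeholder names; ## __VA_ARGS__ is the same GNU extension the kernel's "f, a..." form relies on):

#include <stdio.h>

/* Define MYDEBUG at build time (-DMYDEBUG) to enable the output. */
#ifdef MYDEBUG
# define DPRINT(fmt, ...) \
	fprintf(stderr, "%s: " fmt, __func__, ##__VA_ARGS__)
#else
# define DPRINT(fmt, ...) do { } while (0)
#endif

int main(void)
{
	DPRINT("starting, value=%d\n", 42);	/* vanishes without MYDEBUG */
	return 0;
}
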
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10a..000000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
1/*
2 * smbiod.c
3 *
4 * Copyright (C) 2000, Charles Loep / Corel Corp.
5 * Copyright (C) 2001, Urban Widmark
6 */
7
8
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/stat.h>
14#include <linux/errno.h>
15#include <linux/init.h>
16#include <linux/file.h>
17#include <linux/dcache.h>
18#include <linux/module.h>
19#include <linux/net.h>
20#include <linux/kthread.h>
21#include <net/ip.h>
22
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <asm/system.h>
28#include <asm/uaccess.h>
29
30#include "smb_debug.h"
31#include "request.h"
32#include "proto.h"
33
34enum smbiod_state {
35 SMBIOD_DEAD,
36 SMBIOD_STARTING,
37 SMBIOD_RUNNING,
38};
39
40static enum smbiod_state smbiod_state = SMBIOD_DEAD;
41static struct task_struct *smbiod_thread;
42static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
43static LIST_HEAD(smb_servers);
44static DEFINE_SPINLOCK(servers_lock);
45
46#define SMBIOD_DATA_READY (1<<0)
47static unsigned long smbiod_flags;
48
49static int smbiod(void *);
50static int smbiod_start(void);
51
52/*
53 * called when there's work for us to do
54 */
55void smbiod_wake_up(void)
56{
57 if (smbiod_state == SMBIOD_DEAD)
58 return;
59 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
60 wake_up_interruptible(&smbiod_wait);
61}
62
63/*
64 * start smbiod if none is running
65 */
66static int smbiod_start(void)
67{
68 struct task_struct *tsk;
69 int err = 0;
70
71 if (smbiod_state != SMBIOD_DEAD)
72 return 0;
73 smbiod_state = SMBIOD_STARTING;
74 __module_get(THIS_MODULE);
75 spin_unlock(&servers_lock);
76 tsk = kthread_run(smbiod, NULL, "smbiod");
77 if (IS_ERR(tsk)) {
78 err = PTR_ERR(tsk);
79 module_put(THIS_MODULE);
80 }
81
82 spin_lock(&servers_lock);
83 if (err < 0) {
84 smbiod_state = SMBIOD_DEAD;
85 smbiod_thread = NULL;
86 } else {
87 smbiod_state = SMBIOD_RUNNING;
88 smbiod_thread = tsk;
89 }
90 return err;
91}
92
93/*
94 * register a server & start smbiod if necessary
95 */
96int smbiod_register_server(struct smb_sb_info *server)
97{
98 int ret;
99 spin_lock(&servers_lock);
100 list_add(&server->entry, &smb_servers);
101 VERBOSE("%p\n", server);
102 ret = smbiod_start();
103 spin_unlock(&servers_lock);
104 return ret;
105}
106
107/*
108 * Unregister a server
109 * Must be called with the server lock held.
110 */
111void smbiod_unregister_server(struct smb_sb_info *server)
112{
113 spin_lock(&servers_lock);
114 list_del_init(&server->entry);
115 VERBOSE("%p\n", server);
116 spin_unlock(&servers_lock);
117
118 smbiod_wake_up();
119 smbiod_flush(server);
120}
121
122void smbiod_flush(struct smb_sb_info *server)
123{
124 struct list_head *tmp, *n;
125 struct smb_request *req;
126
127 list_for_each_safe(tmp, n, &server->xmitq) {
128 req = list_entry(tmp, struct smb_request, rq_queue);
129 req->rq_errno = -EIO;
130 list_del_init(&req->rq_queue);
131 smb_rput(req);
132 wake_up_interruptible(&req->rq_wait);
133 }
134 list_for_each_safe(tmp, n, &server->recvq) {
135 req = list_entry(tmp, struct smb_request, rq_queue);
136 req->rq_errno = -EIO;
137 list_del_init(&req->rq_queue);
138 smb_rput(req);
139 wake_up_interruptible(&req->rq_wait);
140 }
141}
142
143/*
144 * Wake up smbmount and make it reconnect to the server.
145 * This must be called with the server locked.
146 *
147 * FIXME: add smbconnect version to this
148 */
149int smbiod_retry(struct smb_sb_info *server)
150{
151 struct list_head *head;
152 struct smb_request *req;
153 struct pid *pid = get_pid(server->conn_pid);
154 int result = 0;
155
156 VERBOSE("state: %d\n", server->state);
157 if (server->state == CONN_VALID || server->state == CONN_RETRYING)
158 goto out;
159
160 smb_invalidate_inodes(server);
161
162 /*
163 * Some requests are meaningless after a retry, so we abort them.
164 * One example is all requests using 'fileid', since the files are
165 * closed on retry.
166 */
167 head = server->xmitq.next;
168 while (head != &server->xmitq) {
169 req = list_entry(head, struct smb_request, rq_queue);
170 head = head->next;
171
172 req->rq_bytes_sent = 0;
173 if (req->rq_flags & SMB_REQ_NORETRY) {
174 VERBOSE("aborting request %p on xmitq\n", req);
175 req->rq_errno = -EIO;
176 list_del_init(&req->rq_queue);
177 smb_rput(req);
178 wake_up_interruptible(&req->rq_wait);
179 }
180 }
181
182 /*
183 * FIXME: test the code for retrying request we already sent
184 */
185 head = server->recvq.next;
186 while (head != &server->recvq) {
187 req = list_entry(head, struct smb_request, rq_queue);
188 head = head->next;
189#if 0
190 if (req->rq_flags & SMB_REQ_RETRY) {
191 /* must move the request to the xmitq */
192 VERBOSE("retrying request %p on recvq\n", req);
193 list_move(&req->rq_queue, &server->xmitq);
194 continue;
195 }
196#endif
197
198 VERBOSE("aborting request %p on recvq\n", req);
199 /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
200 req->rq_errno = -EIO;
201 list_del_init(&req->rq_queue);
202 smb_rput(req);
203 wake_up_interruptible(&req->rq_wait);
204 }
205
206 smb_close_socket(server);
207
208 if (!pid) {
209 /* FIXME: this is fatal, umount? */
210 printk(KERN_ERR "smb_retry: no connection process\n");
211 server->state = CONN_RETRIED;
212 goto out;
213 }
214
215 /*
216 * Change state so that only one retry per server will be started.
217 */
218 server->state = CONN_RETRYING;
219
220 /*
221 * Note: use the "priv" flag, as a user process may need to reconnect.
222 */
223 result = kill_pid(pid, SIGUSR1, 1);
224 if (result) {
225 /* FIXME: this is most likely fatal, umount? */
226 printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
227 goto out;
228 }
229 VERBOSE("signalled pid %d\n", pid_nr(pid));
230
231 /* FIXME: The retried requests should perhaps get a "time boost". */
232
233out:
234 put_pid(pid);
235 return result;
236}
237
238/*
239 * Currently handles lockingX packets.
240 */
241static void smbiod_handle_request(struct smb_sb_info *server)
242{
243 PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
244 server->rstate = SMB_RECV_DROP;
245}
246
247/*
248 * Do some IO for one server.
249 */
250static void smbiod_doio(struct smb_sb_info *server)
251{
252 int result;
253 int maxwork = 7;
254
255 if (server->state != CONN_VALID)
256 goto out;
257
258 do {
259 result = smb_request_recv(server);
260 if (result < 0) {
261 server->state = CONN_INVALID;
262 smbiod_retry(server);
263 goto out; /* reconnecting is slow */
264 } else if (server->rstate == SMB_RECV_REQUEST)
265 smbiod_handle_request(server);
266 } while (result > 0 && maxwork-- > 0);
267
268 /*
269 * If there is more to read, we want to be sure to wake up again.
270 */
271 if (server->state != CONN_VALID)
272 goto out;
273 if (smb_recv_available(server) > 0)
274 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
275
276 do {
277 result = smb_request_send_server(server);
278 if (result < 0) {
279 server->state = CONN_INVALID;
280 smbiod_retry(server);
281 goto out; /* reconnecting is slow */
282 }
283 } while (result > 0);
284
285 /*
286 * If the last request was not sent out, we want to wake up again.
287 */
288 if (!list_empty(&server->xmitq))
289 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
290
291out:
292 return;
293}
294
295/*
296 * smbiod kernel thread
297 */
298static int smbiod(void *unused)
299{
300 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
301
302 for (;;) {
303 struct smb_sb_info *server;
304 struct list_head *pos, *n;
305
306 /* FIXME: Use poll? */
307 wait_event_interruptible(smbiod_wait,
308 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
309 if (signal_pending(current)) {
310 spin_lock(&servers_lock);
311 smbiod_state = SMBIOD_DEAD;
312 spin_unlock(&servers_lock);
313 break;
314 }
315
316 clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
317
318 spin_lock(&servers_lock);
319 if (list_empty(&smb_servers)) {
320 smbiod_state = SMBIOD_DEAD;
321 spin_unlock(&servers_lock);
322 break;
323 }
324
325 list_for_each_safe(pos, n, &smb_servers) {
326 server = list_entry(pos, struct smb_sb_info, entry);
327 VERBOSE("checking server %p\n", server);
328
329 if (server->state == CONN_VALID) {
330 spin_unlock(&servers_lock);
331
332 smb_lock_server(server);
333 smbiod_doio(server);
334 smb_unlock_server(server);
335
336 spin_lock(&servers_lock);
337 }
338 }
339 spin_unlock(&servers_lock);
340 }
341
342 VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
343 module_put_and_exit(0);
344}
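
smbiod above sleeps until SMBIOD_DATA_READY is set, clears the bit, and then makes one pass over every registered server, so any number of wakeups arriving while it is busy collapse into a single extra pass. A hedged user-space analogue of that flag-and-wait shape, using a mutex and condition variable:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool data_ready;

/* Called from producers: cheap, idempotent wakeup. */
void wake_worker(void)
{
	pthread_mutex_lock(&lock);
	data_ready = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

/* Worker loop: sleep until flagged, clear the flag, do one pass. */
void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!data_ready)
			pthread_cond_wait(&cond, &lock);
		data_ready = false;	/* later wakeups set it again */
		pthread_mutex_unlock(&lock);
		/* ... service all servers here ... */
	}
	return NULL;
}
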
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd0..000000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
1/*
2 * sock.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/errno.h>
13#include <linux/socket.h>
14#include <linux/fcntl.h>
15#include <linux/file.h>
16#include <linux/in.h>
17#include <linux/net.h>
18#include <linux/mm.h>
19#include <linux/netdevice.h>
20#include <linux/workqueue.h>
21#include <net/scm.h>
22#include <net/tcp_states.h>
23#include <net/ip.h>
24
25#include <linux/smb_fs.h>
26#include <linux/smb.h>
27#include <linux/smbno.h>
28
29#include <asm/uaccess.h>
30#include <asm/ioctls.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
37static int
38_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
39{
40 struct kvec iov = {ubuf, size};
41 struct msghdr msg = {.msg_flags = flags};
42 msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
43 return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
44}
45
46/*
47 * Return the server this socket belongs to
48 */
49static struct smb_sb_info *
50server_from_socket(struct socket *socket)
51{
52 return socket->sk->sk_user_data;
53}
54
55/*
56 * Called when there is data on the socket.
57 */
58void
59smb_data_ready(struct sock *sk, int len)
60{
61 struct smb_sb_info *server = server_from_socket(sk->sk_socket);
62 void (*data_ready)(struct sock *, int) = server->data_ready;
63
64 data_ready(sk, len);
65 VERBOSE("(%p, %d)\n", sk, len);
66 smbiod_wake_up();
67}
68
69int
70smb_valid_socket(struct inode * inode)
71{
72 return (inode && S_ISSOCK(inode->i_mode) &&
73 SOCKET_I(inode)->type == SOCK_STREAM);
74}
75
76static struct socket *
77server_sock(struct smb_sb_info *server)
78{
79 struct file *file;
80
81 if (server && (file = server->sock_file))
82 {
83#ifdef SMBFS_PARANOIA
84 if (!smb_valid_socket(file->f_path.dentry->d_inode))
85 PARANOIA("bad socket!\n");
86#endif
87 return SOCKET_I(file->f_path.dentry->d_inode);
88 }
89 return NULL;
90}
91
92void
93smb_close_socket(struct smb_sb_info *server)
94{
95 struct file * file = server->sock_file;
96
97 if (file) {
98 struct socket *sock = server_sock(server);
99
100 VERBOSE("closing socket %p\n", sock);
101 sock->sk->sk_data_ready = server->data_ready;
102 server->sock_file = NULL;
103 fput(file);
104 }
105}
106
107static int
108smb_get_length(struct socket *socket, unsigned char *header)
109{
110 int result;
111
112 result = _recvfrom(socket, header, 4, MSG_PEEK);
113 if (result == -EAGAIN)
114 return -ENODATA;
115 if (result < 0) {
116 PARANOIA("recv error = %d\n", -result);
117 return result;
118 }
119 if (result < 4)
120 return -ENODATA;
121
122 switch (header[0]) {
123 case 0x00:
124 case 0x82:
125 break;
126
127 case 0x85:
128 DEBUG1("Got SESSION KEEP ALIVE\n");
129 _recvfrom(socket, header, 4, 0); /* read away */
130 return -ENODATA;
131
132 default:
133 PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
134 return -EIO;
135 }
136
137 /* The length in the RFC NB header is the raw data length */
138 return smb_len(header);
139}
140
141int
142smb_recv_available(struct smb_sb_info *server)
143{
144 mm_segment_t oldfs;
145 int avail, err;
146 struct socket *sock = server_sock(server);
147
148 oldfs = get_fs();
149 set_fs(get_ds());
150 err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
151 set_fs(oldfs);
152 return (err >= 0) ? avail : err;
153}
154
155/*
156 * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
157 */
158static int
159smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
160{
161 struct kvec *iv = *data;
162 int i;
163 int len;
164
165 /*
166 * Eat any sent kvecs
167 */
168 while (iv->iov_len <= amount) {
169 amount -= iv->iov_len;
170 iv++;
171 (*num)--;
172 }
173
174 /*
175 * And chew down the partial one
176 */
177 vec[0].iov_len = iv->iov_len-amount;
178 vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
179 iv++;
180
181 len = vec[0].iov_len;
182
183 /*
184 * And copy any others
185 */
186 for (i = 1; i < *num; i++) {
187 vec[i] = *iv++;
188 len += vec[i].iov_len;
189 }
190
191 *data = vec;
192 return len;
193}
194
195/*
196 * smb_receive_header
197 * Only called by the smbiod thread.
198 */
199int
200smb_receive_header(struct smb_sb_info *server)
201{
202 struct socket *sock;
203 int result = 0;
204 unsigned char peek_buf[4];
205
206 result = -EIO;
207 sock = server_sock(server);
208 if (!sock)
209 goto out;
210 if (sock->sk->sk_state != TCP_ESTABLISHED)
211 goto out;
212
213 if (!server->smb_read) {
214 result = smb_get_length(sock, peek_buf);
215 if (result < 0) {
216 if (result == -ENODATA)
217 result = 0;
218 goto out;
219 }
220 server->smb_len = result + 4;
221
222 if (server->smb_len < SMB_HEADER_LEN) {
223 PARANOIA("short packet: %d\n", result);
224 server->rstate = SMB_RECV_DROP;
225 result = -EIO;
226 goto out;
227 }
228 if (server->smb_len > SMB_MAX_PACKET_SIZE) {
229 PARANOIA("long packet: %d\n", result);
230 server->rstate = SMB_RECV_DROP;
231 result = -EIO;
232 goto out;
233 }
234 }
235
236 result = _recvfrom(sock, server->header + server->smb_read,
237 SMB_HEADER_LEN - server->smb_read, 0);
238 VERBOSE("_recvfrom: %d\n", result);
239 if (result < 0) {
240 VERBOSE("receive error: %d\n", result);
241 goto out;
242 }
243 server->smb_read += result;
244
245 if (server->smb_read == SMB_HEADER_LEN)
246 server->rstate = SMB_RECV_HCOMPLETE;
247out:
248 return result;
249}
250
251static char drop_buffer[PAGE_SIZE];
252
253/*
254 * smb_receive_drop - read and throw away the data
255 * Only called by the smbiod thread.
256 *
257 * FIXME: we are in the kernel, could we just tell the socket that we want
258 * to drop stuff from the buffer?
259 */
260int
261smb_receive_drop(struct smb_sb_info *server)
262{
263 struct socket *sock;
264 unsigned int flags;
265 struct kvec iov;
266 struct msghdr msg;
267 int rlen = smb_len(server->header) - server->smb_read + 4;
268 int result = -EIO;
269
270 if (rlen > PAGE_SIZE)
271 rlen = PAGE_SIZE;
272
273 sock = server_sock(server);
274 if (!sock)
275 goto out;
276 if (sock->sk->sk_state != TCP_ESTABLISHED)
277 goto out;
278
279 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
280 iov.iov_base = drop_buffer;
281 iov.iov_len = PAGE_SIZE;
282 msg.msg_flags = flags;
283 msg.msg_name = NULL;
284 msg.msg_namelen = 0;
285 msg.msg_control = NULL;
286
287 result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
288
289 VERBOSE("read: %d\n", result);
290 if (result < 0) {
291 VERBOSE("receive error: %d\n", result);
292 goto out;
293 }
294 server->smb_read += result;
295
296 if (server->smb_read >= server->smb_len)
297 server->rstate = SMB_RECV_END;
298
299out:
300 return result;
301}
302
303/*
304 * smb_receive
305 * Only called by the smbiod thread.
306 */
307int
308smb_receive(struct smb_sb_info *server, struct smb_request *req)
309{
310 struct socket *sock;
311 unsigned int flags;
312 struct kvec iov[4];
313 struct kvec *p = req->rq_iov;
314 size_t num = req->rq_iovlen;
315 struct msghdr msg;
316 int rlen;
317 int result = -EIO;
318
319 sock = server_sock(server);
320 if (!sock)
321 goto out;
322 if (sock->sk->sk_state != TCP_ESTABLISHED)
323 goto out;
324
325 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
326 msg.msg_flags = flags;
327 msg.msg_name = NULL;
328 msg.msg_namelen = 0;
329 msg.msg_control = NULL;
330
331 /* Don't repeat bytes and count available buffer space */
332 rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
333 (req->rq_rlen - req->rq_bytes_recvd));
334
335 result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
336
337 VERBOSE("read: %d\n", result);
338 if (result < 0) {
339 VERBOSE("receive error: %d\n", result);
340 goto out;
341 }
342 req->rq_bytes_recvd += result;
343 server->smb_read += result;
344
345out:
346 return result;
347}
348
349/*
350 * Try to send an SMB request. This may return after sending only part of the
351 * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
352 *
353 * Parts of this were taken from xprt_sendmsg in net/sunrpc/xprt.c
354 */
355int
356smb_send_request(struct smb_request *req)
357{
358 struct smb_sb_info *server = req->rq_server;
359 struct socket *sock;
360 struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
361 int slen = req->rq_slen - req->rq_bytes_sent;
362 int result = -EIO;
363 struct kvec iov[4];
364 struct kvec *p = req->rq_iov;
365 size_t num = req->rq_iovlen;
366
367 sock = server_sock(server);
368 if (!sock)
369 goto out;
370 if (sock->sk->sk_state != TCP_ESTABLISHED)
371 goto out;
372
373 /* Don't repeat bytes */
374 if (req->rq_bytes_sent)
375 smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
376
377 result = kernel_sendmsg(sock, &msg, p, num, slen);
378
379 if (result >= 0) {
380 req->rq_bytes_sent += result;
381 if (req->rq_bytes_sent >= req->rq_slen)
382 req->rq_flags |= SMB_REQ_TRANSMITTED;
383 }
384out:
385 return result;
386}
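
smb_move_iov() above is what lets smb_send_request() and smb_receive() resume after a short sendmsg()/recvmsg(): it rebuilds the vector to skip the bytes already transferred. A user-space sketch of the same advance; unlike the kernel helper, which copies into a scratch array to keep the request's kvecs intact, this version rewrites the vector in place and assumes done is less than the vector's total length:

#include <sys/uio.h>
#include <stddef.h>

/* Skip 'done' bytes of iov[0..*niov), rewriting in place. */
static struct iovec *iov_advance(struct iovec *iov, int *niov, size_t done)
{
	while (done >= iov->iov_len) {	/* drop fully-sent buffers */
		done -= iov->iov_len;
		iov++;
		(*niov)--;
	}
	/* trim the partially-sent one */
	iov->iov_base = (char *)iov->iov_base + done;
	iov->iov_len -= done;
	return iov;
}
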
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd469..000000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * symlink.c
3 *
4 * Copyright (C) 2002 by John Newbigin
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/fcntl.h>
12#include <linux/stat.h>
13#include <linux/mm.h>
14#include <linux/slab.h>
15#include <linux/pagemap.h>
16#include <linux/net.h>
17#include <linux/namei.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21
22#include <linux/smbno.h>
23#include <linux/smb_fs.h>
24
25#include "smb_debug.h"
26#include "proto.h"
27
28int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
29{
30 DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
31
32 return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
33}
34
35static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
36{
37 char *link = __getname();
38 DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
39
40 if (!link) {
41 link = ERR_PTR(-ENOMEM);
42 } else {
43 int len = smb_proc_read_link(server_from_dentry(dentry),
44 dentry, link, PATH_MAX - 1);
45 if (len < 0) {
46 __putname(link);
47 link = ERR_PTR(len);
48 } else {
49 link[len] = 0;
50 }
51 }
52 nd_set_link(nd, link);
53 return NULL;
54}
55
56static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
57{
58 char *s = nd_get_link(nd);
59 if (!IS_ERR(s))
60 __putname(s);
61}
62
63const struct inode_operations smb_link_inode_operations =
64{
65 .readlink = generic_readlink,
66 .follow_link = smb_follow_link,
67 .put_link = smb_put_link,
68};
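
smb_follow_link() above returns either a buffer or an error through the same pointer, using the kernel's ERR_PTR()/IS_ERR() encoding. A minimal user-space imitation of that convention; it relies, as the kernel does, on the top 4095 values of the address space never being valid pointers:

#include <errno.h>
#include <stdint.h>
#include <string.h>

#define MAX_ERRNO 4095

/* Error codes ride inside the pointer value itself. */
static inline void *err_ptr(long err) { return (void *)err; }
static inline long ptr_err(const void *p) { return (long)p; }
static inline int is_err(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

/* One return channel for both a buffer and an errno. */
static char *dup_target(const char *target)
{
	char *buf = target ? strdup(target) : NULL;

	if (!buf)
		return err_ptr(-ENOMEM);
	return buf;
}

Callers test the result with is_err() and recover the code with ptr_err(), which is exactly how follow_link hands the buffer to put_link above.
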
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f0..50a5d978da16 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
682{ 682{
683 struct file *file = sd->u.file; 683 struct file *file = sd->u.file;
684 loff_t pos = sd->pos; 684 loff_t pos = sd->pos;
685 int ret, more; 685 int more;
686
687 ret = buf->ops->confirm(pipe, buf);
688 if (!ret) {
689 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
690 if (file->f_op && file->f_op->sendpage)
691 ret = file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
693 else
694 ret = -EINVAL;
695 }
696 686
697 return ret; 687 if (!likely(file->f_op && file->f_op->sendpage))
688 return -EINVAL;
689
690 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
691 return file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
698} 693}
699 694
700/* 695/*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
727 void *fsdata; 722 void *fsdata;
728 int ret; 723 int ret;
729 724
730 /*
731 * make sure the data in this buffer is uptodate
732 */
733 ret = buf->ops->confirm(pipe, buf);
734 if (unlikely(ret))
735 return ret;
736
737 offset = sd->pos & ~PAGE_CACHE_MASK; 725 offset = sd->pos & ~PAGE_CACHE_MASK;
738 726
739 this_len = sd->len; 727 this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
805 if (sd->len > sd->total_len) 793 if (sd->len > sd->total_len)
806 sd->len = sd->total_len; 794 sd->len = sd->total_len;
807 795
808 ret = actor(pipe, buf, sd); 796 ret = buf->ops->confirm(pipe, buf);
809 if (ret <= 0) { 797 if (unlikely(ret)) {
810 if (ret == -ENODATA) 798 if (ret == -ENODATA)
811 ret = 0; 799 ret = 0;
812 return ret; 800 return ret;
813 } 801 }
802
803 ret = actor(pipe, buf, sd);
804 if (ret <= 0)
805 return ret;
806
814 buf->offset += ret; 807 buf->offset += ret;
815 buf->len -= ret; 808 buf->len -= ret;
816 809
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1044 int ret; 1037 int ret;
1045 void *data; 1038 void *data;
1046 1039
1047 ret = buf->ops->confirm(pipe, buf);
1048 if (ret)
1049 return ret;
1050
1051 data = buf->ops->map(pipe, buf, 0); 1040 data = buf->ops->map(pipe, buf, 0);
1052 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); 1041 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1053 buf->ops->unmap(pipe, buf, data); 1042 buf->ops->unmap(pipe, buf, data);
@@ -1311,18 +1300,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1300static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1312 struct pipe_inode_info *opipe, 1301 struct pipe_inode_info *opipe,
1313 size_t len, unsigned int flags); 1302 size_t len, unsigned int flags);
1314/*
1315 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1316 * location, so checking ->i_pipe is not enough to verify that this is a
1317 * pipe.
1318 */
1319static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1320{
1321 if (S_ISFIFO(inode->i_mode))
1322 return inode->i_pipe;
1323
1324 return NULL;
1325}
1326 1303
1327/* 1304/*
1328 * Determine where to splice to/from. 1305 * Determine where to splice to/from.
@@ -1336,8 +1313,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1336 loff_t offset, *off; 1313 loff_t offset, *off;
1337 long ret; 1314 long ret;
1338 1315
1339 ipipe = pipe_info(in->f_path.dentry->d_inode); 1316 ipipe = get_pipe_info(in);
1340 opipe = pipe_info(out->f_path.dentry->d_inode); 1317 opipe = get_pipe_info(out);
1341 1318
1342 if (ipipe && opipe) { 1319 if (ipipe && opipe) {
1343 if (off_in || off_out) 1320 if (off_in || off_out)
@@ -1507,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1507 char *src; 1484 char *src;
1508 int ret; 1485 int ret;
1509 1486
1510 ret = buf->ops->confirm(pipe, buf);
1511 if (unlikely(ret))
1512 return ret;
1513
1514 /* 1487 /*
1515 * See if we can use the atomic maps, by prefaulting in the 1488 * See if we can use the atomic maps, by prefaulting in the
1516 * pages and doing an atomic copy 1489 * pages and doing an atomic copy
@@ -1555,7 +1528,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1555 int error; 1528 int error;
1556 long ret; 1529 long ret;
1557 1530
1558 pipe = pipe_info(file->f_path.dentry->d_inode); 1531 pipe = get_pipe_info(file);
1559 if (!pipe) 1532 if (!pipe)
1560 return -EBADF; 1533 return -EBADF;
1561 1534
@@ -1642,7 +1615,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1642 }; 1615 };
1643 long ret; 1616 long ret;
1644 1617
1645 pipe = pipe_info(file->f_path.dentry->d_inode); 1618 pipe = get_pipe_info(file);
1646 if (!pipe) 1619 if (!pipe)
1647 return -EBADF; 1620 return -EBADF;
1648 1621
@@ -2022,8 +1995,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
2022static long do_tee(struct file *in, struct file *out, size_t len, 1995static long do_tee(struct file *in, struct file *out, size_t len,
2023 unsigned int flags) 1996 unsigned int flags)
2024{ 1997{
2025 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1998 struct pipe_inode_info *ipipe = get_pipe_info(in);
2026 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1999 struct pipe_inode_info *opipe = get_pipe_info(out);
2027 int ret = -EINVAL; 2000 int ret = -EINVAL;
2028 2001
2029 /* 2002 /*
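
The splice.c hunks above all make the same move: the buf->ops->confirm() validity check is deleted from each actor (pipe_to_sendpage, pipe_to_file, write_pipe_buf, pipe_to_user) and performed once in splice_from_pipe_feed(), where -ENODATA is also translated to 0. A generic sketch of hoisting a shared precondition out of callbacks into the dispatcher (all names invented):

#include <errno.h>

struct buf;
typedef int (*actor_fn)(struct buf *b, void *ctx);

/*
 * The dispatcher validates once, then hands the buffer to the actor;
 * actors no longer need (and can no longer forget) their own check.
 */
static int feed_one(struct buf *b, int (*confirm)(struct buf *),
		    actor_fn actor, void *ctx)
{
	int ret = confirm(b);

	if (ret)
		return ret == -ENODATA ? 0 : ret;
	return actor(b, ctx);
}
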
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
29config SQUASHFS_XATTR 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support" 30 bool "Squashfs XATTR support"
31 depends on SQUASHFS 31 depends on SQUASHFS
32 default n
33 help 32 help
34 Saying Y here includes support for extended attributes (xattrs). 33 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by 34 Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
40config SQUASHFS_LZO 39config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems" 40 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS 41 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS 42 select LZO_DECOMPRESS
45 help 43 help
46 Saying Y here includes support for reading Squashfs file systems 44 Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
53 51
54 If unsure, say N. 52 If unsure, say N.
55 53
54config SQUASHFS_XZ
55 bool "Include support for XZ compressed file systems"
56 depends on SQUASHFS
57 select XZ_DEC
58 help
59 Saying Y here includes support for reading Squashfs file systems
60 compressed with XZ compression. XZ gives better compression than
61 the default zlib compression, at the expense of greater CPU and
62 memory overhead.
63
64 XZ is not the standard compression used in Squashfs and so most
65 file systems will be readable without selecting this option.
66
67 If unsure, say N.
68
56config SQUASHFS_EMBEDDED 69config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems" 70 bool "Additional option for memory-constrained systems"
58 depends on SQUASHFS 71 depends on SQUASHFS
59 default n
60 help 72 help
61 Saying Y here allows you to specify cache size. 73 Saying Y here allows you to specify cache size.
62 74
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o 8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o 9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
10squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..8ab48bc2fa7d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
34 34
35#include "squashfs_fs.h" 35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h" 37#include "squashfs.h"
39#include "decompressor.h" 38#include "decompressor.h"
40 39
@@ -64,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
64 *length = (unsigned char) bh->b_data[*offset] | 63 *length = (unsigned char) bh->b_data[*offset] |
65 (unsigned char) bh->b_data[*offset + 1] << 8; 64 (unsigned char) bh->b_data[*offset + 1] << 8;
66 *offset += 2; 65 *offset += 2;
66
67 if (*offset == msblk->devblksize) {
68 put_bh(bh);
69 bh = sb_bread(sb, ++(*cur_index));
70 if (bh == NULL)
71 return NULL;
72 *offset = 0;
73 }
67 } 74 }
68 75
69 return bh; 76 return bh;
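
The block.c addition above handles the case where the two little-endian length bytes end exactly at a device block boundary: once *offset reaches devblksize, the reader must release the current buffer and continue in the next block. A self-contained sketch of reading a possibly-straddling little-endian u16 (block I/O is stubbed with an in-memory array):

#include <stdint.h>

#define BLK_SIZE 4096

/* Fetch one byte; stand-in for sb_bread() plus b_data access. */
static uint8_t get_byte(const uint8_t blocks[][BLK_SIZE], int idx, int off)
{
	return blocks[idx][off];
}

/*
 * Read a little-endian u16 at (*idx, *off), stepping into the next
 * block whenever the offset runs off the end of the current one.
 */
static uint16_t read_le16(const uint8_t blocks[][BLK_SIZE],
			  int *idx, int *off)
{
	uint16_t v = get_byte(blocks, *idx, *off);

	if (++*off == BLK_SIZE) {	/* crossed into the next block */
		(*idx)++;
		*off = 0;
	}
	v |= (uint16_t)get_byte(blocks, *idx, *off) << 8;
	if (++*off == BLK_SIZE) {
		(*idx)++;
		*off = 0;
	}
	return v;
}
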
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
55 55
56#include "squashfs_fs.h" 56#include "squashfs_fs.h"
57#include "squashfs_fs_sb.h" 57#include "squashfs_fs_sb.h"
58#include "squashfs_fs_i.h"
59#include "squashfs.h" 58#include "squashfs.h"
60 59
61/* 60/*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
27 27
28#include "squashfs_fs.h" 28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h" 29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h" 30#include "decompressor.h"
32#include "squashfs.h" 31#include "squashfs.h"
33 32
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
41}; 40};
42 41
43#ifndef CONFIG_SQUASHFS_LZO 42#ifndef CONFIG_SQUASHFS_LZO
44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 43static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
46}; 45};
47#endif 46#endif
48 47
48#ifndef CONFIG_SQUASHFS_XZ
49static const struct squashfs_decompressor squashfs_xz_comp_ops = {
50 NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
51};
52#endif
53
49static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 54static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
50 NULL, NULL, NULL, 0, "unknown", 0 55 NULL, NULL, NULL, 0, "unknown", 0
51}; 56};
52 57
53static const struct squashfs_decompressor *decompressor[] = { 58static const struct squashfs_decompressor *decompressor[] = {
54 &squashfs_zlib_comp_ops, 59 &squashfs_zlib_comp_ops,
55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops, 60 &squashfs_lzo_comp_ops,
58#else 61 &squashfs_xz_comp_ops,
59 &squashfs_lzo_unsupported_comp_ops, 62 &squashfs_lzma_unsupported_comp_ops,
60#endif
61 &squashfs_unknown_comp_ops 63 &squashfs_unknown_comp_ops
62}; 64};
63 65
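
After this change each compressor exports one ops symbol regardless of the config: when support is compiled out, a local stub with NULL handlers takes the same name, so the decompressor[] table needs no #ifdefs and "not supported" is just an entry whose handlers are missing (the real table also carries an explicit supported flag). A condensed sketch of the pattern:

#include <stddef.h>
#include <stdio.h>

struct decompressor {
	int id;
	const char *name;
	int (*decompress)(const void *src, void *dst, size_t n);
};

/* Stub entry: same shape, no handler; in the real code the stub is
 * only compiled when support is configured out. */
static const struct decompressor lzo_ops = { 3, "lzo", NULL };
static const struct decompressor unknown_ops = { 0, "unknown", NULL };

static const struct decompressor *table[] = { &lzo_ops, &unknown_ops };

static const struct decompressor *lookup(int id)
{
	int i;

	for (i = 0; table[i] != &unknown_ops; i++)
		if (table[i]->id == id)
			break;
	return table[i];
}

int main(void)
{
	const struct decompressor *d = lookup(3);

	printf("%s supported: %s\n", d->name,
	       d->decompress ? "yes" : "no");
	return 0;
}
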
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, 52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages); 53 length, srclength, pages);
54} 54}
55
56#ifdef CONFIG_SQUASHFS_XZ
57extern const struct squashfs_decompressor squashfs_xz_comp_ops;
58#endif
59
60#ifdef CONFIG_SQUASHFS_LZO
61extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
62#endif
63
55#endif 64#endif
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac6585..0dc340aa2be9 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
230 230
231const struct file_operations squashfs_dir_ops = { 231const struct file_operations squashfs_dir_ops = {
232 .read = generic_read_dir, 232 .read = generic_read_dir,
233 .readdir = squashfs_readdir 233 .readdir = squashfs_readdir,
234 .llseek = default_llseek,
234}; 235};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h"
43#include "squashfs.h" 42#include "squashfs.h"
44 43
45/* 44/*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
37 37
38#include "squashfs_fs.h" 38#include "squashfs_fs.h"
39#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
40#include "squashfs_fs_i.h"
41#include "squashfs.h" 40#include "squashfs.h"
42 41
43/* 42/*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
27 27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29 29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */ 30/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int, int); 32 int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
104 99
105/* zlib_wrapper.c */ 100/* zlib_wrapper.c */
106extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 101extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
238#define ZLIB_COMPRESSION 1 238#define ZLIB_COMPRESSION 1
239#define LZMA_COMPRESSION 2 239#define LZMA_COMPRESSION 2
240#define LZO_COMPRESSION 3 240#define LZO_COMPRESSION 3
241#define XZ_COMPRESSION 4
241 242
242struct squashfs_super_block { 243struct squashfs_super_block {
243 __le32 s_magic; 244 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
45 }; 45 };
46 struct inode vfs_inode; 46 struct inode vfs_inode;
47}; 47};
48
49
50static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
51{
52 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
53}
48#endif 54#endif
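
squashfs_i(), moved above into squashfs_fs_i.h next to the structure it operates on, recovers the containing squashfs_inode_info from the embedded VFS inode; the list_entry() it uses is just container_of(). The idiom, standalone:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { long ino; };

struct my_inode {
	int private_data;
	struct vfs_inode vfs;	/* embedded generic part */
};

static struct my_inode *MY_I(struct vfs_inode *inode)
{
	return container_of(inode, struct my_inode, vfs);
}

int main(void)
{
	struct my_inode mi = { .private_data = 7, .vfs = { .ino = 1 } };
	struct vfs_inode *v = &mi.vfs;	/* what the VFS hands back */

	printf("%d\n", MY_I(v)->private_data);	/* prints 7 */
	return 0;
}
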
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f8606652..20700b9f2b4c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/mutex.h> 33#include <linux/mutex.h>
35#include <linux/pagemap.h> 34#include <linux/pagemap.h>
36#include <linux/init.h> 35#include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
354 353
355static void squashfs_put_super(struct super_block *sb) 354static void squashfs_put_super(struct super_block *sb)
356{ 355{
357 lock_kernel();
358
359 if (sb->s_fs_info) { 356 if (sb->s_fs_info) {
360 struct squashfs_sb_info *sbi = sb->s_fs_info; 357 struct squashfs_sb_info *sbi = sb->s_fs_info;
361 squashfs_cache_delete(sbi->block_cache); 358 squashfs_cache_delete(sbi->block_cache);
@@ -370,17 +367,13 @@ static void squashfs_put_super(struct super_block *sb)
370 kfree(sb->s_fs_info); 367 kfree(sb->s_fs_info);
371 sb->s_fs_info = NULL; 368 sb->s_fs_info = NULL;
372 } 369 }
373
374 unlock_kernel();
375} 370}
376 371
377 372
378static int squashfs_get_sb(struct file_system_type *fs_type, int flags, 373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
379 const char *dev_name, void *data, 374 const char *dev_name, void *data)
380 struct vfsmount *mnt)
381{ 375{
382 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, 376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
383 mnt);
384} 377}
385 378
386 379
@@ -447,16 +440,23 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
447} 440}
448 441
449 442
450static void squashfs_destroy_inode(struct inode *inode) 443static void squashfs_i_callback(struct rcu_head *head)
451{ 444{
445 struct inode *inode = container_of(head, struct inode, i_rcu);
446 INIT_LIST_HEAD(&inode->i_dentry);
452 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); 447 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
453} 448}
454 449
450static void squashfs_destroy_inode(struct inode *inode)
451{
452 call_rcu(&inode->i_rcu, squashfs_i_callback);
453}
454
455 455
456static struct file_system_type squashfs_fs_type = { 456static struct file_system_type squashfs_fs_type = {
457 .owner = THIS_MODULE, 457 .owner = THIS_MODULE,
458 .name = "squashfs", 458 .name = "squashfs",
459 .get_sb = squashfs_get_sb, 459 .mount = squashfs_mount,
460 .kill_sb = kill_block_super, 460 .kill_sb = kill_block_super,
461 .fs_flags = FS_REQUIRES_DEV 461 .fs_flags = FS_REQUIRES_DEV
462}; 462};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 652b8541f9c6..3876c36699a1 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
158 strncmp(target, name, name_size) == 0) { 158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */ 159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) { 160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr; 161 __le64 xattr_val;
162 u64 xattr;
162 /* val is a reference to the real location */ 163 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start, 164 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val)); 165 &offset, sizeof(val));
165 if (err < 0) 166 if (err < 0)
166 goto failed; 167 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start, 168 err = squashfs_read_metadata(sb, &xattr_val,
168 &offset, sizeof(xattr)); 169 &start, &offset, sizeof(xattr_val));
169 if (err < 0) 170 if (err < 0)
170 goto failed; 171 goto failed;
171 xattr = le64_to_cpu(xattr); 172 xattr = le64_to_cpu(xattr_val);
172 start = SQUASHFS_XATTR_BLK(xattr) + 173 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table; 174 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr); 175 offset = SQUASHFS_XATTR_OFFSET(xattr);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 49fe0d719fbf..b634efce4bde 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -25,7 +25,7 @@
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, 25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *); 26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, 27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *); 28 unsigned int *, unsigned long long *);
29#else 29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, 30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids) 31 u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
35} 35}
36 36
37static inline int squashfs_xattr_lookup(struct super_block *sb, 37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size, 38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr) 39 unsigned long long *xattr)
40{ 40{
41 return 0; 41 return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098f..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,8 +32,8 @@
32 32
33#include "squashfs_fs.h" 33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h" 34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h" 35#include "squashfs.h"
36#include "xattr.h"
37 37
38/* 38/*
39 * Map xattr id using the xattr id look up table 39 * Map xattr id using the xattr id look up table
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..c4eb40018256
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,147 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xz_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/slab.h>
28#include <linux/xz.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_xz {
37 struct xz_dec *state;
38 struct xz_buf buf;
39};
40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size);
50 if (stream->state == NULL)
51 goto failed;
52
53 return stream;
54
55failed:
56 ERROR("Failed to allocate xz workspace\n");
57 kfree(stream);
58 return NULL;
59}
60
61
62static void squashfs_xz_free(void *strm)
63{
64 struct squashfs_xz *stream = strm;
65
66 if (stream) {
67 xz_dec_end(stream->state);
68 kfree(stream);
69 }
70}
71
72
73static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
74 struct buffer_head **bh, int b, int offset, int length, int srclength,
75 int pages)
76{
77 enum xz_ret xz_err;
78 int avail, total = 0, k = 0, page = 0;
79 struct squashfs_xz *stream = msblk->stream;
80
81 mutex_lock(&msblk->read_data_mutex);
82
83 xz_dec_reset(stream->state);
84 stream->buf.in_pos = 0;
85 stream->buf.in_size = 0;
86 stream->buf.out_pos = 0;
87 stream->buf.out_size = PAGE_CACHE_SIZE;
88 stream->buf.out = buffer[page++];
89
90 do {
91 if (stream->buf.in_pos == stream->buf.in_size && k < b) {
92 avail = min(length, msblk->devblksize - offset);
93 length -= avail;
94 wait_on_buffer(bh[k]);
95 if (!buffer_uptodate(bh[k]))
96 goto release_mutex;
97
98 stream->buf.in = bh[k]->b_data + offset;
99 stream->buf.in_size = avail;
100 stream->buf.in_pos = 0;
101 offset = 0;
102 }
103
104 if (stream->buf.out_pos == stream->buf.out_size
105 && page < pages) {
106 stream->buf.out = buffer[page++];
107 stream->buf.out_pos = 0;
108 total += PAGE_CACHE_SIZE;
109 }
110
111 xz_err = xz_dec_run(stream->state, &stream->buf);
112
113 if (stream->buf.in_pos == stream->buf.in_size && k < b)
114 put_bh(bh[k++]);
115 } while (xz_err == XZ_OK);
116
117 if (xz_err != XZ_STREAM_END) {
118 ERROR("xz_dec_run error, data probably corrupt\n");
119 goto release_mutex;
120 }
121
122 if (k < b) {
123 ERROR("xz_uncompress error, input remaining\n");
124 goto release_mutex;
125 }
126
127 total += stream->buf.out_pos;
128 mutex_unlock(&msblk->read_data_mutex);
129 return total;
130
131release_mutex:
132 mutex_unlock(&msblk->read_data_mutex);
133
134 for (; k < b; k++)
135 put_bh(bh[k]);
136
137 return -EIO;
138}
139
140const struct squashfs_decompressor squashfs_xz_comp_ops = {
141 .init = squashfs_xz_init,
142 .free = squashfs_xz_free,
143 .decompress = squashfs_xz_uncompress,
144 .id = XZ_COMPRESSION,
145 .name = "xz",
146 .supported = 1
147};
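
squashfs_xz_uncompress() above drives xz_dec_run() in a loop, refilling xz_buf's input from buffer heads and switching output pages as each fills, until XZ_STREAM_END. The same streaming shape exists in user space in liblzma; a hedged analogue that decompresses stdin to stdout (build with -llzma):

#include <stdint.h>
#include <stdio.h>
#include <lzma.h>

int main(void)
{
	lzma_stream strm = LZMA_STREAM_INIT;
	uint8_t in[4096], out[4096];
	lzma_ret ret;

	if (lzma_stream_decoder(&strm, UINT64_MAX, 0) != LZMA_OK)
		return 1;

	strm.avail_in = 0;
	do {
		/* Refill input when the decoder has drained it. */
		if (strm.avail_in == 0 && !feof(stdin)) {
			strm.next_in = in;
			strm.avail_in = fread(in, 1, sizeof(in), stdin);
		}
		/* Fresh output window each pass, like switching pages. */
		strm.next_out = out;
		strm.avail_out = sizeof(out);
		ret = lzma_code(&strm, feof(stdin) ? LZMA_FINISH : LZMA_RUN);
		fwrite(out, 1, sizeof(out) - strm.avail_out, stdout);
	} while (ret == LZMA_OK);

	lzma_end(&strm);
	return ret == LZMA_STREAM_END ? 0 : 1;
}
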
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..4661ae2b1cec 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
66 struct buffer_head **bh, int b, int offset, int length, int srclength, 65 struct buffer_head **bh, int b, int offset, int length, int srclength,
67 int pages) 66 int pages)
68{ 67{
69 int zlib_err = 0, zlib_init = 0; 68 int zlib_err, zlib_init = 0;
70 int avail, bytes, k = 0, page = 0; 69 int k = 0, page = 0;
71 z_stream *stream = msblk->stream; 70 z_stream *stream = msblk->stream;
72 71
73 mutex_lock(&msblk->read_data_mutex); 72 mutex_lock(&msblk->read_data_mutex);
@@ -75,21 +74,14 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
75 stream->avail_out = 0; 74 stream->avail_out = 0;
76 stream->avail_in = 0; 75 stream->avail_in = 0;
77 76
78 bytes = length;
79 do { 77 do {
80 if (stream->avail_in == 0 && k < b) { 78 if (stream->avail_in == 0 && k < b) {
81 avail = min(bytes, msblk->devblksize - offset); 79 int avail = min(length, msblk->devblksize - offset);
82 bytes -= avail; 80 length -= avail;
83 wait_on_buffer(bh[k]); 81 wait_on_buffer(bh[k]);
84 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
85 goto release_mutex; 83 goto release_mutex;
86 84
87 if (avail == 0) {
88 offset = 0;
89 put_bh(bh[k++]);
90 continue;
91 }
92
93 stream->next_in = bh[k]->b_data + offset; 85 stream->next_in = bh[k]->b_data + offset;
94 stream->avail_in = avail; 86 stream->avail_in = avail;
95 offset = 0; 87 offset = 0;
@@ -128,6 +120,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
128 goto release_mutex; 120 goto release_mutex;
129 } 121 }
130 122
123 if (k < b) {
124 ERROR("zlib_uncompress error, data remaining\n");
125 goto release_mutex;
126 }
127
131 length = stream->total_out; 128 length = stream->total_out;
132 mutex_unlock(&msblk->read_data_mutex); 129 mutex_unlock(&msblk->read_data_mutex);
133 return length; 130 return length;
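
The zlib_wrapper cleanup above drops the redundant bytes counter (length is decremented directly) and, like the xz path, now fails if the stream ends while input remains unconsumed. That leftover-input check in a one-shot user-space form, operating on plain buffers rather than buffer heads (build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

/*
 * Inflate src[0..srclen) into dst in one shot. Fails both when the
 * stream never ends and when it ends early with input left over,
 * which is the "data remaining" corruption case checked above.
 */
static long inflate_exact(const unsigned char *src, unsigned long srclen,
			  unsigned char *dst, unsigned long dstlen)
{
	z_stream s;
	long total;
	int ok;

	memset(&s, 0, sizeof(s));
	if (inflateInit(&s) != Z_OK)
		return -1;

	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;
	s.next_out = dst;
	s.avail_out = dstlen;

	ok = inflate(&s, Z_FINISH) == Z_STREAM_END && s.avail_in == 0;
	if (!ok)
		fprintf(stderr, "inflate error, data probably corrupt\n");
	total = (long)s.total_out;
	inflateEnd(&s);
	return ok ? total : -1;
}
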
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
79 goto out; 79 goto out;
80 80
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 81 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 82 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
83 85
84 error = user_path_at(dfd, filename, lookup_flags, &path); 86 error = user_path_at(dfd, filename, lookup_flags, &path);
85 if (error) 87 if (error)
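
vfs_fstatat() now accepts AT_NO_AUTOMOUNT and turns it into LOOKUP_NO_AUTOMOUNT, alongside the existing AT_SYMLINK_NOFOLLOW handling. From user space these flags are passed straight to fstatat(); for example, examining a symlink itself rather than what it points to:

#include <fcntl.h>
#include <sys/stat.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2)
		return 1;
	/* AT_SYMLINK_NOFOLLOW: behave like lstat(), not stat(). */
	if (fstatat(AT_FDCWD, argv[1], &st, AT_SYMLINK_NOFOLLOW) != 0) {
		perror("fstatat");
		return 1;
	}
	printf("mode: %o, link? %s\n", st.st_mode & 07777,
	       S_ISLNK(st.st_mode) ? "yes" : "no");
	return 0;
}
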
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..74e149efed81 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
30#include <linux/idr.h> 30#include <linux/idr.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h>
33#include "internal.h" 34#include "internal.h"
34 35
35 36
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
72#endif 73#endif
73 INIT_LIST_HEAD(&s->s_instances); 74 INIT_LIST_HEAD(&s->s_instances);
74 INIT_HLIST_HEAD(&s->s_anon); 75 INIT_HLIST_BL_HEAD(&s->s_anon);
75 INIT_LIST_HEAD(&s->s_inodes); 76 INIT_LIST_HEAD(&s->s_inodes);
76 INIT_LIST_HEAD(&s->s_dentry_lru); 77 INIT_LIST_HEAD(&s->s_dentry_lru);
77 init_rwsem(&s->s_umount); 78 init_rwsem(&s->s_umount);
@@ -273,14 +274,14 @@ void generic_shutdown_super(struct super_block *sb)
273 get_fs_excl(); 274 get_fs_excl();
274 sb->s_flags &= ~MS_ACTIVE; 275 sb->s_flags &= ~MS_ACTIVE;
275 276
276 /* bad name - it should be evict_inodes() */ 277 fsnotify_unmount_inodes(&sb->s_inodes);
277 invalidate_inodes(sb); 278
279 evict_inodes(sb);
278 280
279 if (sop->put_super) 281 if (sop->put_super)
280 sop->put_super(sb); 282 sop->put_super(sb);
281 283
282 /* Forget any remaining inodes */ 284 if (!list_empty(&sb->s_inodes)) {
283 if (invalidate_inodes(sb)) {
284 printk("VFS: Busy inodes after unmount of %s. " 285 printk("VFS: Busy inodes after unmount of %s. "
285 "Self-destruct in 5 seconds. Have a nice day...\n", 286 "Self-destruct in 5 seconds. Have a nice day...\n",
286 sb->s_id); 287 sb->s_id);
@@ -715,15 +716,14 @@ static int ns_set_super(struct super_block *sb, void *data)
715 return set_anon_super(sb, NULL); 716 return set_anon_super(sb, NULL);
716} 717}
717 718
718int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, 719struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
719 int (*fill_super)(struct super_block *, void *, int), 720 void *data, int (*fill_super)(struct super_block *, void *, int))
720 struct vfsmount *mnt)
721{ 721{
722 struct super_block *sb; 722 struct super_block *sb;
723 723
724 sb = sget(fs_type, ns_test_super, ns_set_super, data); 724 sb = sget(fs_type, ns_test_super, ns_set_super, data);
725 if (IS_ERR(sb)) 725 if (IS_ERR(sb))
726 return PTR_ERR(sb); 726 return ERR_CAST(sb);
727 727
728 if (!sb->s_root) { 728 if (!sb->s_root) {
729 int err; 729 int err;
@@ -731,17 +731,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
731 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 731 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
732 if (err) { 732 if (err) {
733 deactivate_locked_super(sb); 733 deactivate_locked_super(sb);
734 return err; 734 return ERR_PTR(err);
735 } 735 }
736 736
737 sb->s_flags |= MS_ACTIVE; 737 sb->s_flags |= MS_ACTIVE;
738 } 738 }
739 739
740 simple_set_mnt(mnt, sb); 740 return dget(sb->s_root);
741 return 0;
742} 741}
743 742
744EXPORT_SYMBOL(get_sb_ns); 743EXPORT_SYMBOL(mount_ns);
745 744
746#ifdef CONFIG_BLOCK 745#ifdef CONFIG_BLOCK
747static int set_bdev_super(struct super_block *s, void *data) 746static int set_bdev_super(struct super_block *s, void *data)
@@ -762,22 +761,21 @@ static int test_bdev_super(struct super_block *s, void *data)
762 return (void *)s->s_bdev == data; 761 return (void *)s->s_bdev == data;
763} 762}
764 763
765int get_sb_bdev(struct file_system_type *fs_type, 764struct dentry *mount_bdev(struct file_system_type *fs_type,
766 int flags, const char *dev_name, void *data, 765 int flags, const char *dev_name, void *data,
767 int (*fill_super)(struct super_block *, void *, int), 766 int (*fill_super)(struct super_block *, void *, int))
768 struct vfsmount *mnt)
769{ 767{
770 struct block_device *bdev; 768 struct block_device *bdev;
771 struct super_block *s; 769 struct super_block *s;
772 fmode_t mode = FMODE_READ; 770 fmode_t mode = FMODE_READ | FMODE_EXCL;
773 int error = 0; 771 int error = 0;
774 772
775 if (!(flags & MS_RDONLY)) 773 if (!(flags & MS_RDONLY))
776 mode |= FMODE_WRITE; 774 mode |= FMODE_WRITE;
777 775
778 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 776 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
779 if (IS_ERR(bdev)) 777 if (IS_ERR(bdev))
780 return PTR_ERR(bdev); 778 return ERR_CAST(bdev);
781 779
782 /* 780 /*
783 * once the super is inserted into the list by sget, s_umount 781 * once the super is inserted into the list by sget, s_umount
@@ -804,13 +802,13 @@ int get_sb_bdev(struct file_system_type *fs_type,
804 802
805 /* 803 /*
806 * s_umount nests inside bd_mutex during 804 * s_umount nests inside bd_mutex during
807 * __invalidate_device(). close_bdev_exclusive() 805 * __invalidate_device(). blkdev_put() acquires
808 * acquires bd_mutex and can't be called under 806 * bd_mutex and can't be called under s_umount. Drop
809 * s_umount. Drop s_umount temporarily. This is safe 807 * s_umount temporarily. This is safe as we're
810 * as we're holding an active reference. 808 * holding an active reference.
811 */ 809 */
812 up_write(&s->s_umount); 810 up_write(&s->s_umount);
813 close_bdev_exclusive(bdev, mode); 811 blkdev_put(bdev, mode);
814 down_write(&s->s_umount); 812 down_write(&s->s_umount);
815 } else { 813 } else {
816 char b[BDEVNAME_SIZE]; 814 char b[BDEVNAME_SIZE];
@@ -829,15 +827,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
829 bdev->bd_super = s; 827 bdev->bd_super = s;
830 } 828 }
831 829
832 simple_set_mnt(mnt, s); 830 return dget(s->s_root);
833 return 0;
834 831
835error_s: 832error_s:
836 error = PTR_ERR(s); 833 error = PTR_ERR(s);
837error_bdev: 834error_bdev:
838 close_bdev_exclusive(bdev, mode); 835 blkdev_put(bdev, mode);
839error: 836error:
840 return error; 837 return ERR_PTR(error);
838}
839EXPORT_SYMBOL(mount_bdev);
840
841int get_sb_bdev(struct file_system_type *fs_type,
842 int flags, const char *dev_name, void *data,
843 int (*fill_super)(struct super_block *, void *, int),
844 struct vfsmount *mnt)
845{
846 struct dentry *root;
847
848 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
849 if (IS_ERR(root))
850 return PTR_ERR(root);
851 mnt->mnt_root = root;
852 mnt->mnt_sb = root->d_sb;
853 return 0;
841} 854}
842 855
843EXPORT_SYMBOL(get_sb_bdev); 856EXPORT_SYMBOL(get_sb_bdev);
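
The super.c conversion keeps both APIs alive at once: mount_bdev()/mount_nodev()/mount_single() return a root dentry (or an ERR_PTR), and the old get_sb_*() entry points become thin shims that call the new function and fill in the vfsmount, so filesystems can be converted one by one. The shim shape, reduced to a sketch with simplified stand-in types:

#include <errno.h>

struct dentry { int unused; };
struct vfsmount { struct dentry *mnt_root; };

static struct dentry the_root;

/* New-style entry point: returns the root dentry. In the real code
 * this is mount_bdev() and errors travel as ERR_PTR values. */
static struct dentry *mount_new(void)
{
	return &the_root;
}

/* Old-style entry point kept as a shim over the new one, so callers
 * that still expect an int-and-vfsmount interface build unchanged. */
static int get_sb_old(struct vfsmount *mnt)
{
	struct dentry *root = mount_new();

	if (!root)
		return -ENOMEM;	/* sketch: real code uses IS_ERR/PTR_ERR */
	mnt->mnt_root = root;
	return 0;
}
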
@@ -850,35 +863,49 @@ void kill_block_super(struct super_block *sb)
850 bdev->bd_super = NULL; 863 bdev->bd_super = NULL;
851 generic_shutdown_super(sb); 864 generic_shutdown_super(sb);
852 sync_blockdev(bdev); 865 sync_blockdev(bdev);
853 close_bdev_exclusive(bdev, mode); 866 WARN_ON_ONCE(!(mode & FMODE_EXCL));
867 blkdev_put(bdev, mode | FMODE_EXCL);
854} 868}
855 869
856EXPORT_SYMBOL(kill_block_super); 870EXPORT_SYMBOL(kill_block_super);
857#endif 871#endif
858 872
859int get_sb_nodev(struct file_system_type *fs_type, 873struct dentry *mount_nodev(struct file_system_type *fs_type,
860 int flags, void *data, 874 int flags, void *data,
861 int (*fill_super)(struct super_block *, void *, int), 875 int (*fill_super)(struct super_block *, void *, int))
862 struct vfsmount *mnt)
863{ 876{
864 int error; 877 int error;
865 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 878 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
866 879
867 if (IS_ERR(s)) 880 if (IS_ERR(s))
868 return PTR_ERR(s); 881 return ERR_CAST(s);
869 882
870 s->s_flags = flags; 883 s->s_flags = flags;
871 884
872 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 885 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
873 if (error) { 886 if (error) {
874 deactivate_locked_super(s); 887 deactivate_locked_super(s);
875 return error; 888 return ERR_PTR(error);
876 } 889 }
877 s->s_flags |= MS_ACTIVE; 890 s->s_flags |= MS_ACTIVE;
878 simple_set_mnt(mnt, s); 891 return dget(s->s_root);
879 return 0;
880} 892}
893EXPORT_SYMBOL(mount_nodev);
894
895int get_sb_nodev(struct file_system_type *fs_type,
896 int flags, void *data,
897 int (*fill_super)(struct super_block *, void *, int),
898 struct vfsmount *mnt)
899{
900 struct dentry *root;
881 901
902 root = mount_nodev(fs_type, flags, data, fill_super);
903 if (IS_ERR(root))
904 return PTR_ERR(root);
905 mnt->mnt_root = root;
906 mnt->mnt_sb = root->d_sb;
907 return 0;
908}
882EXPORT_SYMBOL(get_sb_nodev); 909EXPORT_SYMBOL(get_sb_nodev);
883 910
884static int compare_single(struct super_block *s, void *p) 911static int compare_single(struct super_block *s, void *p)
@@ -886,29 +913,42 @@ static int compare_single(struct super_block *s, void *p)
886 return 1; 913 return 1;
887} 914}
888 915
889int get_sb_single(struct file_system_type *fs_type, 916struct dentry *mount_single(struct file_system_type *fs_type,
890 int flags, void *data, 917 int flags, void *data,
891 int (*fill_super)(struct super_block *, void *, int), 918 int (*fill_super)(struct super_block *, void *, int))
892 struct vfsmount *mnt)
893{ 919{
894 struct super_block *s; 920 struct super_block *s;
895 int error; 921 int error;
896 922
897 s = sget(fs_type, compare_single, set_anon_super, NULL); 923 s = sget(fs_type, compare_single, set_anon_super, NULL);
898 if (IS_ERR(s)) 924 if (IS_ERR(s))
899 return PTR_ERR(s); 925 return ERR_CAST(s);
900 if (!s->s_root) { 926 if (!s->s_root) {
901 s->s_flags = flags; 927 s->s_flags = flags;
902 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 928 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
903 if (error) { 929 if (error) {
904 deactivate_locked_super(s); 930 deactivate_locked_super(s);
905 return error; 931 return ERR_PTR(error);
906 } 932 }
907 s->s_flags |= MS_ACTIVE; 933 s->s_flags |= MS_ACTIVE;
908 } else { 934 } else {
909 do_remount_sb(s, flags, data, 0); 935 do_remount_sb(s, flags, data, 0);
910 } 936 }
911 simple_set_mnt(mnt, s); 937 return dget(s->s_root);
938}
939EXPORT_SYMBOL(mount_single);
940
941int get_sb_single(struct file_system_type *fs_type,
942 int flags, void *data,
943 int (*fill_super)(struct super_block *, void *, int),
944 struct vfsmount *mnt)
945{
946 struct dentry *root;
947 root = mount_single(fs_type, flags, data, fill_super);
948 if (IS_ERR(root))
949 return PTR_ERR(root);
950 mnt->mnt_root = root;
951 mnt->mnt_sb = root->d_sb;
912 return 0; 952 return 0;
913} 953}
914 954
@@ -918,6 +958,7 @@ struct vfsmount *
918vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 958vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
919{ 959{
920 struct vfsmount *mnt; 960 struct vfsmount *mnt;
961 struct dentry *root;
921 char *secdata = NULL; 962 char *secdata = NULL;
922 int error; 963 int error;
923 964
@@ -942,9 +983,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
942 goto out_free_secdata; 983 goto out_free_secdata;
943 } 984 }
944 985
945 error = type->get_sb(type, flags, name, data, mnt); 986 if (type->mount) {
946 if (error < 0) 987 root = type->mount(type, flags, name, data);
947 goto out_free_secdata; 988 if (IS_ERR(root)) {
989 error = PTR_ERR(root);
990 goto out_free_secdata;
991 }
992 mnt->mnt_root = root;
993 mnt->mnt_sb = root->d_sb;
994 } else {
995 error = type->get_sb(type, flags, name, data, mnt);
996 if (error < 0)
997 goto out_free_secdata;
998 }
948 BUG_ON(!mnt->mnt_sb); 999 BUG_ON(!mnt->mnt_sb);
949 WARN_ON(!mnt->mnt_sb->s_bdi); 1000 WARN_ON(!mnt->mnt_sb->s_bdi);
950 mnt->mnt_sb->s_flags |= MS_BORN; 1001 mnt->mnt_sb->s_flags |= MS_BORN;
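The hunk above is the pivot of this series: vfs_kern_mount() now prefers the new ->mount() method, which hands back the root dentry directly, and falls back to the legacy ->get_sb() for filesystems not yet converted. For most filesystems the conversion is mechanical, as the sysv and sysfs hunks below illustrate. A minimal sketch of a converted filesystem, assuming a hypothetical in-memory "examplefs" (all examplefs_* names are illustrative, not part of this commit):

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* set sb->s_op, allocate the root inode, d_alloc_root(), ... */
		return 0;
	}

	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
					      int flags, const char *dev_name,
					      void *data)
	{
		/* mount_nodev() returns the root dentry or an ERR_PTR() */
		return mount_nodev(fs_type, flags, data, examplefs_fill_super);
	}

	static struct file_system_type examplefs_type = {
		.owner	 = THIS_MODULE,
		.name	 = "examplefs",
		.mount	 = examplefs_mount,	/* was: .get_sb */
		.kill_sb = kill_anon_super,
	};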
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EMBEDDED 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 5 The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353fa..a4759833d62d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
179 struct bin_buffer *bb = file->private_data; 179 struct bin_buffer *bb = file->private_data;
180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
181 181
182 if (!bb->vm_ops || !bb->vm_ops->open) 182 if (!bb->vm_ops)
183 return;
184
185 if (!sysfs_get_active(attr_sd))
186 return;
187
188 bb->vm_ops->open(vma);
189
190 sysfs_put_active(attr_sd);
191}
192
193static void bin_vma_close(struct vm_area_struct *vma)
194{
195 struct file *file = vma->vm_file;
196 struct bin_buffer *bb = file->private_data;
197 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
198
199 if (!bb->vm_ops || !bb->vm_ops->close)
200 return; 183 return;
201 184
202 if (!sysfs_get_active(attr_sd)) 185 if (!sysfs_get_active(attr_sd))
203 return; 186 return;
204 187
205 bb->vm_ops->close(vma); 188 if (bb->vm_ops->open)
189 bb->vm_ops->open(vma);
206 190
207 sysfs_put_active(attr_sd); 191 sysfs_put_active(attr_sd);
208} 192}
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
214 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 198 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
215 int ret; 199 int ret;
216 200
217 if (!bb->vm_ops || !bb->vm_ops->fault) 201 if (!bb->vm_ops)
218 return VM_FAULT_SIGBUS; 202 return VM_FAULT_SIGBUS;
219 203
220 if (!sysfs_get_active(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
221 return VM_FAULT_SIGBUS; 205 return VM_FAULT_SIGBUS;
222 206
223 ret = bb->vm_ops->fault(vma, vmf); 207 ret = VM_FAULT_SIGBUS;
208 if (bb->vm_ops->fault)
209 ret = bb->vm_ops->fault(vma, vmf);
224 210
225 sysfs_put_active(attr_sd); 211 sysfs_put_active(attr_sd);
226 return ret; 212 return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
236 if (!bb->vm_ops) 222 if (!bb->vm_ops)
237 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
238 224
239 if (!bb->vm_ops->page_mkwrite)
240 return 0;
241
242 if (!sysfs_get_active(attr_sd)) 225 if (!sysfs_get_active(attr_sd))
243 return VM_FAULT_SIGBUS; 226 return VM_FAULT_SIGBUS;
244 227
245 ret = bb->vm_ops->page_mkwrite(vma, vmf); 228 ret = 0;
229 if (bb->vm_ops->page_mkwrite)
230 ret = bb->vm_ops->page_mkwrite(vma, vmf);
246 231
247 sysfs_put_active(attr_sd); 232 sysfs_put_active(attr_sd);
248 return ret; 233 return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
256 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
257 int ret; 242 int ret;
258 243
259 if (!bb->vm_ops || !bb->vm_ops->access) 244 if (!bb->vm_ops)
260 return -EINVAL; 245 return -EINVAL;
261 246
262 if (!sysfs_get_active(attr_sd)) 247 if (!sysfs_get_active(attr_sd))
263 return -EINVAL; 248 return -EINVAL;
264 249
265 ret = bb->vm_ops->access(vma, addr, buf, len, write); 250 ret = -EINVAL;
251 if (bb->vm_ops->access)
252 ret = bb->vm_ops->access(vma, addr, buf, len, write);
266 253
267 sysfs_put_active(attr_sd); 254 sysfs_put_active(attr_sd);
268 return ret; 255 return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
276 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 263 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
277 int ret; 264 int ret;
278 265
279 if (!bb->vm_ops || !bb->vm_ops->set_policy) 266 if (!bb->vm_ops)
280 return 0; 267 return 0;
281 268
282 if (!sysfs_get_active(attr_sd)) 269 if (!sysfs_get_active(attr_sd))
283 return -EINVAL; 270 return -EINVAL;
284 271
285 ret = bb->vm_ops->set_policy(vma, new); 272 ret = 0;
273 if (bb->vm_ops->set_policy)
274 ret = bb->vm_ops->set_policy(vma, new);
286 275
287 sysfs_put_active(attr_sd); 276 sysfs_put_active(attr_sd);
288 return ret; 277 return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
296 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 285 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
297 struct mempolicy *pol; 286 struct mempolicy *pol;
298 287
299 if (!bb->vm_ops || !bb->vm_ops->get_policy) 288 if (!bb->vm_ops)
300 return vma->vm_policy; 289 return vma->vm_policy;
301 290
302 if (!sysfs_get_active(attr_sd)) 291 if (!sysfs_get_active(attr_sd))
303 return vma->vm_policy; 292 return vma->vm_policy;
304 293
305 pol = bb->vm_ops->get_policy(vma, addr); 294 pol = vma->vm_policy;
295 if (bb->vm_ops->get_policy)
296 pol = bb->vm_ops->get_policy(vma, addr);
306 297
307 sysfs_put_active(attr_sd); 298 sysfs_put_active(attr_sd);
308 return pol; 299 return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
316 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 307 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
317 int ret; 308 int ret;
318 309
319 if (!bb->vm_ops || !bb->vm_ops->migrate) 310 if (!bb->vm_ops)
320 return 0; 311 return 0;
321 312
322 if (!sysfs_get_active(attr_sd)) 313 if (!sysfs_get_active(attr_sd))
323 return 0; 314 return 0;
324 315
325 ret = bb->vm_ops->migrate(vma, from, to, flags); 316 ret = 0;
317 if (bb->vm_ops->migrate)
318 ret = bb->vm_ops->migrate(vma, from, to, flags);
326 319
327 sysfs_put_active(attr_sd); 320 sysfs_put_active(attr_sd);
328 return ret; 321 return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331 324
332static const struct vm_operations_struct bin_vm_ops = { 325static const struct vm_operations_struct bin_vm_ops = {
333 .open = bin_vma_open, 326 .open = bin_vma_open,
334 .close = bin_vma_close,
335 .fault = bin_fault, 327 .fault = bin_fault,
336 .page_mkwrite = bin_page_mkwrite, 328 .page_mkwrite = bin_page_mkwrite,
337 .access = bin_access, 329 .access = bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
377 if (bb->mmapped && bb->vm_ops != vma->vm_ops) 369 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
378 goto out_put; 370 goto out_put;
379 371
372 /*
373 * It is not possible to successfully wrap close.
374 * So error if someone is trying to use close.
375 */
376 rc = -EINVAL;
377 if (vma->vm_ops && vma->vm_ops->close)
378 goto out_put;
379
380 rc = 0; 380 rc = 0;
381 bb->mmapped = 1; 381 bb->mmapped = 1;
382 bb->vm_ops = vma->vm_ops; 382 bb->vm_ops = vma->vm_ops;
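Because the side-by-side rendering above is dense, here is the post-patch form of one wrapper, consolidated for readability: the "is this op implemented?" test now happens inside the sysfs_get_active()/sysfs_put_active() window, with a per-operation fallback value, and the same skeleton repeats for page_mkwrite, access, set_policy, get_policy and migrate. ->close alone cannot be wrapped this way (there is no safe fallback once the attribute is gone), hence the new -EINVAL check in mmap() above.

	static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct file *file = vma->vm_file;
		struct bin_buffer *bb = file->private_data;
		struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
		int ret;

		if (!bb->vm_ops)
			return VM_FAULT_SIGBUS;

		if (!sysfs_get_active(attr_sd))
			return VM_FAULT_SIGBUS;	/* attribute is going away */

		ret = VM_FAULT_SIGBUS;		/* fallback if no ->fault */
		if (bb->vm_ops->fault)
			ret = bb->vm_ops->fault(vma, vmf);

		sysfs_put_active(attr_sd);
		return ret;
	}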
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b0..ea9120a830d8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
231 goto repeat; 231 goto repeat;
232} 232}
233 233
234static int sysfs_dentry_delete(struct dentry *dentry) 234static int sysfs_dentry_delete(const struct dentry *dentry)
235{ 235{
236 struct sysfs_dirent *sd = dentry->d_fsdata; 236 struct sysfs_dirent *sd = dentry->d_fsdata;
237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED); 237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry)
239 239
240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) 240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
241{ 241{
242 struct sysfs_dirent *sd = dentry->d_fsdata; 242 struct sysfs_dirent *sd;
243 int is_dir; 243 int is_dir;
244 244
245 if (nd->flags & LOOKUP_RCU)
246 return -ECHILD;
247
248 sd = dentry->d_fsdata;
245 mutex_lock(&sysfs_mutex); 249 mutex_lock(&sysfs_mutex);
246 250
247 /* The sysfs dirent has been deleted */ 251 /* The sysfs dirent has been deleted */
@@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
701 /* instantiate and hash dentry */ 705 /* instantiate and hash dentry */
702 ret = d_find_alias(inode); 706 ret = d_find_alias(inode);
703 if (!ret) { 707 if (!ret) {
704 dentry->d_op = &sysfs_dentry_ops; 708 d_set_d_op(dentry, &sysfs_dentry_ops);
705 dentry->d_fsdata = sysfs_get(sd); 709 dentry->d_fsdata = sysfs_get(sd);
706 d_add(dentry, inode); 710 d_add(dentry, inode);
707 } else { 711 } else {
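sysfs_dentry_revalidate() takes sysfs_mutex, which is not permitted under the lock-free RCU walk introduced by this series, so it now bails out with -ECHILD; the VFS then drops out of RCU-walk and retries the lookup in ref-walk mode, where sleeping is allowed. The same shape applies to any ->d_revalidate() that must sleep (a generic sketch, with the era's signature):

	static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
	{
		if (nd->flags & LOOKUP_RCU)
			return -ECHILD;	/* can't take mutexes in RCU-walk */

		/* slow path: take locks, compare against the backing object */
		return 1;		/* dentry is still valid */
	}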
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792a..c8769dc222d8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,59 @@ void sysfs_remove_group(struct kobject * kobj,
148 sysfs_put(sd); 148 sysfs_put(sd);
149} 149}
150 150
151/**
152 * sysfs_merge_group - merge files into a pre-existing attribute group.
153 * @kobj: The kobject containing the group.
154 * @grp: The files to create and the attribute group they belong to.
155 *
156 * This function returns an error if the group doesn't exist or any of the
157 * files already exist in that group, in which case none of the new files
158 * are created.
159 */
160int sysfs_merge_group(struct kobject *kobj,
161 const struct attribute_group *grp)
162{
163 struct sysfs_dirent *dir_sd;
164 int error = 0;
165 struct attribute *const *attr;
166 int i;
167
168 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
169 if (!dir_sd)
170 return -ENOENT;
171
172 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
173 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
174 if (error) {
175 while (--i >= 0)
176 sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
177 }
178 sysfs_put(dir_sd);
179
180 return error;
181}
182EXPORT_SYMBOL_GPL(sysfs_merge_group);
183
184/**
185 * sysfs_unmerge_group - remove files from a pre-existing attribute group.
186 * @kobj: The kobject containing the group.
187 * @grp: The files to remove and the attribute group they belong to.
188 */
189void sysfs_unmerge_group(struct kobject *kobj,
190 const struct attribute_group *grp)
191{
192 struct sysfs_dirent *dir_sd;
193 struct attribute *const *attr;
194
195 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
196 if (dir_sd) {
197 for (attr = grp->attrs; *attr; ++attr)
198 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
199 sysfs_put(dir_sd);
200 }
201}
202EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
203
151 204
152EXPORT_SYMBOL_GPL(sysfs_create_group); 205EXPORT_SYMBOL_GPL(sysfs_create_group);
153EXPORT_SYMBOL_GPL(sysfs_update_group); 206EXPORT_SYMBOL_GPL(sysfs_update_group);
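sysfs_merge_group() is all-or-nothing: on any failure it removes the files it already created, and it returns -ENOENT if the named group does not exist at all. A hypothetical caller (all names illustrative), pairing the merge with sysfs_unmerge_group() on teardown:

	static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
				char *buf)
	{
		return sprintf(buf, "42\n");
	}
	static struct kobj_attribute foo_attr = __ATTR_RO(foo);

	static struct attribute *extra_attrs[] = {
		&foo_attr.attr,
		NULL,
	};

	static const struct attribute_group extra_group = {
		.name	= "stats",	/* group must already exist */
		.attrs	= extra_attrs,
	};

	static int example_add_extras(struct kobject *kobj)
	{
		return sysfs_merge_group(kobj, &extra_group);
	}

	static void example_remove_extras(struct kobject *kobj)
	{
		sysfs_unmerge_group(kobj, &extra_group);
	}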
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba33..0a12eb89cd32 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sysfs.h>
22#include <linux/xattr.h> 23#include <linux/xattr.h>
23#include <linux/security.h> 24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
@@ -348,13 +349,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
348 return -ENOENT; 349 return -ENOENT;
349} 350}
350 351
351int sysfs_permission(struct inode *inode, int mask) 352int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
352{ 353{
353 struct sysfs_dirent *sd = inode->i_private; 354 struct sysfs_dirent *sd;
355
356 if (flags & IPERM_FLAG_RCU)
357 return -ECHILD;
358
359 sd = inode->i_private;
354 360
355 mutex_lock(&sysfs_mutex); 361 mutex_lock(&sysfs_mutex);
356 sysfs_refresh_inode(sd, inode); 362 sysfs_refresh_inode(sd, inode);
357 mutex_unlock(&sysfs_mutex); 363 mutex_unlock(&sysfs_mutex);
358 364
359 return generic_permission(inode, mask, NULL); 365 return generic_permission(inode, mask, flags, NULL);
360} 366}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f2af22574c50..266895783b47 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,7 @@
23#include "sysfs.h" 23#include "sysfs.h"
24 24
25 25
26static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mnt;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
95 return error; 95 return error;
96} 96}
97 97
98static int sysfs_get_sb(struct file_system_type *fs_type, 98static struct dentry *sysfs_mount(struct file_system_type *fs_type,
99 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data)
100{ 100{
101 struct sysfs_super_info *info; 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type; 102 enum kobj_ns_type type;
103 struct super_block *sb; 103 struct super_block *sb;
104 int error; 104 int error;
105 105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL); 106 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info) 107 if (!info)
109 goto out; 108 return ERR_PTR(-ENOMEM);
110 109
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) 110 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type); 111 info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); 113 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info) 114 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info); 115 kfree(info);
117 if (IS_ERR(sb)) { 116 if (IS_ERR(sb))
118 error = PTR_ERR(sb); 117 return ERR_CAST(sb);
119 goto out;
120 }
121 if (!sb->s_root) { 118 if (!sb->s_root) {
122 sb->s_flags = flags; 119 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 120 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) { 121 if (error) {
125 deactivate_locked_super(sb); 122 deactivate_locked_super(sb);
126 goto out; 123 return ERR_PTR(error);
127 } 124 }
128 sb->s_flags |= MS_ACTIVE; 125 sb->s_flags |= MS_ACTIVE;
129 } 126 }
130 127
131 simple_set_mnt(mnt, sb); 128 return dget(sb->s_root);
132 error = 0;
133out:
134 return error;
135} 129}
136 130
137static void sysfs_kill_sb(struct super_block *sb) 131static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
147 141
148static struct file_system_type sysfs_fs_type = { 142static struct file_system_type sysfs_fs_type = {
149 .name = "sysfs", 143 .name = "sysfs",
150 .get_sb = sysfs_get_sb, 144 .mount = sysfs_mount,
151 .kill_sb = sysfs_kill_sb, 145 .kill_sb = sysfs_kill_sb,
152}; 146};
153 147
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
189 183
190 err = register_filesystem(&sysfs_fs_type); 184 err = register_filesystem(&sysfs_fs_type);
191 if (!err) { 185 if (!err) {
192 sysfs_mount = kern_mount(&sysfs_fs_type); 186 sysfs_mnt = kern_mount(&sysfs_fs_type);
193 if (IS_ERR(sysfs_mount)) { 187 if (IS_ERR(sysfs_mnt)) {
194 printk(KERN_ERR "sysfs: could not mount!\n"); 188 printk(KERN_ERR "sysfs: could not mount!\n");
195 err = PTR_ERR(sysfs_mount); 189 err = PTR_ERR(sysfs_mnt);
196 sysfs_mount = NULL; 190 sysfs_mnt = NULL;
197 unregister_filesystem(&sysfs_fs_type); 191 unregister_filesystem(&sysfs_fs_type);
198 goto out_err; 192 goto out_err;
199 } 193 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e956..3d28af31d863 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14struct sysfs_open_dirent; 15struct sysfs_open_dirent;
@@ -200,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 201struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
201void sysfs_evict_inode(struct inode *inode); 202void sysfs_evict_inode(struct inode *inode);
202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 203int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
203int sysfs_permission(struct inode *inode, int mask); 204int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 205int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 206int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 207int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e6..0630eb969a28 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
333 return &si->vfs_inode; 333 return &si->vfs_inode;
334} 334}
335 335
336static void sysv_destroy_inode(struct inode *inode) 336static void sysv_i_callback(struct rcu_head *head)
337{ 337{
338 struct inode *inode = container_of(head, struct inode, i_rcu);
339 INIT_LIST_HEAD(&inode->i_dentry);
338 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); 340 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
339} 341}
340 342
343static void sysv_destroy_inode(struct inode *inode)
344{
345 call_rcu(&inode->i_rcu, sysv_i_callback);
346}
347
341static void init_once(void *p) 348static void init_once(void *p)
342{ 349{
343 struct sysv_inode_info *si = (struct sysv_inode_info *)p; 350 struct sysv_inode_info *si = (struct sysv_inode_info *)p;
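sysv_destroy_inode() now defers the actual kmem_cache_free() through call_rcu(), so an RCU-walk lookup still dereferencing dentry->d_inode cannot see freed memory; the INIT_LIST_HEAD() keeps i_dentry sane for anyone peeking during the grace period. The same boilerplate lands in nearly every filesystem in this series; in generic form (example_* names are placeholders):

	static void example_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(example_inode_cachep, EXAMPLE_I(inode));
	}

	static void example_destroy_inode(struct inode *inode)
	{
		/* free only after an RCU grace period has elapsed */
		call_rcu(&inode->i_rcu, example_i_callback);
	}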
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..b427b1208c26 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
27 return err; 27 return err;
28} 28}
29 29
30static int sysv_hash(struct dentry *dentry, struct qstr *qstr) 30static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
31 struct qstr *qstr)
31{ 32{
32 /* Truncate the name in place, avoids having to define a compare 33 /* Truncate the name in place, avoids having to define a compare
33 function. */ 34 function. */
@@ -47,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
47 struct inode * inode = NULL; 48 struct inode * inode = NULL;
48 ino_t ino; 49 ino_t ino;
49 50
50 dentry->d_op = dir->i_sb->s_root->d_op;
51 if (dentry->d_name.len > SYSV_NAMELEN) 51 if (dentry->d_name.len > SYSV_NAMELEN)
52 return ERR_PTR(-ENAMETOOLONG); 52 return ERR_PTR(-ENAMETOOLONG);
53 ino = sysv_inode_by_name(dentry); 53 ino = sysv_inode_by_name(dentry);
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
126 126
127 inode->i_ctime = CURRENT_TIME_SEC; 127 inode->i_ctime = CURRENT_TIME_SEC;
128 inode_inc_link_count(inode); 128 inode_inc_link_count(inode);
129 atomic_inc(&inode->i_count); 129 ihold(inode);
130 130
131 return add_nondir(dentry, inode); 131 return add_nondir(dentry, inode);
132} 132}
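The open-coded atomic_inc(&inode->i_count) becomes ihold() here and throughout the series; the helper documents the intent and gives the VFS one place to assert that the count was already non-zero. A typical post-conversion ->link() excerpt, simplified (directory-entry insertion and error unwinding omitted; names are illustrative):

	static int example_link(struct dentry *old_dentry, struct inode *dir,
				struct dentry *dentry)
	{
		struct inode *inode = old_dentry->d_inode;

		inode->i_ctime = CURRENT_TIME_SEC;
		inode_inc_link_count(inode);
		ihold(inode);		/* reference for the new dentry */

		d_instantiate(dentry, inode);
		return 0;
	}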
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a0b0cda6927e..f60c196913ea 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; 332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
333 /* set up enough so that it can read an inode */ 333 /* set up enough so that it can read an inode */
334 sb->s_op = &sysv_sops; 334 sb->s_op = &sysv_sops;
335 if (sbi->s_forced_ro)
336 sb->s_flags |= MS_RDONLY;
337 if (sbi->s_truncate)
338 sb->s_d_op = &sysv_dentry_operations;
335 root_inode = sysv_iget(sb, SYSV_ROOT_INO); 339 root_inode = sysv_iget(sb, SYSV_ROOT_INO);
336 if (IS_ERR(root_inode)) { 340 if (IS_ERR(root_inode)) {
337 printk("SysV FS: get root inode failed\n"); 341 printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
343 printk("SysV FS: get root dentry failed\n"); 347 printk("SysV FS: get root dentry failed\n");
344 return 0; 348 return 0;
345 } 349 }
346 if (sbi->s_forced_ro)
347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations;
350 return 1; 350 return 1;
351} 351}
352 352
@@ -526,23 +526,22 @@ failed:
526 526
527/* Every kernel module contains stuff like this. */ 527/* Every kernel module contains stuff like this. */
528 528
529static int sysv_get_sb(struct file_system_type *fs_type, 529static struct dentry *sysv_mount(struct file_system_type *fs_type,
530 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 530 int flags, const char *dev_name, void *data)
531{ 531{
532 return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, 532 return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
533 mnt);
534} 533}
535 534
536static int v7_get_sb(struct file_system_type *fs_type, 535static struct dentry *v7_mount(struct file_system_type *fs_type,
537 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 536 int flags, const char *dev_name, void *data)
538{ 537{
539 return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); 538 return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
540} 539}
541 540
542static struct file_system_type sysv_fs_type = { 541static struct file_system_type sysv_fs_type = {
543 .owner = THIS_MODULE, 542 .owner = THIS_MODULE,
544 .name = "sysv", 543 .name = "sysv",
545 .get_sb = sysv_get_sb, 544 .mount = sysv_mount,
546 .kill_sb = kill_block_super, 545 .kill_sb = kill_block_super,
547 .fs_flags = FS_REQUIRES_DEV, 546 .fs_flags = FS_REQUIRES_DEV,
548}; 547};
@@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
550static struct file_system_type v7_fs_type = { 549static struct file_system_type v7_fs_type = {
551 .owner = THIS_MODULE, 550 .owner = THIS_MODULE,
552 .name = "v7", 551 .name = "v7",
553 .get_sb = v7_get_sb, 552 .mount = v7_mount,
554 .kill_sb = kill_block_super, 553 .kill_sb = kill_block_super,
555 .fs_flags = FS_REQUIRES_DEV, 554 .fs_flags = FS_REQUIRES_DEV,
556}; 555};
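The sysv hunks above also show the new sb->s_d_op convention: installing the default dentry operations on the superblock before any dentry exists lets d_alloc() attach them automatically, which is why the manual "dentry->d_op = dir->i_sb->s_root->d_op;" assignment disappears from sysv_lookup(). A sketch of the fill_super ordering (example_* names are illustrative):

	static int example_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct inode *root_inode;

		sb->s_op = &example_sops;
		/* install default dentry ops before any dentry is created */
		sb->s_d_op = &example_dentry_operations;

		root_inode = example_iget(sb, EXAMPLE_ROOT_INO);
		if (IS_ERR(root_inode))
			return PTR_ERR(root_inode);

		sb->s_root = d_alloc_root(root_inode);	/* inherits s_d_op */
		if (!sb->s_root) {
			iput(root_inode);
			return -ENOMEM;
		}
		return 0;
	}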
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79a..8c4fc1425b3e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
144 .release = timerfd_release, 144 .release = timerfd_release,
145 .poll = timerfd_poll, 145 .poll = timerfd_poll,
146 .read = timerfd_read, 146 .read = timerfd_read,
147 .llseek = noop_llseek,
147}; 148};
148 149
149static struct file *timerfd_fget(int fd) 150static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d8..02429d81ca33 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
63 struct ubifs_lp_stats lst; 63 struct ubifs_lp_stats lst;
64 64
65 dbg_cmt("start"); 65 dbg_cmt("start");
66 if (c->ro_media) { 66 ubifs_assert(!c->ro_media && !c->ro_mount);
67
68 if (c->ro_error) {
67 err = -EROFS; 69 err = -EROFS;
68 goto out_up; 70 goto out_up;
69 } 71 }
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..0bee4dbffc31 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
2239 return err; 2239 return err;
2240} 2240}
2241 2241
2242/**
2243 * dbg_check_data_nodes_order - check that list of data nodes is sorted.
2244 * @c: UBIFS file-system description object
2245 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2246 *
2247 * This function returns zero if the list of data nodes is sorted correctly,
2248 * and %-EINVAL if not.
2249 */
2250int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
2251{
2252 struct list_head *cur;
2253 struct ubifs_scan_node *sa, *sb;
2254
2255 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2256 return 0;
2257
2258 for (cur = head->next; cur->next != head; cur = cur->next) {
2259 ino_t inuma, inumb;
2260 uint32_t blka, blkb;
2261
2262 cond_resched();
2263 sa = container_of(cur, struct ubifs_scan_node, list);
2264 sb = container_of(cur->next, struct ubifs_scan_node, list);
2265
2266 if (sa->type != UBIFS_DATA_NODE) {
2267 ubifs_err("bad node type %d", sa->type);
2268 dbg_dump_node(c, sa->node);
2269 return -EINVAL;
2270 }
2271 if (sb->type != UBIFS_DATA_NODE) {
2272 ubifs_err("bad node type %d", sb->type);
2273 dbg_dump_node(c, sb->node);
2274 return -EINVAL;
2275 }
2276
2277 inuma = key_inum(c, &sa->key);
2278 inumb = key_inum(c, &sb->key);
2279
2280 if (inuma < inumb)
2281 continue;
2282 if (inuma > inumb) {
2283 ubifs_err("larger inum %lu goes before inum %lu",
2284 (unsigned long)inuma, (unsigned long)inumb);
2285 goto error_dump;
2286 }
2287
2288 blka = key_block(c, &sa->key);
2289 blkb = key_block(c, &sb->key);
2290
2291 if (blka > blkb) {
2292 ubifs_err("larger block %u goes before %u", blka, blkb);
2293 goto error_dump;
2294 }
2295 if (blka == blkb) {
2296 ubifs_err("two data nodes for the same block");
2297 goto error_dump;
2298 }
2299 }
2300
2301 return 0;
2302
2303error_dump:
2304 dbg_dump_node(c, sa->node);
2305 dbg_dump_node(c, sb->node);
2306 return -EINVAL;
2307}
2308
2309/**
 2310 * dbg_check_nondata_nodes_order - check that list of non-data nodes is sorted.
2311 * @c: UBIFS file-system description object
2312 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2313 *
2314 * This function returns zero if the list of non-data nodes is sorted correctly,
2315 * and %-EINVAL if not.
2316 */
2317int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2318{
2319 struct list_head *cur;
2320 struct ubifs_scan_node *sa, *sb;
2321
2322 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2323 return 0;
2324
2325 for (cur = head->next; cur->next != head; cur = cur->next) {
2326 ino_t inuma, inumb;
2327 uint32_t hasha, hashb;
2328
2329 cond_resched();
2330 sa = container_of(cur, struct ubifs_scan_node, list);
2331 sb = container_of(cur->next, struct ubifs_scan_node, list);
2332
2333 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2334 sa->type != UBIFS_XENT_NODE) {
2335 ubifs_err("bad node type %d", sa->type);
2336 dbg_dump_node(c, sa->node);
2337 return -EINVAL;
2338 }
 2339 if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
 2340 sb->type != UBIFS_XENT_NODE) {
2341 ubifs_err("bad node type %d", sb->type);
2342 dbg_dump_node(c, sb->node);
2343 return -EINVAL;
2344 }
2345
2346 if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2347 ubifs_err("non-inode node goes before inode node");
2348 goto error_dump;
2349 }
2350
2351 if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
2352 continue;
2353
2354 if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2355 /* Inode nodes are sorted in descending size order */
2356 if (sa->len < sb->len) {
2357 ubifs_err("smaller inode node goes first");
2358 goto error_dump;
2359 }
2360 continue;
2361 }
2362
2363 /*
2364 * This is either a dentry or xentry, which should be sorted in
2365 * ascending (parent ino, hash) order.
2366 */
2367 inuma = key_inum(c, &sa->key);
2368 inumb = key_inum(c, &sb->key);
2369
2370 if (inuma < inumb)
2371 continue;
2372 if (inuma > inumb) {
2373 ubifs_err("larger inum %lu goes before inum %lu",
2374 (unsigned long)inuma, (unsigned long)inumb);
2375 goto error_dump;
2376 }
2377
2378 hasha = key_block(c, &sa->key);
2379 hashb = key_block(c, &sb->key);
2380
2381 if (hasha > hashb) {
2382 ubifs_err("larger hash %u goes before %u", hasha, hashb);
2383 goto error_dump;
2384 }
2385 }
2386
2387 return 0;
2388
2389error_dump:
2390 ubifs_msg("dumping first node");
2391 dbg_dump_node(c, sa->node);
2392 ubifs_msg("dumping second node");
2393 dbg_dump_node(c, sb->node);
2394 return -EINVAL;
2396}
2397
2242static int invocation_cnt; 2398static int invocation_cnt;
2243 2399
2244int dbg_force_in_the_gaps(void) 2400int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
2625 .open = open_debugfs_file, 2781 .open = open_debugfs_file,
2626 .write = write_debugfs_file, 2782 .write = write_debugfs_file,
2627 .owner = THIS_MODULE, 2783 .owner = THIS_MODULE,
2784 .llseek = default_llseek,
2628}; 2785};
2629 2786
2630/** 2787/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea6..69ebe4729151 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size); 326 loff_t size);
327int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
328int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
327 329
328/* Force the use of in-the-gaps method for testing */ 330/* Force the use of in-the-gaps method for testing */
329 331
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
465#define dbg_check_lprops(c) 0 467#define dbg_check_lprops(c) 0
466#define dbg_check_lpt_nodes(c, cnode, row, col) 0 468#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0 469#define dbg_check_inode_size(c, inode, size) 0
470#define dbg_check_data_nodes_order(c, head) 0
471#define dbg_check_nondata_nodes_order(c, head) 0
468#define dbg_force_in_the_gaps_enabled 0 472#define dbg_force_in_the_gaps_enabled 0
469#define dbg_force_in_the_gaps() 0 473#define dbg_force_in_the_gaps() 0
470#define dbg_failure_mode 0 474#define dbg_failure_mode 0
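debug.h follows the usual UBIFS convention for compiled-out debugging: each new check collapses to a constant 0 when CONFIG_UBIFS_FS_DEBUG is off, so callers such as sort_nodes() in gc.c below can invoke them unconditionally with no #ifdef, and an always-zero check costs nothing at run time. The shape, roughly:

	#ifdef CONFIG_UBIFS_FS_DEBUG
	int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
	int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
	#else
	#define dbg_check_data_nodes_order(c, head)	0
	#define dbg_check_nondata_nodes_order(c, head)	0
	#endif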
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
550 550
551 lock_2_inodes(dir, inode); 551 lock_2_inodes(dir, inode);
552 inc_nlink(inode); 552 inc_nlink(inode);
553 atomic_inc(&inode->i_count); 553 ihold(inode);
554 inode->i_ctime = ubifs_current_time(inode); 554 inode->i_ctime = ubifs_current_time(inode);
555 dir->i_size += sz_change; 555 dir->i_size += sz_change;
556 dir_ui->ui_size = dir->i_size; 556 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 03ae894c45de..d77db7e36484 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
433 struct page *page; 433 struct page *page;
434 434
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
436 ubifs_assert(!c->ro_media && !c->ro_mount);
436 437
437 if (unlikely(c->ro_media)) 438 if (unlikely(c->ro_error))
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
@@ -1439,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
1439 1440
1440 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, 1441 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1441 i_size_read(inode)); 1442 i_size_read(inode));
1442 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1443 ubifs_assert(!c->ro_media && !c->ro_mount);
1443 1444
1444 if (unlikely(c->ro_media)) 1445 if (unlikely(c->ro_error))
1445 return VM_FAULT_SIGBUS; /* -EROFS */ 1446 return VM_FAULT_SIGBUS; /* -EROFS */
1446 1447
1447 /* 1448 /*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca05..151f10882820 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
125 struct ubifs_scan_node *sa, *sb; 125 struct ubifs_scan_node *sa, *sb;
126 126
127 cond_resched(); 127 cond_resched();
128 if (a == b)
129 return 0;
130
128 sa = list_entry(a, struct ubifs_scan_node, list); 131 sa = list_entry(a, struct ubifs_scan_node, list);
129 sb = list_entry(b, struct ubifs_scan_node, list); 132 sb = list_entry(b, struct ubifs_scan_node, list);
133
130 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); 134 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
131 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); 135 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
136 ubifs_assert(sa->type == UBIFS_DATA_NODE);
137 ubifs_assert(sb->type == UBIFS_DATA_NODE);
132 138
133 inuma = key_inum(c, &sa->key); 139 inuma = key_inum(c, &sa->key);
134 inumb = key_inum(c, &sb->key); 140 inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
157 */ 163 */
158int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
159{ 165{
160 int typea, typeb;
161 ino_t inuma, inumb; 166 ino_t inuma, inumb;
162 struct ubifs_info *c = priv; 167 struct ubifs_info *c = priv;
163 struct ubifs_scan_node *sa, *sb; 168 struct ubifs_scan_node *sa, *sb;
164 169
165 cond_resched(); 170 cond_resched();
171 if (a == b)
172 return 0;
173
166 sa = list_entry(a, struct ubifs_scan_node, list); 174 sa = list_entry(a, struct ubifs_scan_node, list);
167 sb = list_entry(b, struct ubifs_scan_node, list); 175 sb = list_entry(b, struct ubifs_scan_node, list);
168 typea = key_type(c, &sa->key); 176
169 typeb = key_type(c, &sb->key); 177 ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
170 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); 178 key_type(c, &sb->key) != UBIFS_DATA_KEY);
179 ubifs_assert(sa->type != UBIFS_DATA_NODE &&
180 sb->type != UBIFS_DATA_NODE);
171 181
172 /* Inodes go before directory entries */ 182 /* Inodes go before directory entries */
173 if (typea == UBIFS_INO_KEY) { 183 if (sa->type == UBIFS_INO_NODE) {
174 if (typeb == UBIFS_INO_KEY) 184 if (sb->type == UBIFS_INO_NODE)
175 return sb->len - sa->len; 185 return sb->len - sa->len;
176 return -1; 186 return -1;
177 } 187 }
178 if (typeb == UBIFS_INO_KEY) 188 if (sb->type == UBIFS_INO_NODE)
179 return 1; 189 return 1;
180 190
181 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); 191 ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
192 key_type(c, &sa->key) == UBIFS_XENT_KEY);
193 ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
194 key_type(c, &sb->key) == UBIFS_XENT_KEY);
195 ubifs_assert(sa->type == UBIFS_DENT_NODE ||
196 sa->type == UBIFS_XENT_NODE);
197 ubifs_assert(sb->type == UBIFS_DENT_NODE ||
198 sb->type == UBIFS_XENT_NODE);
199
182 inuma = key_inum(c, &sa->key); 200 inuma = key_inum(c, &sa->key);
183 inumb = key_inum(c, &sb->key); 201 inumb = key_inum(c, &sb->key);
184 202
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
224static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 242static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
225 struct list_head *nondata, int *min) 243 struct list_head *nondata, int *min)
226{ 244{
245 int err;
227 struct ubifs_scan_node *snod, *tmp; 246 struct ubifs_scan_node *snod, *tmp;
228 247
229 *min = INT_MAX; 248 *min = INT_MAX;
230 249
231 /* Separate data nodes and non-data nodes */ 250 /* Separate data nodes and non-data nodes */
232 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 251 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
233 int err; 252 ubifs_assert(snod->type == UBIFS_INO_NODE ||
253 snod->type == UBIFS_DATA_NODE ||
254 snod->type == UBIFS_DENT_NODE ||
255 snod->type == UBIFS_XENT_NODE ||
256 snod->type == UBIFS_TRUN_NODE);
257
258 if (snod->type != UBIFS_INO_NODE &&
259 snod->type != UBIFS_DATA_NODE &&
260 snod->type != UBIFS_DENT_NODE &&
261 snod->type != UBIFS_XENT_NODE) {
262 /* Probably truncation node, zap it */
263 list_del(&snod->list);
264 kfree(snod);
265 continue;
266 }
234 267
235 ubifs_assert(snod->type != UBIFS_IDX_NODE); 268 ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
236 ubifs_assert(snod->type != UBIFS_REF_NODE); 269 key_type(c, &snod->key) == UBIFS_INO_KEY ||
237 ubifs_assert(snod->type != UBIFS_CS_NODE); 270 key_type(c, &snod->key) == UBIFS_DENT_KEY ||
271 key_type(c, &snod->key) == UBIFS_XENT_KEY);
238 272
239 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 273 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
240 snod->offs, 0); 274 snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
258 /* Sort data and non-data nodes */ 292 /* Sort data and non-data nodes */
259 list_sort(c, &sleb->nodes, &data_nodes_cmp); 293 list_sort(c, &sleb->nodes, &data_nodes_cmp);
260 list_sort(c, nondata, &nondata_nodes_cmp); 294 list_sort(c, nondata, &nondata_nodes_cmp);
295
296 err = dbg_check_data_nodes_order(c, &sleb->nodes);
297 if (err)
298 return err;
299 err = dbg_check_nondata_nodes_order(c, nondata);
300 if (err)
301 return err;
261 return 0; 302 return 0;
262} 303}
263 304
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
575 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 616 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
576 617
577 ubifs_assert_cmt_locked(c); 618 ubifs_assert_cmt_locked(c);
619 ubifs_assert(!c->ro_media && !c->ro_mount);
578 620
579 if (ubifs_gc_should_commit(c)) 621 if (ubifs_gc_should_commit(c))
580 return -EAGAIN; 622 return -EAGAIN;
581 623
582 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 624 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
583 625
584 if (c->ro_media) { 626 if (c->ro_error) {
585 ret = -EROFS; 627 ret = -EROFS;
586 goto out_unlock; 628 goto out_unlock;
587 } 629 }
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
677 719
678 ret = ubifs_garbage_collect_leb(c, &lp); 720 ret = ubifs_garbage_collect_leb(c, &lp);
679 if (ret < 0) { 721 if (ret < 0) {
680 if (ret == -EAGAIN || ret == -ENOSPC) { 722 if (ret == -EAGAIN) {
681 /* 723 /*
682 * These codes are not errors, so we have to 724 * This is not an error, so we have to return the
683 * return the LEB to lprops. But if the 725 * LEB to lprops. But if 'ubifs_return_leb()'
684 * 'ubifs_return_leb()' function fails, its 726 * fails, its failure code is propagated to the
685 * failure code is propagated to the caller 727 * caller instead of the original '-EAGAIN'.
686 * instead of the original '-EAGAIN' or
687 * '-ENOSPC'.
688 */ 728 */
689 err = ubifs_return_leb(c, lp.lnum); 729 err = ubifs_return_leb(c, lp.lnum);
690 if (err) 730 if (err)
@@ -774,8 +814,8 @@ out_unlock:
774out: 814out:
775 ubifs_assert(ret < 0); 815 ubifs_assert(ret < 0);
776 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); 816 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
777 ubifs_ro_mode(c, ret);
778 ubifs_wbuf_sync_nolock(wbuf); 817 ubifs_wbuf_sync_nolock(wbuf);
818 ubifs_ro_mode(c, ret);
779 mutex_unlock(&wbuf->io_mutex); 819 mutex_unlock(&wbuf->io_mutex);
780 ubifs_return_leb(c, lp.lnum); 820 ubifs_return_leb(c, lp.lnum);
781 return ret; 821 return ret;
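Both comparators above gain an early "if (a == b) return 0;": this kernel's list_sort() deliberately invokes cmp() with the same node as both arguments during long merges, solely so the callback gets a chance to cond_resched(), and the early return handles that call cheaply before any key is examined. The comparator contract in generic form (struct example_node is illustrative):

	struct example_node {
		struct list_head list;
		unsigned long key;
	};

	static int example_cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		struct example_node *na, *nb;

		cond_resched();		/* cmp() is the only hook list_sort() gives us */
		if (a == b)
			return 0;	/* list_sort() may pass the same node twice */

		na = list_entry(a, struct example_node, list);
		nb = list_entry(b, struct example_node, list);

		/* negative: a sorts first; positive: b sorts first; 0: tie */
		if (na->key < nb->key)
			return -1;
		return na->key > nb->key ? 1 : 0;
	}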
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30bb..d82173182eeb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
61 */ 61 */
62void ubifs_ro_mode(struct ubifs_info *c, int err) 62void ubifs_ro_mode(struct ubifs_info *c, int err)
63{ 63{
64 if (!c->ro_media) { 64 if (!c->ro_error) {
65 c->ro_media = 1; 65 c->ro_error = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY; 67 c->vfs_sb->s_flags |= MS_RDONLY;
68 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
356 356
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 357 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
360 ubifs_assert(!(wbuf->avail & 7)); 359 ubifs_assert(!(wbuf->avail & 7));
361 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
361 ubifs_assert(!c->ro_media && !c->ro_mount);
362 362
363 if (c->ro_media) 363 if (c->ro_error)
364 return -EROFS; 364 return -EROFS;
365 365
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
440{ 440{
441 int err, i; 441 int err, i;
442 442
443 ubifs_assert(!c->ro_media && !c->ro_mount);
443 if (!c->need_wbuf_sync) 444 if (!c->need_wbuf_sync)
444 return 0; 445 return 0;
445 c->need_wbuf_sync = 0; 446 c->need_wbuf_sync = 0;
446 447
447 if (c->ro_media) { 448 if (c->ro_error) {
448 err = -EROFS; 449 err = -EROFS;
449 goto out_timers; 450 goto out_timers;
450 } 451 }
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
519 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
520 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
521 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount);
522 524
523 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
524 err = -ENOSPC; 526 err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
527 529
528 cancel_wbuf_timer_nolock(wbuf); 530 cancel_wbuf_timer_nolock(wbuf);
529 531
530 if (c->ro_media) 532 if (c->ro_error)
531 return -EROFS; 533 return -EROFS;
532 534
533 if (aligned_len <= wbuf->avail) { 535 if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
663 buf_len); 665 buf_len);
664 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 666 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
665 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); 667 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
668 ubifs_assert(!c->ro_media && !c->ro_mount);
666 669
667 if (c->ro_media) 670 if (c->ro_error)
668 return -EROFS; 671 return -EROFS;
669 672
670 ubifs_prepare_node(c, buf, len, 1); 673 ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
815 return 0; 818 return 0;
816 819
817out: 820out:
818 ubifs_err("bad node at LEB %d:%d", lnum, offs); 821 ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
822 ubi_is_mapped(c->ubi, lnum));
819 dbg_dump_node(c, buf); 823 dbg_dump_node(c, buf);
820 dbg_dump_stack(); 824 dbg_dump_stack();
821 return -EINVAL; 825 return -EINVAL;
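This series splits UBIFS's overloaded ro_media flag into three: c->ro_mount (the filesystem was mounted read-only), c->ro_media (the underlying UBI volume itself is read-only), and c->ro_error (forced read-only after an error). Write paths can only legitimately be reached on a writable mount over writable media, so they assert the first two and test only the last at run time, as the hunks above and below show. The resulting idiom:

	static int example_write_path(struct ubifs_info *c)
	{
		ubifs_assert(!c->ro_media && !c->ro_mount);
		if (c->ro_error)
			return -EROFS;	/* an earlier error made us R/O */

		/* ... proceed with the write ... */
		return 0;
	}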
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68d..914f1bd89e57 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
122 * better to try to allocate space at the ends of eraseblocks. This is 122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does. 123 * what the squeeze parameter does.
124 */ 124 */
125 ubifs_assert(!c->ro_media && !c->ro_mount);
125 squeeze = (jhead == BASEHD); 126 squeeze = (jhead == BASEHD);
126again: 127again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 128 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128 129
129 if (c->ro_media) { 130 if (c->ro_error) {
130 err = -EROFS; 131 err = -EROFS;
131 goto out_unlock; 132 goto out_unlock;
132 } 133 }
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0b..92a8491a8f8c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
306} 306}
307 307
308/** 308/**
309 * invalid_key_init - initialize invalid node key.
310 * @c: UBIFS file-system description object
311 * @key: key to initialize
312 *
313 * This is a helper function which marks a @key object as invalid.
314 */
315static inline void invalid_key_init(const struct ubifs_info *c,
316 union ubifs_key *key)
317{
318 key->u32[0] = 0xDEADBEAF;
319 key->u32[1] = UBIFS_INVALID_KEY;
320}
321
322/**
309 * key_type - get key type. 323 * key_type - get key type.
310 * @c: UBIFS file-system description object 324 * @c: UBIFS file-system description object
311 * @key: key to get type of 325 * @key: key to get type of
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42c..4d0cb1241460 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
159 jhead = &c->jheads[bud->jhead]; 159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list); 160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else 161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); 162 ubifs_assert(c->replaying && c->ro_mount);
163 163
164 /* 164 /*
165 * Note, although this is a new bud, we anyway account this space now, 165 * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
223 } 223 }
224 224
225 mutex_lock(&c->log_mutex); 225 mutex_lock(&c->log_mutex);
226 226 ubifs_assert(!c->ro_media && !c->ro_mount);
227 if (c->ro_media) { 227 if (c->ro_error) {
228 err = -EROFS; 228 err = -EROFS;
229 goto out_unlock; 229 goto out_unlock;
230 } 230 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 0084a33c4c69..72775d35b99e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
1363 goto out; 1363 goto out;
1364 for (i = 0; i < c->lsave_cnt; i++) { 1364 for (i = 0; i < c->lsave_cnt; i++) {
1365 int lnum = c->lsave[i]; 1365 int lnum = c->lsave[i];
1366 struct ubifs_lprops *lprops;
1366 1367
1367 /* 1368 /*
1368 * Due to automatic resizing, the values in the lsave table 1369 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
1370 */ 1371 */
1371 if (lnum >= c->leb_cnt) 1372 if (lnum >= c->leb_cnt)
1372 continue; 1373 continue;
1373 ubifs_lpt_lookup(c, lnum); 1374 lprops = ubifs_lpt_lookup(c, lnum);
1375 if (IS_ERR(lprops)) {
1376 err = PTR_ERR(lprops);
1377 goto out;
1378 }
1374 } 1379 }
1375out: 1380out:
1376 vfree(buf); 1381 vfree(buf);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d12535b7fc78..5c90dec5db0b 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
705 struct ubifs_pnode *pnode; 705 struct ubifs_pnode *pnode;
706 706
707 pnode = pnode_lookup(c, 0); 707 pnode = pnode_lookup(c, 0);
708 if (IS_ERR(pnode))
709 return PTR_ERR(pnode);
710
708 while (pnode) { 711 while (pnode) {
709 do_make_pnode_dirty(c, pnode); 712 do_make_pnode_dirty(c, pnode);
710 pnode = next_pnode_to_dirty(c, pnode); 713 pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc0..21f47afdacff 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
361{ 361{
362 int err, lnum, offs, len; 362 int err, lnum, offs, len;
363 363
364 if (c->ro_media) 364 ubifs_assert(!c->ro_media && !c->ro_mount);
365 if (c->ro_error)
365 return -EROFS; 366 return -EROFS;
366 367
367 lnum = UBIFS_MST_LNUM; 368 lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e41..c3de04dc952a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
132{ 132{
133 int err; 133 int err;
134 134
135 if (c->ro_media) 135 ubifs_assert(!c->ro_media && !c->ro_mount);
136 if (c->ro_error)
136 return -EROFS; 137 return -EROFS;
137 err = ubi_leb_unmap(c->ubi, lnum); 138 err = ubi_leb_unmap(c->ubi, lnum);
138 if (err) { 139 if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
159{ 160{
160 int err; 161 int err;
161 162
162 if (c->ro_media) 163 ubifs_assert(!c->ro_media && !c->ro_mount);
164 if (c->ro_error)
163 return -EROFS; 165 return -EROFS;
164 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); 166 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
165 if (err) { 167 if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
186{ 188{
187 int err; 189 int err;
188 190
189 if (c->ro_media) 191 ubifs_assert(!c->ro_media && !c->ro_mount);
192 if (c->ro_error)
190 return -EROFS; 193 return -EROFS;
191 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); 194 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
192 if (err) { 195 if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index daae9e1f5382..77e9b874b6c2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
292 292
293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
294 294
295 if ((c->vfs_sb->s_flags & MS_RDONLY)) { 295 if (c->ro_mount) {
296 /* Read-only mode. Keep a copy for switching to rw mode */ 296 /* Read-only mode. Keep a copy for switching to rw mode */
297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); 297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
298 if (!c->rcvrd_mst_node) { 298 if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
469 endpt = snod->offs + snod->len; 469 endpt = snod->offs + snod->len;
470 } 470 }
471 471
472 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { 472 if (c->ro_mount && !c->remounting_rw) {
473 /* Add to recovery list */ 473 /* Add to recovery list */
474 struct ubifs_unclean_leb *ucleb; 474 struct ubifs_unclean_leb *ucleb;
475 475
@@ -772,7 +772,8 @@ out_free:
772 * @sbuf: LEB-sized buffer to use 772 * @sbuf: LEB-sized buffer to use
773 * 773 *
774 * This function does a scan of a LEB, but caters for errors that might have 774 * This function does a scan of a LEB, but caters for errors that might have
775 * been caused by the unclean unmount from which we are attempting to recover. 775 * been caused by unclean reboots from which we are attempting to recover
776 * (assume that only the last log LEB can be corrupted by an unclean reboot).
776 * 777 *
777 * This function returns %0 on success and a negative error code on failure. 778 * This function returns %0 on success and a negative error code on failure.
778 */ 779 */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
883{ 884{
884 int err; 885 int err;
885 886
886 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); 887 ubifs_assert(!c->ro_mount || c->remounting_rw);
887 888
888 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); 889 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
889 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); 890 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1461,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
1461 } 1462 }
1462 } 1463 }
1463 if (e->exists && e->i_size < e->d_size) { 1464 if (e->exists && e->i_size < e->d_size) {
1464 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { 1465 if (!e->inode && c->ro_mount) {
1465 /* Fix the inode size and pin it in memory */ 1466 /* Fix the inode size and pin it in memory */
1466 struct inode *inode; 1467 struct inode *inode;
1467 1468
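The recovery.c conversions are mechanical: every `c->vfs_sb->s_flags & MS_RDONLY` test becomes the cached `c->ro_mount` bit. The value of the cache is that it is snapshotted once at mount time and flipped only by UBIFS's own remount code (both shown in the super.c hunks later in this diff), so unlike sb->s_flags it cannot change underneath a recovery operation while a remount is in flight:

	/* In mount_ubifs(), per the super.c hunk below: */
	c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);

	/* Thereafter flipped only at the end of ubifs_remount_rw() and
	 * in ubifs_remount_ro(), under the remount paths' own locking. */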
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3e..eed0fcff8d73 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 ubifs_assert(sleb->endpt - offs >= used); 627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0); 628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629 629
630 if (sleb->endpt + c->min_io_size <= c->leb_size && 630 if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, 631 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM); 632 sleb->endpt, UBI_SHORTTERM);
634 633
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
840 if (IS_ERR(sleb)) { 839 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 840 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 841 return PTR_ERR(sleb);
842 /*
843 * Note, the below function will recover this log LEB only if
844 * it is the last, because unclean reboots can possibly corrupt
845 * only the tail of the log.
846 */
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 847 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
844 if (IS_ERR(sleb)) 848 if (IS_ERR(sleb))
845 return PTR_ERR(sleb); 849 return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
851 } 855 }
852 856
853 node = sleb->buf; 857 node = sleb->buf;
854
855 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); 858 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
856 if (c->cs_sqnum == 0) { 859 if (c->cs_sqnum == 0) {
857 /* 860 /*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
898 } 901 }
899 902
900 list_for_each_entry(snod, &sleb->nodes, list) { 903 list_for_each_entry(snod, &sleb->nodes, list) {
901
902 cond_resched(); 904 cond_resched();
903 905
904 if (snod->sqnum >= SQNUM_WATERMARK) { 906 if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
1011int ubifs_replay_journal(struct ubifs_info *c) 1013int ubifs_replay_journal(struct ubifs_info *c)
1012{ 1014{
1013 int err, i, lnum, offs, free; 1015 int err, i, lnum, offs, free;
1014 void *sbuf = NULL;
1015 1016
1016 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); 1017 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1017 1018
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
1026 return -EINVAL; 1027 return -EINVAL;
1027 } 1028 }
1028 1029
1029 sbuf = vmalloc(c->leb_size);
1030 if (!sbuf)
1031 return -ENOMEM;
1032
1033 dbg_mnt("start replaying the journal"); 1030 dbg_mnt("start replaying the journal");
1034
1035 c->replaying = 1; 1031 c->replaying = 1;
1036
1037 lnum = c->ltail_lnum = c->lhead_lnum; 1032 lnum = c->ltail_lnum = c->lhead_lnum;
1038 offs = c->lhead_offs; 1033 offs = c->lhead_offs;
1039 1034
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
1046 lnum = UBIFS_LOG_LNUM; 1041 lnum = UBIFS_LOG_LNUM;
1047 offs = 0; 1042 offs = 0;
1048 } 1043 }
1049 err = replay_log_leb(c, lnum, offs, sbuf); 1044 err = replay_log_leb(c, lnum, offs, c->sbuf);
1050 if (err == 1) 1045 if (err == 1)
1051 /* We hit the end of the log */ 1046 /* We hit the end of the log */
1052 break; 1047 break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
1079out: 1074out:
1080 destroy_replay_tree(c); 1075 destroy_replay_tree(c);
1081 destroy_bud_list(c); 1076 destroy_bud_list(c);
1082 vfree(sbuf);
1083 c->replaying = 0; 1077 c->replaying = 0;
1084 return err; 1078 return err;
1085} 1079}
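Journal replay no longer allocates its own LEB-sized scan buffer: the local vmalloc()/vfree() pair is dropped and replay_log_leb() scans into c->sbuf, the buffer mount_ubifs() already sets up (note the `if (!c->sbuf) goto out_free;` check in the super.c hunks below). Replay runs single-threaded during mount, so one scratch buffer suffices and a second LEB-sized allocation, potentially hundreds of KiB on large-erase-block flash, is saved. The resulting ownership, sketched:

	/* mount_ubifs(): one LEB-sized scratch buffer for the whole mount */
	c->sbuf = vmalloc(c->leb_size);
	if (!c->sbuf)
		goto out_free;
	/* ... other mount steps ... */
	err = ubifs_replay_journal(c);	/* scans log LEBs into c->sbuf */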
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9dd..bf31b4729e51 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
542 * due to the unavailability of time-travelling equipment. 542 * due to the unavailability of time-travelling equipment.
543 */ 543 */
544 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 544 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
545 struct super_block *sb = c->vfs_sb; 545 ubifs_assert(!c->ro_media || c->ro_mount);
546 int mounting_ro = sb->s_flags & MS_RDONLY; 546 if (!c->ro_mount ||
547
548 ubifs_assert(!c->ro_media || mounting_ro);
549 if (!mounting_ro ||
550 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 547 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
551 ubifs_err("on-flash format version is w%d/r%d, but " 548 ubifs_err("on-flash format version is w%d/r%d, but "
552 "software only supports up to version " 549 "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
624 c->old_leb_cnt = c->leb_cnt; 621 c->old_leb_cnt = c->leb_cnt;
625 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { 622 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
626 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); 623 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
627 if (c->vfs_sb->s_flags & MS_RDONLY) 624 if (c->ro_mount)
628 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", 625 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
629 c->old_leb_cnt, c->leb_cnt); 626 c->old_leb_cnt, c->leb_cnt);
630 else { 627 else {
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c525384191..3e1ee57dbeaa 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
197 struct ubifs_ino_node *ino = buf; 197 struct ubifs_ino_node *ino = buf;
198 struct ubifs_scan_node *snod; 198 struct ubifs_scan_node *snod;
199 199
200 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); 200 snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
201 if (!snod) 201 if (!snod)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
212 case UBIFS_DENT_NODE: 212 case UBIFS_DENT_NODE:
213 case UBIFS_XENT_NODE: 213 case UBIFS_XENT_NODE:
214 case UBIFS_DATA_NODE: 214 case UBIFS_DATA_NODE:
215 case UBIFS_TRUN_NODE:
216 /* 215 /*
217 * The key is in the same place in all keyed 216 * The key is in the same place in all keyed
218 * nodes. 217 * nodes.
219 */ 218 */
220 key_read(c, &ino->key, &snod->key); 219 key_read(c, &ino->key, &snod->key);
221 break; 220 break;
221 default:
222 invalid_key_init(c, &snod->key);
223 break;
222 } 224 }
223 list_add_tail(&snod->list, &sleb->nodes); 225 list_add_tail(&snod->list, &sleb->nodes);
224 sleb->nodes_cnt += 1; 226 sleb->nodes_cnt += 1;
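In scan.c, kzalloc() becomes kmalloc() because every scan node's key is now initialized explicitly: keyed node types still go through key_read(), while all other types, including the UBIFS_TRUN_NODE case removed above, get a poisoned key from invalid_key_init(). Any code that later treats such a node as keyed then trips the UBIFS_INVALID_KEY assertions added to tnc.c below. One plausible shape for the helper, assuming the simple key format in which the node type occupies the top bits of the second key word; the real body lives in fs/ubifs/key.h and both constants here are assumptions:

static inline void invalid_key_init(const struct ubifs_info *c,
				    union ubifs_key *key)
{
	key->u32[0] = 0xDEADBEAF;	/* assumed poison value */
	/* stamp an out-of-range type so key_type() >= UBIFS_INVALID_KEY */
	key->u32[1] = UBIFS_INVALID_KEY << UBIFS_S_KEY_BLOCK_BITS;
}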
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5ad..46961c003236 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); 250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
251 251
252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || 252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
253 c->ro_media) { 253 c->ro_mount || c->ro_error) {
254 mutex_unlock(&c->umount_mutex); 254 mutex_unlock(&c->umount_mutex);
255 continue; 255 continue;
256 } 256 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cd5900b85d38..6e11c2975dcf 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
272 return &ui->vfs_inode; 272 return &ui->vfs_inode;
273}; 273};
274 274
275static void ubifs_i_callback(struct rcu_head *head)
276{
277 struct inode *inode = container_of(head, struct inode, i_rcu);
278 struct ubifs_inode *ui = ubifs_inode(inode);
279 INIT_LIST_HEAD(&inode->i_dentry);
280 kmem_cache_free(ubifs_inode_slab, ui);
281}
282
275static void ubifs_destroy_inode(struct inode *inode) 283static void ubifs_destroy_inode(struct inode *inode)
276{ 284{
277 struct ubifs_inode *ui = ubifs_inode(inode); 285 struct ubifs_inode *ui = ubifs_inode(inode);
278 286
279 kfree(ui->data); 287 kfree(ui->data);
280 kmem_cache_free(ubifs_inode_slab, inode); 288 call_rcu(&inode->i_rcu, ubifs_i_callback);
281} 289}
282 290
283/* 291/*
@@ -1137,11 +1145,11 @@ static int check_free_space(struct ubifs_info *c)
1137 */ 1145 */
1138static int mount_ubifs(struct ubifs_info *c) 1146static int mount_ubifs(struct ubifs_info *c)
1139{ 1147{
1140 struct super_block *sb = c->vfs_sb; 1148 int err;
1141 int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
1142 long long x; 1149 long long x;
1143 size_t sz; 1150 size_t sz;
1144 1151
1152 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
1145 err = init_constants_early(c); 1153 err = init_constants_early(c);
1146 if (err) 1154 if (err)
1147 return err; 1155 return err;
@@ -1154,7 +1162,7 @@ static int mount_ubifs(struct ubifs_info *c)
1154 if (err) 1162 if (err)
1155 goto out_free; 1163 goto out_free;
1156 1164
1157 if (c->empty && (mounted_read_only || c->ro_media)) { 1165 if (c->empty && (c->ro_mount || c->ro_media)) {
1158 /* 1166 /*
1159 * This UBI volume is empty, and read-only, or the file system 1167 * This UBI volume is empty, and read-only, or the file system
1160 * is mounted read-only - we cannot format it. 1168 * is mounted read-only - we cannot format it.
@@ -1165,7 +1173,7 @@ static int mount_ubifs(struct ubifs_info *c)
1165 goto out_free; 1173 goto out_free;
1166 } 1174 }
1167 1175
1168 if (c->ro_media && !mounted_read_only) { 1176 if (c->ro_media && !c->ro_mount) {
1169 ubifs_err("cannot mount read-write - read-only media"); 1177 ubifs_err("cannot mount read-write - read-only media");
1170 err = -EROFS; 1178 err = -EROFS;
1171 goto out_free; 1179 goto out_free;
@@ -1185,7 +1193,7 @@ static int mount_ubifs(struct ubifs_info *c)
1185 if (!c->sbuf) 1193 if (!c->sbuf)
1186 goto out_free; 1194 goto out_free;
1187 1195
1188 if (!mounted_read_only) { 1196 if (!c->ro_mount) {
1189 c->ileb_buf = vmalloc(c->leb_size); 1197 c->ileb_buf = vmalloc(c->leb_size);
1190 if (!c->ileb_buf) 1198 if (!c->ileb_buf)
1191 goto out_free; 1199 goto out_free;
@@ -1228,7 +1236,7 @@ static int mount_ubifs(struct ubifs_info *c)
1228 } 1236 }
1229 1237
1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1238 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1231 if (!mounted_read_only) { 1239 if (!c->ro_mount) {
1232 err = alloc_wbufs(c); 1240 err = alloc_wbufs(c);
1233 if (err) 1241 if (err)
1234 goto out_cbuf; 1242 goto out_cbuf;
@@ -1254,12 +1262,12 @@ static int mount_ubifs(struct ubifs_info *c)
1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1262 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1255 ubifs_msg("recovery needed"); 1263 ubifs_msg("recovery needed");
1256 c->need_recovery = 1; 1264 c->need_recovery = 1;
1257 if (!mounted_read_only) { 1265 if (!c->ro_mount) {
1258 err = ubifs_recover_inl_heads(c, c->sbuf); 1266 err = ubifs_recover_inl_heads(c, c->sbuf);
1259 if (err) 1267 if (err)
1260 goto out_master; 1268 goto out_master;
1261 } 1269 }
1262 } else if (!mounted_read_only) { 1270 } else if (!c->ro_mount) {
1263 /* 1271 /*
1264 * Set the "dirty" flag so that if we reboot uncleanly we 1272 * Set the "dirty" flag so that if we reboot uncleanly we
1265 * will notice this immediately on the next mount. 1273 * will notice this immediately on the next mount.
@@ -1270,7 +1278,7 @@ static int mount_ubifs(struct ubifs_info *c)
1270 goto out_master; 1278 goto out_master;
1271 } 1279 }
1272 1280
1273 err = ubifs_lpt_init(c, 1, !mounted_read_only); 1281 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1274 if (err) 1282 if (err)
1275 goto out_lpt; 1283 goto out_lpt;
1276 1284
@@ -1285,11 +1293,11 @@ static int mount_ubifs(struct ubifs_info *c)
1285 /* Calculate 'min_idx_lebs' after journal replay */ 1293 /* Calculate 'min_idx_lebs' after journal replay */
1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1294 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1287 1295
1288 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1296 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1289 if (err) 1297 if (err)
1290 goto out_orphans; 1298 goto out_orphans;
1291 1299
1292 if (!mounted_read_only) { 1300 if (!c->ro_mount) {
1293 int lnum; 1301 int lnum;
1294 1302
1295 err = check_free_space(c); 1303 err = check_free_space(c);
@@ -1351,7 +1359,7 @@ static int mount_ubifs(struct ubifs_info *c)
1351 spin_unlock(&ubifs_infos_lock); 1359 spin_unlock(&ubifs_infos_lock);
1352 1360
1353 if (c->need_recovery) { 1361 if (c->need_recovery) {
1354 if (mounted_read_only) 1362 if (c->ro_mount)
1355 ubifs_msg("recovery deferred"); 1363 ubifs_msg("recovery deferred");
1356 else { 1364 else {
1357 c->need_recovery = 0; 1365 c->need_recovery = 0;
@@ -1378,7 +1386,7 @@ static int mount_ubifs(struct ubifs_info *c)
1378 1386
1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1387 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1388 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1381 if (mounted_read_only) 1389 if (c->ro_mount)
1382 ubifs_msg("mounted read-only"); 1390 ubifs_msg("mounted read-only");
1383 x = (long long)c->main_lebs * c->leb_size; 1391 x = (long long)c->main_lebs * c->leb_size;
1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " 1392 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1640,7 +1648,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1640 } 1648 }
1641 1649
1642 dbg_gen("re-mounted read-write"); 1650 dbg_gen("re-mounted read-write");
1643 c->vfs_sb->s_flags &= ~MS_RDONLY; 1651 c->ro_mount = 0;
1644 c->remounting_rw = 0; 1652 c->remounting_rw = 0;
1645 c->always_chk_crc = 0; 1653 c->always_chk_crc = 0;
1646 err = dbg_check_space_info(c); 1654 err = dbg_check_space_info(c);
@@ -1676,7 +1684,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1676 int i, err; 1684 int i, err;
1677 1685
1678 ubifs_assert(!c->need_recovery); 1686 ubifs_assert(!c->need_recovery);
1679 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 1687 ubifs_assert(!c->ro_mount);
1680 1688
1681 mutex_lock(&c->umount_mutex); 1689 mutex_lock(&c->umount_mutex);
1682 if (c->bgt) { 1690 if (c->bgt) {
@@ -1686,10 +1694,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1686 1694
1687 dbg_save_space_info(c); 1695 dbg_save_space_info(c);
1688 1696
1689 for (i = 0; i < c->jhead_cnt; i++) { 1697 for (i = 0; i < c->jhead_cnt; i++)
1690 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1698 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1691 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1692 }
1693 1699
1694 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1700 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1695 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1701 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1704,6 +1710,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1704 vfree(c->ileb_buf); 1710 vfree(c->ileb_buf);
1705 c->ileb_buf = NULL; 1711 c->ileb_buf = NULL;
1706 ubifs_lpt_free(c, 1); 1712 ubifs_lpt_free(c, 1);
1713 c->ro_mount = 1;
1707 err = dbg_check_space_info(c); 1714 err = dbg_check_space_info(c);
1708 if (err) 1715 if (err)
1709 ubifs_ro_mode(c, err); 1716 ubifs_ro_mode(c, err);
@@ -1735,7 +1742,7 @@ static void ubifs_put_super(struct super_block *sb)
1735 * the mutex is locked. 1742 * the mutex is locked.
1736 */ 1743 */
1737 mutex_lock(&c->umount_mutex); 1744 mutex_lock(&c->umount_mutex);
1738 if (!(c->vfs_sb->s_flags & MS_RDONLY)) { 1745 if (!c->ro_mount) {
1739 /* 1746 /*
1740 * First of all kill the background thread to make sure it does 1747 * First of all kill the background thread to make sure it does
1741 * not interfere with un-mounting and freeing resources. 1748 * not interfere with un-mounting and freeing resources.
@@ -1745,23 +1752,22 @@ static void ubifs_put_super(struct super_block *sb)
1745 c->bgt = NULL; 1752 c->bgt = NULL;
1746 } 1753 }
1747 1754
1748 /* Synchronize write-buffers */
1749 if (c->jheads)
1750 for (i = 0; i < c->jhead_cnt; i++)
1751 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1752
1753 /* 1755 /*
1754 * On fatal errors c->ro_media is set to 1, in which case we do 1756 * On fatal errors c->ro_error is set to 1, in which case we do
1755 * not write the master node. 1757 * not write the master node.
1756 */ 1758 */
1757 if (!c->ro_media) { 1759 if (!c->ro_error) {
1760 int err;
1761
1762 /* Synchronize write-buffers */
1763 for (i = 0; i < c->jhead_cnt; i++)
1764 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1765
1758 /* 1766 /*
1759 * We are being cleanly unmounted which means the 1767 * We are being cleanly unmounted which means the
1760 * orphans were killed - indicate this in the master 1768 * orphans were killed - indicate this in the master
1761 * node. Also save the reserved GC LEB number. 1769 * node. Also save the reserved GC LEB number.
1762 */ 1770 */
1763 int err;
1764
1765 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1771 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1766 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1772 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1767 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1773 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1774,6 +1780,10 @@ static void ubifs_put_super(struct super_block *sb)
1774 */ 1780 */
1775 ubifs_err("failed to write master node, " 1781 ubifs_err("failed to write master node, "
1776 "error %d", err); 1782 "error %d", err);
1783 } else {
1784 for (i = 0; i < c->jhead_cnt; i++)
1785 /* Make sure write-buffer timers are canceled */
1786 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1777 } 1787 }
1778 } 1788 }
1779 1789
@@ -1797,17 +1807,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1797 return err; 1807 return err;
1798 } 1808 }
1799 1809
1800 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1810 if (c->ro_mount && !(*flags & MS_RDONLY)) {
1811 if (c->ro_error) {
1812 ubifs_msg("cannot re-mount R/W due to prior errors");
1813 return -EROFS;
1814 }
1801 if (c->ro_media) { 1815 if (c->ro_media) {
1802 ubifs_msg("cannot re-mount due to prior errors"); 1816 ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
1803 return -EROFS; 1817 return -EROFS;
1804 } 1818 }
1805 err = ubifs_remount_rw(c); 1819 err = ubifs_remount_rw(c);
1806 if (err) 1820 if (err)
1807 return err; 1821 return err;
1808 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1822 } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
1809 if (c->ro_media) { 1823 if (c->ro_error) {
1810 ubifs_msg("cannot re-mount due to prior errors"); 1824 ubifs_msg("cannot re-mount R/O due to prior errors");
1811 return -EROFS; 1825 return -EROFS;
1812 } 1826 }
1813 ubifs_remount_ro(c); 1827 ubifs_remount_ro(c);
@@ -2032,8 +2046,8 @@ static int sb_test(struct super_block *sb, void *data)
2032 return c->vi.cdev == *dev; 2046 return c->vi.cdev == *dev;
2033} 2047}
2034 2048
2035static int ubifs_get_sb(struct file_system_type *fs_type, int flags, 2049static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
2036 const char *name, void *data, struct vfsmount *mnt) 2050 const char *name, void *data)
2037{ 2051{
2038 struct ubi_volume_desc *ubi; 2052 struct ubi_volume_desc *ubi;
2039 struct ubi_volume_info vi; 2053 struct ubi_volume_info vi;
@@ -2049,9 +2063,9 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2049 */ 2063 */
2050 ubi = open_ubi(name, UBI_READONLY); 2064 ubi = open_ubi(name, UBI_READONLY);
2051 if (IS_ERR(ubi)) { 2065 if (IS_ERR(ubi)) {
2052 ubifs_err("cannot open \"%s\", error %d", 2066 dbg_err("cannot open \"%s\", error %d",
2053 name, (int)PTR_ERR(ubi)); 2067 name, (int)PTR_ERR(ubi));
2054 return PTR_ERR(ubi); 2068 return ERR_CAST(ubi);
2055 } 2069 }
2056 ubi_get_volume_info(ubi, &vi); 2070 ubi_get_volume_info(ubi, &vi);
2057 2071
@@ -2064,9 +2078,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2064 } 2078 }
2065 2079
2066 if (sb->s_root) { 2080 if (sb->s_root) {
2081 struct ubifs_info *c1 = sb->s_fs_info;
2082
2067 /* A new mount point for already mounted UBIFS */ 2083 /* A new mount point for already mounted UBIFS */
2068 dbg_gen("this ubi volume is already mounted"); 2084 dbg_gen("this ubi volume is already mounted");
2069 if ((flags ^ sb->s_flags) & MS_RDONLY) { 2085 if (!!(flags & MS_RDONLY) != c1->ro_mount) {
2070 err = -EBUSY; 2086 err = -EBUSY;
2071 goto out_deact; 2087 goto out_deact;
2072 } 2088 }
@@ -2087,20 +2103,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2087 /* 'fill_super()' opens ubi again so we must close it here */ 2103 /* 'fill_super()' opens ubi again so we must close it here */
2088 ubi_close_volume(ubi); 2104 ubi_close_volume(ubi);
2089 2105
2090 simple_set_mnt(mnt, sb); 2106 return dget(sb->s_root);
2091 return 0;
2092 2107
2093out_deact: 2108out_deact:
2094 deactivate_locked_super(sb); 2109 deactivate_locked_super(sb);
2095out_close: 2110out_close:
2096 ubi_close_volume(ubi); 2111 ubi_close_volume(ubi);
2097 return err; 2112 return ERR_PTR(err);
2098} 2113}
2099 2114
2100static struct file_system_type ubifs_fs_type = { 2115static struct file_system_type ubifs_fs_type = {
2101 .name = "ubifs", 2116 .name = "ubifs",
2102 .owner = THIS_MODULE, 2117 .owner = THIS_MODULE,
2103 .get_sb = ubifs_get_sb, 2118 .mount = ubifs_mount,
2104 .kill_sb = kill_anon_super, 2119 .kill_sb = kill_anon_super,
2105}; 2120};
2106 2121
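Two structural conversions land in super.c besides the R/O flag work. First, inode freeing becomes RCU-deferred: lockless path walk (RCU-walk) may still dereference an inode after its last reference is dropped, so ubifs_destroy_inode() now hands the final kmem_cache_free() to ubifs_i_callback() via call_rcu(), guaranteeing a full grace period before the slab object can be reused. Second, the .get_sb/simple_set_mnt() mount interface is replaced by the .mount method returning a dentry. The RCU pattern in its canonical generic form, with hypothetical "foofs" names:

static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* RCU-walk may still be reading this inode; free it only after
	 * every CPU has passed through a quiescent state */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}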
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e5..ad9cf0133622 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1177 unsigned long time = get_seconds(); 1177 unsigned long time = get_seconds();
1178 1178
1179 dbg_tnc("search key %s", DBGKEY(key)); 1179 dbg_tnc("search key %s", DBGKEY(key));
1180 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1180 1181
1181 znode = c->zroot.znode; 1182 znode = c->zroot.znode;
1182 if (unlikely(!znode)) { 1183 if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
2966 * 2967 *
2967 * This function searches an indexing node by its first key @key and its 2968 * This function searches an indexing node by its first key @key and its
2968 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing 2969 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2969 * nodes it traverses to TNC. This function is called fro indexing nodes which 2970 * nodes it traverses to TNC. This function is called for indexing nodes which
2970 * were found on the media by scanning, for example when garbage-collecting or 2971 * were found on the media by scanning, for example when garbage-collecting or
2971 * when doing in-the-gaps commit. This means that the indexing node which is 2972 * when doing in-the-gaps commit. This means that the indexing node which is
2972 * looked for does not have to have exactly the same leftmost key @key, because 2973 * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2988 struct ubifs_znode *znode, *zn; 2989 struct ubifs_znode *znode, *zn;
2989 int n, nn; 2990 int n, nn;
2990 2991
2992 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
2993
2991 /* 2994 /*
2992 * The arguments have probably been read off flash, so don't assume 2995 * The arguments have probably been read off flash, so don't assume
2993 * they are valid. 2996 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0c9876b396dd..381d6b207a52 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
119 * in TNC. However, when replaying, it is handy to introduce fake "truncation" 119 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
120 * keys for truncation nodes because the code becomes simpler. So we define 120 * keys for truncation nodes because the code becomes simpler. So we define
121 * %UBIFS_TRUN_KEY type. 121 * %UBIFS_TRUN_KEY type.
122 *
 123 * But otherwise, outside the journal replay scope, the truncation keys are
124 * invalid.
122 */ 125 */
123#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT 126#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
127#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
124 128
125/* 129/*
126 * How much a directory entry/extended attribute entry adds to the parent/host 130 * How much a directory entry/extended attribute entry adds to the parent/host
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
1028 * @max_leb_cnt: maximum count of logical eraseblocks 1032 * @max_leb_cnt: maximum count of logical eraseblocks
1029 * @old_leb_cnt: count of logical eraseblocks before re-size 1033 * @old_leb_cnt: count of logical eraseblocks before re-size
1030 * @ro_media: the underlying UBI volume is read-only 1034 * @ro_media: the underlying UBI volume is read-only
1035 * @ro_mount: the file-system was mounted as read-only
1036 * @ro_error: UBIFS switched to R/O mode because an error happened
1031 * 1037 *
1032 * @dirty_pg_cnt: number of dirty pages (not used) 1038 * @dirty_pg_cnt: number of dirty pages (not used)
1033 * @dirty_zn_cnt: number of dirty znodes 1039 * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
1168 * @replay_sqnum: sequence number of node currently being replayed 1174 * @replay_sqnum: sequence number of node currently being replayed
1169 * @need_recovery: file-system needs recovery 1175 * @need_recovery: file-system needs recovery
1170 * @replaying: set to %1 during journal replay 1176 * @replaying: set to %1 during journal replay
1171 * @unclean_leb_list: LEBs to recover when mounting ro to rw 1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1172 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw 1178 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode
1173 * @size_tree: inode size information for recovery 1181 * @size_tree: inode size information for recovery
1174 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) 1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1175 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1176 * @mount_opts: UBIFS-specific mount options 1185 * @mount_opts: UBIFS-specific mount options
1177 * 1186 *
1178 * @dbg: debugging-related information 1187 * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
1268 int leb_cnt; 1277 int leb_cnt;
1269 int max_leb_cnt; 1278 int max_leb_cnt;
1270 int old_leb_cnt; 1279 int old_leb_cnt;
1271 int ro_media; 1280 unsigned int ro_media:1;
1281 unsigned int ro_mount:1;
1282 unsigned int ro_error:1;
1272 1283
1273 atomic_long_t dirty_pg_cnt; 1284 atomic_long_t dirty_pg_cnt;
1274 atomic_long_t dirty_zn_cnt; 1285 atomic_long_t dirty_zn_cnt;
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4cee..306ee39ef2c3 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
157 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
158 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
159 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
160 } else {
161 udf_add_free_space(sb, sbi->s_partition, 1);
162 } 160 }
163 } 161 }
162 udf_add_free_space(sb, sbi->s_partition, count);
164 mark_buffer_dirty(bh); 163 mark_buffer_dirty(bh);
165 if (overflow) { 164 if (overflow) {
166 block += count; 165 block += count;
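The balloc.c hunk batches the free-space accounting: instead of calling udf_add_free_space(sb, partition, 1) once per cleared bit inside the loop, the whole range's count is credited once after the loop. This means fewer LVID counter updates, and the accounting no longer silently skips blocks whose bits were already set (a corruption case that is only logged). For reference, udf_add_free_space() amounts to roughly the following, a sketch assuming the LVID free-space table used elsewhere in fs/udf:

static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
{
	struct udf_sb_info *sbi = UDF_SB(sb);
	struct logicalVolIntegrityDesc *lvid;

	if (!sbi->s_lvid_bh)
		return;

	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
	le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
	udf_updated_lvid(sb);
}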
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf50225..eb8bfe2b89a5 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35 34
36#include "udf_i.h" 35#include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
190 struct inode *dir = filp->f_path.dentry->d_inode; 189 struct inode *dir = filp->f_path.dentry->d_inode;
191 int result; 190 int result;
192 191
193 lock_kernel();
194
195 if (filp->f_pos == 0) { 192 if (filp->f_pos == 0) {
196 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { 193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
197 unlock_kernel();
198 return 0; 194 return 0;
199 } 195 }
200 filp->f_pos++; 196 filp->f_pos++;
201 } 197 }
202 198
203 result = do_udf_readdir(dir, filp, filldir, dirent); 199 result = do_udf_readdir(dir, filp, filldir, dirent);
204 unlock_kernel();
205 return result; 200 return result;
206} 201}
207 202
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c5..89c78486cbbe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
38#include <linux/aio.h> 37#include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
114 size_t count = iocb->ki_left; 113 size_t count = iocb->ki_left;
115 struct udf_inode_info *iinfo = UDF_I(inode); 114 struct udf_inode_info *iinfo = UDF_I(inode);
116 115
116 down_write(&iinfo->i_data_sem);
117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
118 if (file->f_flags & O_APPEND) 118 if (file->f_flags & O_APPEND)
119 pos = inode->i_size; 119 pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 udf_expand_file_adinicb(inode, pos + count, &err);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem);
129 return err; 130 return err;
130 } 131 }
131 } else { 132 } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
135 iinfo->i_lenAlloc = inode->i_size; 136 iinfo->i_lenAlloc = inode->i_size;
136 } 137 }
137 } 138 }
139 up_write(&iinfo->i_data_sem);
138 140
139 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 141 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
140 if (retval > 0) 142 if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
149 long old_block, new_block; 151 long old_block, new_block;
150 int result = -EINVAL; 152 int result = -EINVAL;
151 153
152 lock_kernel();
153
154 if (file_permission(filp, MAY_READ) != 0) { 154 if (file_permission(filp, MAY_READ) != 0) {
155 udf_debug("no permission to access inode %lu\n", inode->i_ino); 155 udf_debug("no permission to access inode %lu\n", inode->i_ino);
156 result = -EPERM; 156 result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
196 } 196 }
197 197
198out: 198out:
199 unlock_kernel();
200 return result; 199 return result;
201} 200}
202 201
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
204{ 203{
205 if (filp->f_mode & FMODE_WRITE) { 204 if (filp->f_mode & FMODE_WRITE) {
206 mutex_lock(&inode->i_mutex); 205 mutex_lock(&inode->i_mutex);
207 lock_kernel(); 206 down_write(&UDF_I(inode)->i_data_sem);
208 udf_discard_prealloc(inode); 207 udf_discard_prealloc(inode);
209 udf_truncate_tail_extent(inode); 208 udf_truncate_tail_extent(inode);
210 unlock_kernel(); 209 up_write(&UDF_I(inode)->i_data_sem);
211 mutex_unlock(&inode->i_mutex); 210 mutex_unlock(&inode->i_mutex);
212 } 211 }
213 return 0; 212 return 0;
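The udf hunks from here on replace the big kernel lock with per-inode locking: struct udf_inode_info gains an rw_semaphore, i_data_sem, guarding the in-ICB data, the extent information and i_alloc_type. Paths that may rewrite them, such as udf_file_aio_write() expanding in-ICB data above, take it exclusive; pure lookups, like udf_block_map() in the inode.c hunks below, take it shared. A sketch of a writer under the new rule; udf_shrink_example() and its field update are illustrative only:

static void udf_shrink_example(struct inode *inode, loff_t new_size)
{
	struct udf_inode_info *iinfo = UDF_I(inode);

	/* exclusive: we may rewrite in-ICB data or the extent list */
	down_write(&iinfo->i_data_sem);
	iinfo->i_lenAlloc = new_size;
	up_write(&iinfo->i_data_sem);
	mark_inode_dirty(inode);
}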
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc3..6fb7e0adcda0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 mutex_lock(&sbi->s_alloc_mutex);
96 if (sbi->s_lvid_bh) { 95 if (sbi->s_lvid_bh) {
97 struct logicalVolIntegrityDesc *lvid = 96 struct logicalVolIntegrityDescImpUse *lvidiu;
98 (struct logicalVolIntegrityDesc *) 97
99 sbi->s_lvid_bh->b_data; 98 iinfo->i_unique = lvid_get_unique_id(sb);
100 struct logicalVolIntegrityDescImpUse *lvidiu = 99 mutex_lock(&sbi->s_alloc_mutex);
101 udf_sb_lvidiu(sbi); 100 lvidiu = udf_sb_lvidiu(sbi);
102 struct logicalVolHeaderDesc *lvhd;
103 uint64_t uniqueID;
104 lvhd = (struct logicalVolHeaderDesc *)
105 (lvid->logicalVolContentsUse);
106 if (S_ISDIR(mode)) 101 if (S_ISDIR(mode))
107 le32_add_cpu(&lvidiu->numDirs, 1); 102 le32_add_cpu(&lvidiu->numDirs, 1);
108 else 103 else
109 le32_add_cpu(&lvidiu->numFiles, 1); 104 le32_add_cpu(&lvidiu->numFiles, 1);
110 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
111 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
112 uniqueID += 16;
113 lvhd->uniqueID = cpu_to_le64(uniqueID);
114 udf_updated_lvid(sb); 105 udf_updated_lvid(sb);
106 mutex_unlock(&sbi->s_alloc_mutex);
115 } 107 }
116 mutex_unlock(&sbi->s_alloc_mutex);
117 108
118 inode_init_owner(inode, dir, mode); 109 inode_init_owner(inode, dir, mode);
119 110
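udf_new_inode() loses its open-coded uniqueID sequence (load lvhd->uniqueID, increment, skip values whose low 32 bits are zero, store back) in favor of a shared lvid_get_unique_id() helper; udf_symlink() and udf_link() below drop identical copies. Reconstructed from the deleted logic, the helper must look roughly like this; the actual body lives in fs/udf/super.c and is an assumption here:

u64 lvid_get_unique_id(struct super_block *sb)
{
	struct udf_sb_info *sbi = UDF_SB(sb);
	struct buffer_head *bh = sbi->s_lvid_bh;
	struct logicalVolIntegrityDesc *lvid;
	struct logicalVolHeaderDesc *lvhd;
	u64 uniqueID, ret;

	if (!bh)
		return 0;

	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
	lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;

	mutex_lock(&sbi->s_alloc_mutex);
	ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
	if (!(++uniqueID & 0x00000000FFFFFFFFUL))
		uniqueID += 16;	/* IDs with zero low 32 bits are reserved */
	lvhd->uniqueID = cpu_to_le64(uniqueID);
	mutex_unlock(&sbi->s_alloc_mutex);
	mark_buffer_dirty(bh);

	return ret;
}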
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2dd..c6a2e782b97b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
31 31
32#include "udfdecl.h" 32#include "udfdecl.h"
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/module.h> 34#include <linux/module.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
51static mode_t udf_convert_permissions(struct fileEntry *); 50static mode_t udf_convert_permissions(struct fileEntry *);
52static int udf_update_inode(struct inode *, int); 51static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 52static void udf_fill_inode(struct inode *, struct buffer_head *);
53static int udf_sync_inode(struct inode *inode);
54static int udf_alloc_i_data(struct inode *inode, size_t size); 54static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
79 want_delete = 1; 79 want_delete = 1;
80 inode->i_size = 0; 80 inode->i_size = 0;
81 udf_truncate(inode); 81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode)); 82 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 } 83 }
86 invalidate_inode_buffers(inode); 84 invalidate_inode_buffers(inode);
87 end_writeback(inode); 85 end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
97 kfree(iinfo->i_ext.i_data); 95 kfree(iinfo->i_ext.i_data);
98 iinfo->i_ext.i_data = NULL; 96 iinfo->i_ext.i_data = NULL;
99 if (want_delete) { 97 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode); 98 udf_free_inode(inode);
102 unlock_kernel();
103 } 99 }
104} 100}
105 101
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
302 err = -EIO; 298 err = -EIO;
303 new = 0; 299 new = 0;
304 bh = NULL; 300 bh = NULL;
305
306 lock_kernel();
307
308 iinfo = UDF_I(inode); 301 iinfo = UDF_I(inode);
302
303 down_write(&iinfo->i_data_sem);
309 if (block == iinfo->i_next_alloc_block + 1) { 304 if (block == iinfo->i_next_alloc_block + 1) {
310 iinfo->i_next_alloc_block++; 305 iinfo->i_next_alloc_block++;
311 iinfo->i_next_alloc_goal++; 306 iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
324 map_bh(bh_result, inode->i_sb, phys); 319 map_bh(bh_result, inode->i_sb, phys);
325 320
326abort: 321abort:
327 unlock_kernel(); 322 up_write(&iinfo->i_data_sem);
328 return err; 323 return err;
329} 324}
330 325
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
1022 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1023 return; 1018 return;
1024 1019
1025 lock_kernel();
1026 iinfo = UDF_I(inode); 1020 iinfo = UDF_I(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1022 down_write(&iinfo->i_data_sem);
1028 if (inode->i_sb->s_blocksize < 1023 if (inode->i_sb->s_blocksize <
1029 (udf_file_entry_alloc_offset(inode) + 1024 (udf_file_entry_alloc_offset(inode) +
1030 inode->i_size)) { 1025 inode->i_size)) {
1031 udf_expand_file_adinicb(inode, inode->i_size, &err); 1026 udf_expand_file_adinicb(inode, inode->i_size, &err);
1032 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1033 inode->i_size = iinfo->i_lenAlloc; 1028 inode->i_size = iinfo->i_lenAlloc;
1034 unlock_kernel(); 1029 up_write(&iinfo->i_data_sem);
1035 return; 1030 return;
1036 } else 1031 } else
1037 udf_truncate_extents(inode); 1032 udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
1042 offset - udf_file_entry_alloc_offset(inode)); 1037 offset - udf_file_entry_alloc_offset(inode));
1043 iinfo->i_lenAlloc = inode->i_size; 1038 iinfo->i_lenAlloc = inode->i_size;
1044 } 1039 }
1040 up_write(&iinfo->i_data_sem);
1045 } else { 1041 } else {
1046 block_truncate_page(inode->i_mapping, inode->i_size, 1042 block_truncate_page(inode->i_mapping, inode->i_size,
1047 udf_get_block); 1043 udf_get_block);
1044 down_write(&iinfo->i_data_sem);
1048 udf_truncate_extents(inode); 1045 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem);
1049 } 1047 }
1050 1048
1051 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
1053 udf_sync_inode(inode); 1051 udf_sync_inode(inode);
1054 else 1052 else
1055 mark_inode_dirty(inode); 1053 mark_inode_dirty(inode);
1056 unlock_kernel();
1057} 1054}
1058 1055
1059static void __udf_read_inode(struct inode *inode) 1056static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1202 return; 1199 return;
1203 } 1200 }
1204 1201
1202 read_lock(&sbi->s_cred_lock);
1205 inode->i_uid = le32_to_cpu(fe->uid); 1203 inode->i_uid = le32_to_cpu(fe->uid);
1206 if (inode->i_uid == -1 || 1204 if (inode->i_uid == -1 ||
1207 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1205 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1214 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1212 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1215 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1213 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1216 1214
1217 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1218 if (!inode->i_nlink)
1219 inode->i_nlink = 1;
1220
1221 inode->i_size = le64_to_cpu(fe->informationLength);
1222 iinfo->i_lenExtents = inode->i_size;
1223
1224 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && 1215 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1225 sbi->s_fmode != UDF_INVALID_MODE) 1216 sbi->s_fmode != UDF_INVALID_MODE)
1226 inode->i_mode = sbi->s_fmode; 1217 inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1230 else 1221 else
1231 inode->i_mode = udf_convert_permissions(fe); 1222 inode->i_mode = udf_convert_permissions(fe);
1232 inode->i_mode &= ~sbi->s_umask; 1223 inode->i_mode &= ~sbi->s_umask;
1224 read_unlock(&sbi->s_cred_lock);
1225
1226 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1227 if (!inode->i_nlink)
1228 inode->i_nlink = 1;
1229
1230 inode->i_size = le64_to_cpu(fe->informationLength);
1231 iinfo->i_lenExtents = inode->i_size;
1233 1232
1234 if (iinfo->i_efe == 0) { 1233 if (iinfo->i_efe == 0) {
1235 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1234 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 1372
1374int udf_write_inode(struct inode *inode, struct writeback_control *wbc) 1373int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1375{ 1374{
1376 int ret; 1375 return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1377
1378 lock_kernel();
1379 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1380 unlock_kernel();
1381
1382 return ret;
1383} 1376}
1384 1377
1385int udf_sync_inode(struct inode *inode) 1378static int udf_sync_inode(struct inode *inode)
1386{ 1379{
1387 return udf_update_inode(inode, 1); 1380 return udf_update_inode(inode, 1);
1388} 1381}
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2048 struct extent_position epos = {}; 2041 struct extent_position epos = {};
2049 int ret; 2042 int ret;
2050 2043
2051 lock_kernel(); 2044 down_read(&UDF_I(inode)->i_data_sem);
2052 2045
2053 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2046 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2054 (EXT_RECORDED_ALLOCATED >> 30)) 2047 (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2056 else 2049 else
2057 ret = 0; 2050 ret = 0;
2058 2051
2059 unlock_kernel(); 2052 up_read(&UDF_I(inode)->i_data_sem);
2060 brelse(epos.bh); 2053 brelse(epos.bh);
2061 2054
2062 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) 2055 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..2be0f9eb86d2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
228 } 227 }
229 228
230 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && 229 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
231 isdotdot) { 230 isdotdot)
232 brelse(epos.bh); 231 goto out_ok;
233 return fi;
234 }
235 232
236 if (!lfi) 233 if (!lfi)
237 continue; 234 continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
263 if (dentry->d_name.len > UDF_NAME_LEN - 2) 260 if (dentry->d_name.len > UDF_NAME_LEN - 2)
264 return ERR_PTR(-ENAMETOOLONG); 261 return ERR_PTR(-ENAMETOOLONG);
265 262
266 lock_kernel();
267#ifdef UDF_RECOVERY 263#ifdef UDF_RECOVERY
268 /* temporary shorthand for specifying files by inode number */ 264 /* temporary shorthand for specifying files by inode number */
269 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 265 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
275 }; 271 };
276 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
277 if (!inode) { 273 if (!inode) {
278 unlock_kernel();
279 return ERR_PTR(-EACCES); 274 return ERR_PTR(-EACCES);
280 } 275 }
281 } else 276 } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 loc = lelb_to_cpu(cfi.icb.extLocation); 286 loc = lelb_to_cpu(cfi.icb.extLocation);
292 inode = udf_iget(dir->i_sb, &loc); 287 inode = udf_iget(dir->i_sb, &loc);
293 if (!inode) { 288 if (!inode) {
294 unlock_kernel();
295 return ERR_PTR(-EACCES); 289 return ERR_PTR(-EACCES);
296 } 290 }
297 } 291 }
298 unlock_kernel();
299 292
300 return d_splice_alias(inode, dentry); 293 return d_splice_alias(inode, dentry);
301} 294}
@@ -476,15 +469,19 @@ add:
476 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 469 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
477 if (!fibh->ebh) 470 if (!fibh->ebh)
478 goto out_err; 471 goto out_err;
472 /* Extents could have been merged, invalidate our position */
473 brelse(epos.bh);
474 epos.bh = NULL;
475 epos.block = dinfo->i_location;
476 epos.offset = udf_file_entry_alloc_offset(dir);
479 477
480 if (!fibh->soffset) { 478 if (!fibh->soffset) {
481 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 479 /* Find the freshly allocated block */
482 (EXT_RECORDED_ALLOCATED >> 30)) { 480 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
483 block = eloc.logicalBlockNum + ((elen - 1) >> 481 (EXT_RECORDED_ALLOCATED >> 30))
482 ;
483 block = eloc.logicalBlockNum + ((elen - 1) >>
484 dir->i_sb->s_blocksize_bits); 484 dir->i_sb->s_blocksize_bits);
485 } else
486 block++;
487
488 brelse(fibh->sbh); 485 brelse(fibh->sbh);
489 fibh->sbh = fibh->ebh; 486 fibh->sbh = fibh->ebh;
490 fi = (struct fileIdentDesc *)(fibh->sbh->b_data); 487 fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
562 int err; 559 int err;
563 struct udf_inode_info *iinfo; 560 struct udf_inode_info *iinfo;
564 561
565 lock_kernel();
566 inode = udf_new_inode(dir, mode, &err); 562 inode = udf_new_inode(dir, mode, &err);
567 if (!inode) { 563 if (!inode) {
568 unlock_kernel();
569 return err; 564 return err;
570 } 565 }
571 566
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
583 inode->i_nlink--; 578 inode->i_nlink--;
584 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
585 iput(inode); 580 iput(inode);
586 unlock_kernel();
587 return err; 581 return err;
588 } 582 }
589 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 583 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
596 if (fibh.sbh != fibh.ebh) 590 if (fibh.sbh != fibh.ebh)
597 brelse(fibh.ebh); 591 brelse(fibh.ebh);
598 brelse(fibh.sbh); 592 brelse(fibh.sbh);
599 unlock_kernel();
600 d_instantiate(dentry, inode); 593 d_instantiate(dentry, inode);
601 594
602 return 0; 595 return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
614 if (!old_valid_dev(rdev)) 607 if (!old_valid_dev(rdev))
615 return -EINVAL; 608 return -EINVAL;
616 609
617 lock_kernel();
618 err = -EIO; 610 err = -EIO;
619 inode = udf_new_inode(dir, mode, &err); 611 inode = udf_new_inode(dir, mode, &err);
620 if (!inode) 612 if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 inode->i_nlink--; 619 inode->i_nlink--;
628 mark_inode_dirty(inode); 620 mark_inode_dirty(inode);
629 iput(inode); 621 iput(inode);
630 unlock_kernel();
631 return err; 622 return err;
632 } 623 }
633 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 624 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
646 err = 0; 637 err = 0;
647 638
648out: 639out:
649 unlock_kernel();
650 return err; 640 return err;
651} 641}
652 642
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
659 struct udf_inode_info *dinfo = UDF_I(dir); 649 struct udf_inode_info *dinfo = UDF_I(dir);
660 struct udf_inode_info *iinfo; 650 struct udf_inode_info *iinfo;
661 651
662 lock_kernel();
663 err = -EMLINK; 652 err = -EMLINK;
664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
665 goto out; 654 goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
712 err = 0; 701 err = 0;
713 702
714out: 703out:
715 unlock_kernel();
716 return err; 704 return err;
717} 705}
718 706
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
794 struct kernel_lb_addr tloc; 782 struct kernel_lb_addr tloc;
795 783
796 retval = -ENOENT; 784 retval = -ENOENT;
797 lock_kernel();
798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 785 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
799 if (!fi) 786 if (!fi)
800 goto out; 787 goto out;
@@ -826,7 +813,6 @@ end_rmdir:
826 brelse(fibh.sbh); 813 brelse(fibh.sbh);
827 814
828out: 815out:
829 unlock_kernel();
830 return retval; 816 return retval;
831} 817}
832 818
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 struct kernel_lb_addr tloc; 826 struct kernel_lb_addr tloc;
841 827
842 retval = -ENOENT; 828 retval = -ENOENT;
843 lock_kernel();
844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 829 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
845 if (!fi) 830 if (!fi)
846 goto out; 831 goto out;
@@ -870,7 +855,6 @@ end_unlink:
870 brelse(fibh.sbh); 855 brelse(fibh.sbh);
871 856
872out: 857out:
873 unlock_kernel();
874 return retval; 858 return retval;
875} 859}
876 860
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
890 int block; 874 int block;
891 unsigned char *name = NULL; 875 unsigned char *name = NULL;
892 int namelen; 876 int namelen;
893 struct buffer_head *bh;
894 struct udf_inode_info *iinfo; 877 struct udf_inode_info *iinfo;
878 struct super_block *sb = dir->i_sb;
895 879
896 lock_kernel();
897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 880 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
898 if (!inode) 881 if (!inode)
899 goto out; 882 goto out;
900 883
884 iinfo = UDF_I(inode);
885 down_write(&iinfo->i_data_sem);
901 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 886 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
902 if (!name) { 887 if (!name) {
903 err = -ENOMEM; 888 err = -ENOMEM;
904 goto out_no_entry; 889 goto out_no_entry;
905 } 890 }
906 891
907 iinfo = UDF_I(inode);
908 inode->i_data.a_ops = &udf_symlink_aops; 892 inode->i_data.a_ops = &udf_symlink_aops;
909 inode->i_op = &udf_symlink_inode_operations; 893 inode->i_op = &udf_symlink_inode_operations;
910 894
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 struct kernel_lb_addr eloc; 896 struct kernel_lb_addr eloc;
913 uint32_t bsize; 897 uint32_t bsize;
914 898
915 block = udf_new_block(inode->i_sb, inode, 899 block = udf_new_block(sb, inode,
916 iinfo->i_location.partitionReferenceNum, 900 iinfo->i_location.partitionReferenceNum,
917 iinfo->i_location.logicalBlockNum, &err); 901 iinfo->i_location.logicalBlockNum, &err);
918 if (!block) 902 if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 eloc.logicalBlockNum = block; 907 eloc.logicalBlockNum = block;
924 eloc.partitionReferenceNum = 908 eloc.partitionReferenceNum =
925 iinfo->i_location.partitionReferenceNum; 909 iinfo->i_location.partitionReferenceNum;
926 bsize = inode->i_sb->s_blocksize; 910 bsize = sb->s_blocksize;
927 iinfo->i_lenExtents = bsize; 911 iinfo->i_lenExtents = bsize;
928 udf_add_aext(inode, &epos, &eloc, bsize, 0); 912 udf_add_aext(inode, &epos, &eloc, bsize, 0);
929 brelse(epos.bh); 913 brelse(epos.bh);
930 914
931 block = udf_get_pblock(inode->i_sb, block, 915 block = udf_get_pblock(sb, block,
932 iinfo->i_location.partitionReferenceNum, 916 iinfo->i_location.partitionReferenceNum,
933 0); 917 0);
934 epos.bh = udf_tgetblk(inode->i_sb, block); 918 epos.bh = udf_tgetblk(sb, block);
935 lock_buffer(epos.bh); 919 lock_buffer(epos.bh);
936 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 920 memset(epos.bh->b_data, 0x00, bsize);
937 set_buffer_uptodate(epos.bh); 921 set_buffer_uptodate(epos.bh);
938 unlock_buffer(epos.bh); 922 unlock_buffer(epos.bh);
939 mark_buffer_dirty_inode(epos.bh, inode); 923 mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
941 } else 925 } else
942 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 926 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
943 927
944 eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); 928 eoffset = sb->s_blocksize - udf_ext0_offset(inode);
945 pc = (struct pathComponent *)ea; 929 pc = (struct pathComponent *)ea;
946 930
947 if (*symname == '/') { 931 if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
981 } 965 }
982 966
983 if (pc->componentType == 5) { 967 if (pc->componentType == 5) {
984 namelen = udf_put_filename(inode->i_sb, compstart, name, 968 namelen = udf_put_filename(sb, compstart, name,
985 symname - compstart); 969 symname - compstart);
986 if (!namelen) 970 if (!namelen)
987 goto out_no_entry; 971 goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 999 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1016 if (!fi) 1000 if (!fi)
1017 goto out_no_entry; 1001 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1002 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); 1003 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1004 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 if (bh) {
1022 struct logicalVolIntegrityDesc *lvid =
1023 (struct logicalVolIntegrityDesc *)bh->b_data;
1024 struct logicalVolHeaderDesc *lvhd;
1025 uint64_t uniqueID;
1026 lvhd = (struct logicalVolHeaderDesc *)
1027 lvid->logicalVolContentsUse;
1028 uniqueID = le64_to_cpu(lvhd->uniqueID);
1029 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1005 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1030 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1006 cpu_to_le32(lvid_get_unique_id(sb));
1031 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1032 uniqueID += 16;
1033 lvhd->uniqueID = cpu_to_le64(uniqueID);
1034 mark_buffer_dirty(bh);
1035 } 1007 }
1036 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1008 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1037 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1009 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1038 mark_inode_dirty(dir); 1010 mark_inode_dirty(dir);
1011 up_write(&iinfo->i_data_sem);
1039 if (fibh.sbh != fibh.ebh) 1012 if (fibh.sbh != fibh.ebh)
1040 brelse(fibh.ebh); 1013 brelse(fibh.ebh);
1041 brelse(fibh.sbh); 1014 brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1044 1017
1045out: 1018out:
1046 kfree(name); 1019 kfree(name);
1047 unlock_kernel();
1048 return err; 1020 return err;
1049 1021
1050out_no_entry: 1022out_no_entry:
1023 up_write(&iinfo->i_data_sem);
1051 inode_dec_link_count(inode); 1024 inode_dec_link_count(inode);
1052 iput(inode); 1025 iput(inode);
1053 goto out; 1026 goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1060 struct udf_fileident_bh fibh; 1033 struct udf_fileident_bh fibh;
1061 struct fileIdentDesc cfi, *fi; 1034 struct fileIdentDesc cfi, *fi;
1062 int err; 1035 int err;
1063 struct buffer_head *bh;
1064 1036
1065 lock_kernel();
1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1067 unlock_kernel();
1068 return -EMLINK; 1038 return -EMLINK;
1069 } 1039 }
1070 1040
1071 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1072 if (!fi) { 1042 if (!fi) {
1073 unlock_kernel();
1074 return err; 1043 return err;
1075 } 1044 }
1076 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1045 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
1077 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); 1046 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
1078 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1047 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1079 if (bh) {
1080 struct logicalVolIntegrityDesc *lvid =
1081 (struct logicalVolIntegrityDesc *)bh->b_data;
1082 struct logicalVolHeaderDesc *lvhd;
1083 uint64_t uniqueID;
1084 lvhd = (struct logicalVolHeaderDesc *)
1085 (lvid->logicalVolContentsUse);
1086 uniqueID = le64_to_cpu(lvhd->uniqueID);
1087 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1048 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1088 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1049 cpu_to_le32(lvid_get_unique_id(inode->i_sb));
1089 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1090 uniqueID += 16;
1091 lvhd->uniqueID = cpu_to_le64(uniqueID);
1092 mark_buffer_dirty(bh);
1093 } 1050 }
1094 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1051 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1095 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1052 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1101,9 +1058,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1101 inc_nlink(inode); 1058 inc_nlink(inode);
1102 inode->i_ctime = current_fs_time(inode->i_sb); 1059 inode->i_ctime = current_fs_time(inode->i_sb);
1103 mark_inode_dirty(inode); 1060 mark_inode_dirty(inode);
1104 atomic_inc(&inode->i_count); 1061 ihold(inode);
1105 d_instantiate(dentry, inode); 1062 d_instantiate(dentry, inode);
1106 unlock_kernel();
1107 1063
1108 return 0; 1064 return 0;
1109} 1065}
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct kernel_lb_addr tloc; 1080 struct kernel_lb_addr tloc;
1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1081 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1126 1082
1127 lock_kernel();
1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1083 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1129 if (ofi) { 1084 if (ofi) {
1130 if (ofibh.sbh != ofibh.ebh) 1085 if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
1248 brelse(nfibh.ebh); 1203 brelse(nfibh.ebh);
1249 brelse(nfibh.sbh); 1204 brelse(nfibh.sbh);
1250 } 1205 }
1251 unlock_kernel();
1252 1206
1253 return retval; 1207 return retval;
1254} 1208}
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
1261 struct fileIdentDesc cfi; 1215 struct fileIdentDesc cfi;
1262 struct udf_fileident_bh fibh; 1216 struct udf_fileident_bh fibh;
1263 1217
1264 lock_kernel();
1265 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1218 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1266 goto out_unlock; 1219 goto out_unlock;
1267 1220
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
1273 inode = udf_iget(child->d_inode->i_sb, &tloc); 1226 inode = udf_iget(child->d_inode->i_sb, &tloc);
1274 if (!inode) 1227 if (!inode)
1275 goto out_unlock; 1228 goto out_unlock;
1276 unlock_kernel();
1277 1229
1278 return d_obtain_alias(inode); 1230 return d_obtain_alias(inode);
1279out_unlock: 1231out_unlock:
1280 unlock_kernel();
1281 return ERR_PTR(-EACCES); 1232 return ERR_PTR(-EACCES);
1282} 1233}
1283 1234
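
A note on the namei.c hunks above: with the BKL gone, the in-ICB symlink body is protected by the new per-inode rwsem, so every exit path has to release it — hence the up_write() added on both the success path and the out_no_entry error path. The matching down_write() lands earlier in udf_symlink(), outside the hunks shown. A minimal sketch of the discipline (control flow condensed, the failure condition purely illustrative):

	down_write(&iinfo->i_data_sem);
	/* ... build path components in the in-ICB area ... */
	if (!namelen)				/* illustrative failure */
		goto out_no_entry;
	/* ... add the directory entry, stamp the unique ID ... */
	up_write(&iinfo->i_data_sem);		/* success path */
	return err;
out_no_entry:
	up_write(&iinfo->i_data_sem);		/* error path drops it too */
	inode_dec_link_count(inode);
	iput(inode);
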
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0c..a71090ea0e07 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mutex.h>
28 29
29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
30 uint16_t partition, uint32_t offset) 31 uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
159 struct udf_sb_info *sbi = UDF_SB(sb); 160 struct udf_sb_info *sbi = UDF_SB(sb);
160 u16 reallocationTableLen; 161 u16 reallocationTableLen;
161 struct buffer_head *bh; 162 struct buffer_head *bh;
163 int ret = 0;
162 164
165 mutex_lock(&sbi->s_alloc_mutex);
163 for (i = 0; i < sbi->s_partitions; i++) { 166 for (i = 0; i < sbi->s_partitions; i++) {
164 struct udf_part_map *map = &sbi->s_partmaps[i]; 167 struct udf_part_map *map = &sbi->s_partmaps[i];
165 if (old_block > map->s_partition_root && 168 if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
175 break; 178 break;
176 } 179 }
177 180
178 if (!st) 181 if (!st) {
179 return 1; 182 ret = 1;
183 goto out;
184 }
180 185
181 reallocationTableLen = 186 reallocationTableLen =
182 le16_to_cpu(st->reallocationTableLen); 187 le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
207 ((old_block - 212 ((old_block -
208 map->s_partition_root) & 213 map->s_partition_root) &
209 (sdata->s_packet_len - 1)); 214 (sdata->s_packet_len - 1));
210 return 0; 215 ret = 0;
216 goto out;
211 } else if (origLoc == packet) { 217 } else if (origLoc == packet) {
212 *new_block = le32_to_cpu( 218 *new_block = le32_to_cpu(
213 entry->mappedLocation) + 219 entry->mappedLocation) +
214 ((old_block - 220 ((old_block -
215 map->s_partition_root) & 221 map->s_partition_root) &
216 (sdata->s_packet_len - 1)); 222 (sdata->s_packet_len - 1));
217 return 0; 223 ret = 0;
224 goto out;
218 } else if (origLoc > packet) 225 } else if (origLoc > packet)
219 break; 226 break;
220 } 227 }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
251 st->mapEntry[k].mappedLocation) + 258 st->mapEntry[k].mappedLocation) +
252 ((old_block - map->s_partition_root) & 259 ((old_block - map->s_partition_root) &
253 (sdata->s_packet_len - 1)); 260 (sdata->s_packet_len - 1));
254 return 0; 261 ret = 0;
262 goto out;
255 } 263 }
256 264
257 return 1; 265 ret = 1;
266 goto out;
258 } /* if old_block */ 267 } /* if old_block */
259 } 268 }
260 269
261 if (i == sbi->s_partitions) { 270 if (i == sbi->s_partitions) {
262 /* outside of partitions */ 271 /* outside of partitions */
263 /* for now, fail =) */ 272 /* for now, fail =) */
264 return 1; 273 ret = 1;
265 } 274 }
266 275
267 return 0; 276out:
277 mutex_unlock(&sbi->s_alloc_mutex);
278 return ret;
268} 279}
269 280
270static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, 281static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
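
The partition.c conversion above is the classic single-exit rewrite: udf_relocate_blocks() used to return directly from half a dozen branches, and each return becomes ret = ...; goto out; so that the newly taken s_alloc_mutex is released exactly once. The generic shape, with the condition purely illustrative:

	int relocate_like_op(struct udf_sb_info *sbi)
	{
		int ret = 0;

		mutex_lock(&sbi->s_alloc_mutex);
		if (!found_sparing_table) {	/* illustrative condition */
			ret = 1;
			goto out;
		}
		/* ... table walk and remapping under the mutex ... */
	out:
		mutex_unlock(&sbi->s_alloc_mutex);
		return ret;
	}
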
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 65412d84a45d..7b27b063ff6d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
48#include <linux/stat.h> 48#include <linux/stat.h>
49#include <linux/cdrom.h> 49#include <linux/cdrom.h>
50#include <linux/nls.h> 50#include <linux/nls.h>
51#include <linux/smp_lock.h>
52#include <linux/buffer_head.h> 51#include <linux/buffer_head.h>
53#include <linux/vfs.h> 52#include <linux/vfs.h>
54#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
@@ -107,17 +106,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
107} 106}
108 107
109/* UDF filesystem type */ 108/* UDF filesystem type */
110static int udf_get_sb(struct file_system_type *fs_type, 109static struct dentry *udf_mount(struct file_system_type *fs_type,
111 int flags, const char *dev_name, void *data, 110 int flags, const char *dev_name, void *data)
112 struct vfsmount *mnt)
113{ 111{
114 return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); 112 return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
115} 113}
116 114
117static struct file_system_type udf_fstype = { 115static struct file_system_type udf_fstype = {
118 .owner = THIS_MODULE, 116 .owner = THIS_MODULE,
119 .name = "udf", 117 .name = "udf",
120 .get_sb = udf_get_sb, 118 .mount = udf_mount,
121 .kill_sb = kill_block_super, 119 .kill_sb = kill_block_super,
122 .fs_flags = FS_REQUIRES_DEV, 120 .fs_flags = FS_REQUIRES_DEV,
123}; 121};
@@ -136,15 +134,23 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
136 ei->i_next_alloc_block = 0; 134 ei->i_next_alloc_block = 0;
137 ei->i_next_alloc_goal = 0; 135 ei->i_next_alloc_goal = 0;
138 ei->i_strat4096 = 0; 136 ei->i_strat4096 = 0;
137 init_rwsem(&ei->i_data_sem);
139 138
140 return &ei->vfs_inode; 139 return &ei->vfs_inode;
141} 140}
142 141
143static void udf_destroy_inode(struct inode *inode) 142static void udf_i_callback(struct rcu_head *head)
144{ 143{
144 struct inode *inode = container_of(head, struct inode, i_rcu);
145 INIT_LIST_HEAD(&inode->i_dentry);
145 kmem_cache_free(udf_inode_cachep, UDF_I(inode)); 146 kmem_cache_free(udf_inode_cachep, UDF_I(inode));
146} 147}
147 148
149static void udf_destroy_inode(struct inode *inode)
150{
151 call_rcu(&inode->i_rcu, udf_i_callback);
152}
153
148static void init_once(void *foo) 154static void init_once(void *foo)
149{ 155{
150 struct udf_inode_info *ei = (struct udf_inode_info *)foo; 156 struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -568,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 if (!udf_parse_options(options, &uopt, true)) 574 if (!udf_parse_options(options, &uopt, true))
569 return -EINVAL; 575 return -EINVAL;
570 576
571 lock_kernel(); 577 write_lock(&sbi->s_cred_lock);
572 sbi->s_flags = uopt.flags; 578 sbi->s_flags = uopt.flags;
573 sbi->s_uid = uopt.uid; 579 sbi->s_uid = uopt.uid;
574 sbi->s_gid = uopt.gid; 580 sbi->s_gid = uopt.gid;
575 sbi->s_umask = uopt.umask; 581 sbi->s_umask = uopt.umask;
576 sbi->s_fmode = uopt.fmode; 582 sbi->s_fmode = uopt.fmode;
577 sbi->s_dmode = uopt.dmode; 583 sbi->s_dmode = uopt.dmode;
584 write_unlock(&sbi->s_cred_lock);
578 585
579 if (sbi->s_lvid_bh) { 586 if (sbi->s_lvid_bh) {
580 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 587 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -591,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
591 udf_open_lvid(sb); 598 udf_open_lvid(sb);
592 599
593out_unlock: 600out_unlock:
594 unlock_kernel();
595 return error; 601 return error;
596} 602}
597 603
@@ -960,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
960 (sizeof(struct buffer_head *) * nr_groups); 966 (sizeof(struct buffer_head *) * nr_groups);
961 967
962 if (size <= PAGE_SIZE) 968 if (size <= PAGE_SIZE)
963 bitmap = kmalloc(size, GFP_KERNEL); 969 bitmap = kzalloc(size, GFP_KERNEL);
964 else 970 else
965 bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ 971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
966 972
967 if (bitmap == NULL) { 973 if (bitmap == NULL) {
968 udf_error(sb, __func__, 974 udf_error(sb, __func__,
@@ -971,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
971 return NULL; 977 return NULL;
972 } 978 }
973 979
974 memset(bitmap, 0x00, size);
975 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); 980 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
976 bitmap->s_nr_groups = nr_groups; 981 bitmap->s_nr_groups = nr_groups;
977 return bitmap; 982 return bitmap;
@@ -1775,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
1775 1780
1776 if (!bh) 1781 if (!bh)
1777 return; 1782 return;
1783
1784 mutex_lock(&sbi->s_alloc_mutex);
1778 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1785 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1779 lvidiu = udf_sb_lvidiu(sbi); 1786 lvidiu = udf_sb_lvidiu(sbi);
1780 1787
@@ -1791,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
1791 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1798 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1792 mark_buffer_dirty(bh); 1799 mark_buffer_dirty(bh);
1793 sbi->s_lvid_dirty = 0; 1800 sbi->s_lvid_dirty = 0;
1801 mutex_unlock(&sbi->s_alloc_mutex);
1794} 1802}
1795 1803
1796static void udf_close_lvid(struct super_block *sb) 1804static void udf_close_lvid(struct super_block *sb)
@@ -1803,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
1803 if (!bh) 1811 if (!bh)
1804 return; 1812 return;
1805 1813
1814 mutex_lock(&sbi->s_alloc_mutex);
1806 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1815 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1807 lvidiu = udf_sb_lvidiu(sbi); 1816 lvidiu = udf_sb_lvidiu(sbi);
1808 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1817 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1823,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
1823 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1832 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1824 mark_buffer_dirty(bh); 1833 mark_buffer_dirty(bh);
1825 sbi->s_lvid_dirty = 0; 1834 sbi->s_lvid_dirty = 0;
1835 mutex_unlock(&sbi->s_alloc_mutex);
1836}
1837
1838u64 lvid_get_unique_id(struct super_block *sb)
1839{
1840 struct buffer_head *bh;
1841 struct udf_sb_info *sbi = UDF_SB(sb);
1842 struct logicalVolIntegrityDesc *lvid;
1843 struct logicalVolHeaderDesc *lvhd;
1844 u64 uniqueID;
1845 u64 ret;
1846
1847 bh = sbi->s_lvid_bh;
1848 if (!bh)
1849 return 0;
1850
1851 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1852 lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
1853
1854 mutex_lock(&sbi->s_alloc_mutex);
1855 ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
1856 if (!(++uniqueID & 0xFFFFFFFF))
1857 uniqueID += 16;
1858 lvhd->uniqueID = cpu_to_le64(uniqueID);
1859 mutex_unlock(&sbi->s_alloc_mutex);
1860 mark_buffer_dirty(bh);
1861
1862 return ret;
1826} 1863}
1827 1864
1828static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1865static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1926,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1926 sbi->s_fmode = uopt.fmode; 1963 sbi->s_fmode = uopt.fmode;
1927 sbi->s_dmode = uopt.dmode; 1964 sbi->s_dmode = uopt.dmode;
1928 sbi->s_nls_map = uopt.nls_map; 1965 sbi->s_nls_map = uopt.nls_map;
1966 rwlock_init(&sbi->s_cred_lock);
1929 1967
1930 if (uopt.session == 0xFFFFFFFF) 1968 if (uopt.session == 0xFFFFFFFF)
1931 sbi->s_session = udf_get_last_session(sb); 1969 sbi->s_session = udf_get_last_session(sb);
@@ -2093,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
2093 2131
2094 sbi = UDF_SB(sb); 2132 sbi = UDF_SB(sb);
2095 2133
2096 lock_kernel();
2097
2098 if (sbi->s_vat_inode) 2134 if (sbi->s_vat_inode)
2099 iput(sbi->s_vat_inode); 2135 iput(sbi->s_vat_inode);
2100 if (sbi->s_partitions) 2136 if (sbi->s_partitions)
@@ -2110,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
2110 kfree(sbi->s_partmaps); 2146 kfree(sbi->s_partmaps);
2111 kfree(sb->s_fs_info); 2147 kfree(sb->s_fs_info);
2112 sb->s_fs_info = NULL; 2148 sb->s_fs_info = NULL;
2113
2114 unlock_kernel();
2115} 2149}
2116 2150
2117static int udf_sync_fs(struct super_block *sb, int wait) 2151static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2174,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2174 uint16_t ident; 2208 uint16_t ident;
2175 struct spaceBitmapDesc *bm; 2209 struct spaceBitmapDesc *bm;
2176 2210
2177 lock_kernel();
2178
2179 loc.logicalBlockNum = bitmap->s_extPosition; 2211 loc.logicalBlockNum = bitmap->s_extPosition;
2180 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2212 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2181 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2213 bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2212,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2212 } 2244 }
2213 } 2245 }
2214 brelse(bh); 2246 brelse(bh);
2215
2216out: 2247out:
2217 unlock_kernel();
2218
2219 return accum; 2248 return accum;
2220} 2249}
2221 2250
@@ -2228,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2228 int8_t etype; 2257 int8_t etype;
2229 struct extent_position epos; 2258 struct extent_position epos;
2230 2259
2231 lock_kernel(); 2260 mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
2232
2233 epos.block = UDF_I(table)->i_location; 2261 epos.block = UDF_I(table)->i_location;
2234 epos.offset = sizeof(struct unallocSpaceEntry); 2262 epos.offset = sizeof(struct unallocSpaceEntry);
2235 epos.bh = NULL; 2263 epos.bh = NULL;
@@ -2238,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2238 accum += (elen >> table->i_sb->s_blocksize_bits); 2266 accum += (elen >> table->i_sb->s_blocksize_bits);
2239 2267
2240 brelse(epos.bh); 2268 brelse(epos.bh);
2241 2269 mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
2242 unlock_kernel();
2243 2270
2244 return accum; 2271 return accum;
2245} 2272}
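
One detail of the new lvid_get_unique_id() helper worth spelling out: UDF reserves unique IDs 0-15, so when the incremented counter's low 32 bits wrap to zero the helper skips ahead by 16 before storing the value back. A worked example of the wrap, following the code above:

	/*
	 * stored lvhd->uniqueID  : 0x00000001FFFFFFFF
	 * value returned         : 0x00000001FFFFFFFF
	 * ++uniqueID             : 0x0000000200000000  (low 32 bits == 0)
	 * += 16                  : 0x0000000200000010
	 * stored for next caller : 0x0000000200000010
	 */

Callers in namei.c then keep only the low 32 bits, via cpu_to_le32(), for the directory entry.
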
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b7..b1d4488b0f14 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include "udf_i.h" 31#include "udf_i.h"
33 32
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
78 int err = -EIO; 77 int err = -EIO;
79 unsigned char *p = kmap(page); 78 unsigned char *p = kmap(page);
80 struct udf_inode_info *iinfo; 79 struct udf_inode_info *iinfo;
80 uint32_t pos;
81 81
82 lock_kernel();
83 iinfo = UDF_I(inode); 82 iinfo = UDF_I(inode);
83 pos = udf_block_map(inode, 0);
84
85 down_read(&iinfo->i_data_sem);
84 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 86 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
85 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 87 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
86 } else { 88 } else {
87 bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); 89 bh = sb_bread(inode->i_sb, pos);
88 90
89 if (!bh) 91 if (!bh)
90 goto out; 92 goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
95 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); 97 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
96 brelse(bh); 98 brelse(bh);
97 99
98 unlock_kernel(); 100 up_read(&iinfo->i_data_sem);
99 SetPageUptodate(page); 101 SetPageUptodate(page);
100 kunmap(page); 102 kunmap(page);
101 unlock_page(page); 103 unlock_page(page);
102 return 0; 104 return 0;
103 105
104out: 106out:
105 unlock_kernel(); 107 up_read(&iinfo->i_data_sem);
106 SetPageError(page); 108 SetPageError(page);
107 kunmap(page); 109 kunmap(page);
108 unlock_page(page); 110 unlock_page(page);
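
Why the symlink filler now resolves the block address before taking the lock: udf_block_map() is not shown in this excerpt, but elsewhere in this series it acquires i_data_sem itself, so calling it with the semaphore already read-held could deadlock once a writer queues between the two read acquisitions. Hoisting the call keeps the locking non-recursive:

	pos = udf_block_map(inode, 0);	/* may take i_data_sem internally */
	down_read(&iinfo->i_data_sem);	/* now taken exactly once here */
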
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de41073..d1bd31ea724e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
1#ifndef _UDF_I_H 1#ifndef _UDF_I_H
2#define _UDF_I_H 2#define _UDF_I_H
3 3
4/*
5 * The i_data_sem and i_mutex serve for protection of allocation information
6 * of regular files and symlinks. This includes all extents belonging to
7 * the file/symlink, whether the data live in-inode or in external data
8 * blocks, preallocation, goal block information... When extents are read,
9 * either i_mutex or i_data_sem must be held (holding i_data_sem for
10 * reading is enough). When extents are changed, i_data_sem must be held
11 * for writing and i_mutex must be held as well.
12 *
13 * For directories i_mutex is used for all the necessary protection.
14 */
15
4struct udf_inode_info { 16struct udf_inode_info {
5 struct timespec i_crtime; 17 struct timespec i_crtime;
6 /* Physical address of inode */ 18 /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
21 struct long_ad *i_lad; 33 struct long_ad *i_lad;
22 __u8 *i_data; 34 __u8 *i_data;
23 } i_ext; 35 } i_ext;
36 struct rw_semaphore i_data_sem;
24 struct inode vfs_inode; 37 struct inode vfs_inode;
25}; 38};
26 39
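
The comment above translates into the following caller-side discipline (a sketch; the field names follow the struct, the surrounding operations are illustrative):

	/* reading extents: either lock is sufficient */
	down_read(&iinfo->i_data_sem);
	/* ... walk extents ... */
	up_read(&iinfo->i_data_sem);

	/* changing extents: i_mutex plus the rwsem held for writing */
	mutex_lock(&inode->i_mutex);
	down_write(&iinfo->i_data_sem);
	/* ... modify extents / in-ICB data ... */
	up_write(&iinfo->i_data_sem);
	mutex_unlock(&inode->i_mutex);
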
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c2768..4858c191242b 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h> 4#include <linux/mutex.h>
5#include <linux/bitops.h>
5 6
6/* Since UDF 2.01 is ISO 13346 based... */ 7/* Since UDF 2.01 is ISO 13346 based... */
7#define UDF_SUPER_MAGIC 0x15013346 8#define UDF_SUPER_MAGIC 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
128 uid_t s_uid; 129 uid_t s_uid;
129 mode_t s_fmode; 130 mode_t s_fmode;
130 mode_t s_dmode; 131 mode_t s_dmode;
132 /* Lock protecting consistency of above permission settings */
133 rwlock_t s_cred_lock;
131 134
132 /* Root Info */ 135 /* Root Info */
133 struct timespec s_record_time; 136 struct timespec s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
139 __u16 s_udfrev; 142 __u16 s_udfrev;
140 143
141 /* Miscellaneous flags */ 144 /* Miscellaneous flags */
142 __u32 s_flags; 145 unsigned long s_flags;
143 146
144 /* Encoding info */ 147 /* Encoding info */
145 struct nls_table *s_nls_map; 148 struct nls_table *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
161 164
162int udf_compute_nr_groups(struct super_block *sb, u32 partition); 165int udf_compute_nr_groups(struct super_block *sb, u32 partition);
163 166
164#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) 167static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
165#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) 168{
166#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) 169 return test_bit(flag, &UDF_SB(sb)->s_flags);
170}
171
172static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
173{
174 set_bit(flag, &UDF_SB(sb)->s_flags);
175}
176
177static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
178{
179 clear_bit(flag, &UDF_SB(sb)->s_flags);
180}
167 181
168#endif /* __LINUX_UDF_SB_H */ 182#endif /* __LINUX_UDF_SB_H */
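
The macro-to-inline conversion above is not merely cosmetic: test_bit()/set_bit()/clear_bit() are atomic and operate on unsigned long words, which is why the adjacent hunk widens s_flags from __u32 to unsigned long; the old |=/&= macros were plain read-modify-write sequences that could lose concurrent flag updates. Call sites stay unchanged, e.g. (flag name for illustration):

	if (UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
		/* ... strict-conformance handling ... */;
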
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f4305..eba48209f9f3 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
111}; 111};
112 112
113/* super.c */ 113/* super.c */
114
115__attribute__((format(printf, 3, 4)))
114extern void udf_warning(struct super_block *, const char *, const char *, ...); 116extern void udf_warning(struct super_block *, const char *, const char *, ...);
115static inline void udf_updated_lvid(struct super_block *sb) 117static inline void udf_updated_lvid(struct super_block *sb)
116{ 118{
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
123 sb->s_dirt = 1; 125 sb->s_dirt = 1;
124 UDF_SB(sb)->s_lvid_dirty = 1; 126 UDF_SB(sb)->s_lvid_dirty = 1;
125} 127}
128extern u64 lvid_get_unique_id(struct super_block *sb);
126 129
127/* namei.c */ 130/* namei.c */
128extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134/* inode.c */ 137/* inode.c */
135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
136extern int udf_sync_inode(struct inode *);
137extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern void udf_expand_file_adinicb(struct inode *, int, int *);
138extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
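
Annotating udf_warning() with format(printf, 3, 4) lets the compiler check the varargs against the format string: argument 3 is the format, and the checked arguments start at 4. A mismatch that previously compiled silently now draws a warning, e.g. (illustrative call):

	udf_warning(sb, __func__, "bad ident %d", "not-an-int");
	/* gcc: format '%d' expects argument of type 'int' */
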
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
4 help 5 help
5 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD, 6 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V 7 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
180 180
181 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
182 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
183 atomic_inc(&inode->i_count); 183 ihold(inode);
184 184
185 error = ufs_add_nondir(dentry, inode); 185 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 186 unlock_kernel();
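
The atomic_inc(&inode->i_count) -> ihold() substitution recurs throughout this series. ihold() is the VFS helper for taking an extra reference on an inode the caller already holds pinned; per fs/inode.c of this era it is roughly:

	void ihold(struct inode *inode)
	{
		WARN_ON(atomic_inc_return(&inode->i_count) < 2);
	}

so on top of the increment it warns if the caller raced with the final iput().
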
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d510c1b91817..2c61ac5d4e48 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 696 unsigned maxsymlen;
697 int ret = -EINVAL; 697 int ret = -EINVAL;
698 698
699 lock_kernel();
700
699 uspi = NULL; 701 uspi = NULL;
700 ubh = NULL; 702 ubh = NULL;
701 flags = 0; 703 flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
1163 goto failed; 1165 goto failed;
1164 1166
1165 UFSD("EXIT\n"); 1167 UFSD("EXIT\n");
1168 unlock_kernel();
1166 return 0; 1169 return 0;
1167 1170
1168dalloc_failed: 1171dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
1174 kfree(sbi); 1177 kfree(sbi);
1175 sb->s_fs_info = NULL; 1178 sb->s_fs_info = NULL;
1176 UFSD("EXIT (FAILED)\n"); 1179 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1177 return ret; 1181 return ret;
1178 1182
1179failed_nomem: 1183failed_nomem:
1180 UFSD("EXIT (NOMEM)\n"); 1184 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1181 return -ENOMEM; 1186 return -ENOMEM;
1182} 1187}
1183 1188
@@ -1407,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
1407 return &ei->vfs_inode; 1412 return &ei->vfs_inode;
1408} 1413}
1409 1414
1410static void ufs_destroy_inode(struct inode *inode) 1415static void ufs_i_callback(struct rcu_head *head)
1411{ 1416{
1417 struct inode *inode = container_of(head, struct inode, i_rcu);
1418 INIT_LIST_HEAD(&inode->i_dentry);
1412 kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); 1419 kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
1413} 1420}
1414 1421
1422static void ufs_destroy_inode(struct inode *inode)
1423{
1424 call_rcu(&inode->i_rcu, ufs_i_callback);
1425}
1426
1415static void init_once(void *foo) 1427static void init_once(void *foo)
1416{ 1428{
1417 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; 1429 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
@@ -1449,16 +1461,16 @@ static const struct super_operations ufs_super_ops = {
1449 .show_options = ufs_show_options, 1461 .show_options = ufs_show_options,
1450}; 1462};
1451 1463
1452static int ufs_get_sb(struct file_system_type *fs_type, 1464static struct dentry *ufs_mount(struct file_system_type *fs_type,
1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1465 int flags, const char *dev_name, void *data)
1454{ 1466{
1455 return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); 1467 return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
1456} 1468}
1457 1469
1458static struct file_system_type ufs_fs_type = { 1470static struct file_system_type ufs_fs_type = {
1459 .owner = THIS_MODULE, 1471 .owner = THIS_MODULE,
1460 .name = "ufs", 1472 .name = "ufs",
1461 .get_sb = ufs_get_sb, 1473 .mount = ufs_mount,
1462 .kill_sb = kill_block_super, 1474 .kill_sb = kill_block_super,
1463 .fs_flags = FS_REQUIRES_DEV, 1475 .fs_flags = FS_REQUIRES_DEV,
1464}; 1476};
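
Both the udf and ufs hunks apply the same RCU-delayed inode freeing, which lets RCU-walk path lookups keep dereferencing an inode while it is being torn down. Stripped to the recurring pattern (the foo_* names are placeholders):

	static void foo_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(foo_inode_cachep, FOO_I(inode));
	}

	static void foo_destroy_inode(struct inode *inode)
	{
		call_rcu(&inode->i_rcu, foo_i_callback);
	}
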
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
22config XFS_QUOTA 22config XFS_QUOTA
23 bool "XFS Quota support" 23 bool "XFS Quota support"
24 depends on XFS_FS 24 depends on XFS_FS
25 select QUOTACTL
25 help 26 help
26 If you say Y here, you will be able to set limits for disk usage on 27 If you say Y here, you will be able to set limits for disk usage on
27 a per user and/or a per group basis under XFS. XFS considers quota 28 a per user and/or a per group basis under XFS. XFS considers quota
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 98 kmem.o \
99 xfs_aops.o \ 99 xfs_aops.o \
100 xfs_buf.o \ 100 xfs_buf.o \
101 xfs_discard.o \
101 xfs_export.o \ 102 xfs_export.o \
102 xfs_file.o \ 103 xfs_file.o \
103 xfs_fs_subr.o \ 104 xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
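
The IPERM_FLAG_RCU branch above encodes the may-not-sleep contract of RCU path walk. Summarized as a decision table (return values exactly as in the hunk):

	/*
	 * no ACL fork on the inode         -> -EAGAIN (generic perm check)
	 * RCU walk, no cached negative ACL -> -ECHILD (VFS retries ref-walk)
	 * RCU walk, cached negative ACL    -> -EAGAIN (nothing to read, safe)
	 * ref walk                         -> fetch the ACL and check it
	 */
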
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..ec7bbb5645b6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
330
331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
327 342
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -934,9 +847,8 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 847 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 850
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
940 goto out_invalidate; 852 goto out_invalidate;
941 853
942 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -949,58 +861,14 @@ xfs_aops_discard_page(
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 861 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 862 bh = head = page_buffers(page);
951 do { 863 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 864 int error;
957 xfs_fsblock_t firstblock; 865 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 866
960 if (!buffer_delay(bh)) 867 if (!buffer_delay(bh))
961 goto next_buffer; 868 goto next_buffer;
962 869
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 870 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 871 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 872 if (error) {
1005 /* something screwed, just bail */ 873 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +878,7 @@ xfs_aops_discard_page(
1010 break; 878 break;
1011 } 879 }
1012next_buffer: 880next_buffer:
1013 offset += len; 881 offset += 1 << inode->i_blkbits;
1014 882
1015 } while ((bh = bh->b_this_page) != head); 883 } while ((bh = bh->b_this_page) != head);
1016 884
@@ -1047,10 +915,10 @@ xfs_vm_writepage(
1047 unsigned int type; 915 unsigned int type;
1048 __uint64_t end_offset; 916 __uint64_t end_offset;
1049 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1050 ssize_t size, len; 918 ssize_t len;
1051 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1052 int count = 0; 920 int count = 0;
1053 int all_bh = 0; 921 int nonblocking = 0;
1054 922
1055 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1056 924
@@ -1101,110 +969,78 @@ xfs_vm_writepage(
1101 969
1102 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1103 offset = page_offset(page); 971 offset = page_offset(page);
1104 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1105 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1106 976
1107 do { 977 do {
978 int new_ioend = 0;
979
1108 if (offset >= end_offset) 980 if (offset >= end_offset)
1109 break; 981 break;
1110 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
1111 uptodate = 0; 983 uptodate = 0;
1112 984
1113 /* 985 /*
1114 * A hole may still be marked uptodate because discard_buffer 986 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 987 * of their state. The dirty state however is entirely
988 * meaningless for holes (!mapped && uptodate), so skip
989 * buffers covering holes here.
1116 */ 990 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 991 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 992 imap_valid = 0;
1120 continue; 993 continue;
1121 } 994 }
1122 995
1123 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1124 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1125
1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1127 int new_ioend = 0;
1128
1129 /*
1130 * Make sure we don't use a read-only iomap
1131 */
1132 if (flags == BMAPI_READ)
1133 imap_valid = 0;
1134
1135 if (buffer_unwritten(bh)) {
1136 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1138 } else if (buffer_delay(bh)) {
1139 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE;
1141
1142 if (wbc->sync_mode == WB_SYNC_NONE &&
1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK;
1145 }
1146
1147 if (!imap_valid) {
1148 /*
1149 * If we didn't have a valid mapping then we
1150 * need to ensure that we put the new mapping
1151 * in a new ioend structure. This needs to be
1152 * done to ensure that the ioends correctly
1153 * reflect the block mappings at io completion
1154 * for unwritten extent conversion.
1155 */
1156 new_ioend = 1;
1157 err = xfs_map_blocks(inode, offset, len,
1158 &imap, flags);
1159 if (err)
1160 goto error;
1161 imap_valid = xfs_imap_valid(inode, &imap,
1162 offset);
1163 } 1000 }
1164 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1165 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1166 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1167 &ioend, new_ioend); 1004 imap_valid = 0;
1168 count++;
1169 } 1005 }
1170 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1171 /* 1007 if (type != IO_OVERWRITE) {
1172 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1173 * That means it must already have extents allocated 1009 imap_valid = 0;
1174 * underneath it. Map the extent by reading it.
1175 */
1176 if (!imap_valid || flags != BMAPI_READ) {
1177 flags = BMAPI_READ;
1178 size = xfs_probe_cluster(inode, page, bh, head);
1179 err = xfs_map_blocks(inode, offset, size,
1180 &imap, flags);
1181 if (err)
1182 goto error;
1183 imap_valid = xfs_imap_valid(inode, &imap,
1184 offset);
1185 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1186 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1187 /* 1022 /*
1188 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1189 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1190 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1191 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1192 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1193 * that we are writing into for the first time. 1028 * time.
1194 */ 1029 */
1195 type = IO_NEW; 1030 new_ioend = 1;
1196 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1197 if (imap_valid) 1032 nonblocking);
1198 all_bh = 1; 1033 if (err)
1199 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1200 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1201 count++; 1036 }
1202 } else { 1037 if (imap_valid) {
1203 imap_valid = 0; 1038 lock_buffer(bh);
1204 } 1039 if (type != IO_OVERWRITE)
1205 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1206 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1207 imap_valid = 0; 1042 new_ioend);
1043 count++;
1208 } 1044 }
1209 1045
1210 if (!iohead) 1046 if (!iohead)
@@ -1233,7 +1069,7 @@ xfs_vm_writepage(
1233 end_index = last_index; 1069 end_index = last_index;
1234 1070
1235 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1236 wbc, all_bh, end_index); 1072 wbc, end_index);
1237 } 1073 }
1238 1074
1239 if (iohead) 1075 if (iohead)
@@ -1302,13 +1138,19 @@ __xfs_get_blocks(
1302 int create, 1138 int create,
1303 int direct) 1139 int direct)
1304{ 1140{
1305 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1306 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1307 xfs_off_t offset; 1148 xfs_off_t offset;
1308 ssize_t size; 1149 ssize_t size;
1309 int nimap = 1;
1310 int new = 0; 1150 int new = 0;
1311 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1312 1154
1313 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1314 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1317,15 +1159,45 @@ __xfs_get_blocks(
1317 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1318 return 0; 1160 return 0;
1319 1161
1320 if (direct && create) 1162 if (create) {
1321 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1322 1174
1323 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1324 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1325 if (error) 1177 if (error)
1326 return -error; 1178 goto out_unlock;
1327 if (nimap == 0) 1179
1328 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1329 1201
1330 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1331 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1392,6 +1264,10 @@ __xfs_get_blocks(
1392 } 1264 }
1393 1265
1394 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1395} 1271}
1396 1272
1397int 1273int
@@ -1479,7 +1355,7 @@ xfs_vm_direct_IO(
1479 ssize_t ret; 1355 ssize_t ret;
1480 1356
1481 if (rw & WRITE) { 1357 if (rw & WRITE) {
1482 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1483 1359
1484 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1485 offset, nr_segs, 1361 offset, nr_segs,
@@ -1505,11 +1381,42 @@ xfs_vm_write_failed(
 	struct inode	*inode = mapping->host;
 
 	if (to > inode->i_size) {
-		struct iattr	ia = {
-			.ia_valid	= ATTR_SIZE | ATTR_FORCE,
-			.ia_size	= inode->i_size,
-		};
-		xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+		/*
+		 * punch out the delalloc blocks we have already allocated. We
+		 * don't call xfs_setattr() to do this as we may be in the
+		 * middle of a multi-iovec write and so the vfs inode->i_size
+		 * will not match the xfs ip->i_size and so it will zero too
+		 * much. Hence we just truncate the page cache to zero what is
+		 * necessary and punch the delalloc blocks directly.
+		 */
+		struct xfs_inode	*ip = XFS_I(inode);
+		xfs_fileoff_t		start_fsb;
+		xfs_fileoff_t		end_fsb;
+		int			error;
+
+		truncate_pagecache(inode, to, inode->i_size);
+
+		/*
+		 * Check if there are any blocks that are outside of i_size
+		 * that need to be trimmed back.
+		 */
+		start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+		end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+		if (end_fsb <= start_fsb)
+			return;
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+						      end_fsb - start_fsb);
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+			"xfs_vm_write_failed: unable to clean up ino %lld",
+						ip->i_ino);
+			}
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 }
 
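The start_fsb/end_fsb arithmetic above rounds byte offsets up to whole filesystem blocks; a worked example, assuming a 4096-byte block size (values illustrative only):

	/*
	 * Assuming 4096-byte blocks:
	 *
	 *	inode->i_size = 10000  =>  start_fsb = XFS_B_TO_FSB(10000) + 1 = 4
	 *	to            = 20000  =>  end_fsb   = XFS_B_TO_FSB(20000)     = 5
	 *
	 * so xfs_bmap_punch_delalloc_range() punches one block (fsb 4), the
	 * only delalloc block wholly beyond the in-memory file size.
	 */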
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
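The table above is laid out in the { value, "name" } pair form that the tracing code's __print_symbolic() consumes; a hypothetical tracepoint fragment (a sketch, not part of the patch) would decode an ioend type as:

	TP_printk("ioend type %s",
		  __print_symbolic(__entry->type, XFS_IO_TYPES))

Note the first entry maps IO_DIRECT (0) to an empty string, so direct I/O ioends print blank rather than a type name.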
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are no
190 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
191 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
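Taken with xfs_buf_lru_add() above and the shrinker added later in this patch, b_lru_ref acts as a small generation count. The lifetime below is a hedged sketch inferred from the comments; exact pass counts depend on shrinker pressure:

	/*
	 * Sketch: expected b_lru_ref decay for a cached buffer.
	 *
	 *	_xfs_buf_initialize()	b_lru_ref = 1
	 *	xfs_buf_set_ref(bp, 2)	b_lru_ref = 2	(marked hot by a lookup)
	 *	shrinker pass		b_lru_ref = 1	(rotated to LRU tail)
	 *	shrinker pass		b_lru_ref = 0	(rotated once more)
	 *	shrinker pass		reclaimed via xfs_buf_rele()
	 *	xfs_buf_stale()		short-circuits all of the above to 0
	 */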
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,10 +252,12 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 INIT_LIST_HEAD(&bp->b_hash_list); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
193 XB_SET_OWNER(bp); 261 XB_SET_OWNER(bp);
194 bp->b_target = target; 262 bp->b_target = target;
195 bp->b_file_offset = range_base; 263 bp->b_file_offset = range_base;
@@ -262,7 +330,7 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
265 ASSERT(list_empty(&bp->b_hash_list)); 333 ASSERT(list_empty(&bp->b_lru));
266 334
267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
268 uint i; 336 uint i;
@@ -339,7 +407,6 @@ _xfs_buf_lookup_pages(
339 __func__, gfp_mask); 407 __func__, gfp_mask);
340 408
341 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
342 xfsbufd_wakeup(NULL, 0, gfp_mask);
343 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
344 goto retry; 411 goto retry;
345 } 412 }
@@ -422,8 +489,10 @@ _xfs_buf_find(
 {
 	xfs_off_t		range_base;
 	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
@@ -432,14 +501,37 @@ _xfs_buf_find(
 	ASSERT(!(range_length < (1 << btp->bt_sshift)));
 	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mount,
+				xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+	/* walk tree */
+	spin_lock(&pag->pag_buf_lock);
+	rbp = &pag->pag_buf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (range_base < bp->b_file_offset)
+			rbp = &(*rbp)->rb_left;
+		else if (range_base > bp->b_file_offset)
+			rbp = &(*rbp)->rb_right;
+		else {
+			/*
+			 * found a block offset match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
+			 */
+			if (bp->b_buffer_length != range_length) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
 			atomic_inc(&bp->b_hold);
 			goto found;
 		}
@@ -449,41 +541,32 @@ _xfs_buf_find(
 	if (new_bp) {
 		_xfs_buf_initialize(new_bp, btp, range_base,
 					range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+		/* the buffer keeps the perag reference until it is freed */
+		new_bp->b_pag = pag;
+		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
-
-	spin_unlock(&hash->bh_lock);
 	return new_bp;
 
 found:
-	spin_unlock(&hash->bh_lock);
+	spin_unlock(&pag->pag_buf_lock);
+	xfs_perag_put(pag);
 
-	/* Attempt to get the semaphore without sleeping,
-	 * if this does not work then we need to drop the
-	 * spinlock and do a hard attempt on the semaphore.
-	 */
-	if (down_trylock(&bp->b_sema)) {
+	if (xfs_buf_cond_lock(bp)) {
+		/* failed, so wait for the lock if requested. */
 		if (!(flags & XBF_TRYLOCK)) {
-			/* wait for buffer ownership */
 			xfs_buf_lock(bp);
 			XFS_STATS_INC(xb_get_locked_waited);
 		} else {
-			/* We asked for a trylock and failed, no need
-			 * to look at file offset and length here, we
-			 * know that this buffer at least overlaps our
-			 * buffer and is locked, therefore our buffer
-			 * either does not exist, or is this buffer.
-			 */
 			xfs_buf_rele(bp);
 			XFS_STATS_INC(xb_busy_locked);
 			return NULL;
 		}
-	} else {
-		/* trylock worked */
-		XB_SET_OWNER(bp);
 	}
 
 	if (bp->b_flags & XBF_STALE) {
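Apart from the stale-buffer skip, the walk above is the standard <linux/rbtree.h> insert-or-find idiom; stripped of the XFS specifics it looks like this generic sketch (hypothetical struct item, not part of the patch):

	/*
	 * Generic rbtree insert-or-find, assuming a hypothetical
	 * 'struct item { struct rb_node node; u64 key; }'.
	 */
	static struct item *
	item_find_or_insert(struct rb_root *root, struct item *new)
	{
		struct rb_node	**link = &root->rb_node;
		struct rb_node	*parent = NULL;

		while (*link) {
			struct item *it = rb_entry(*link, struct item, node);

			parent = *link;
			if (new->key < it->key)
				link = &(*link)->rb_left;
			else if (new->key > it->key)
				link = &(*link)->rb_right;
			else
				return it;		/* found existing entry */
		}

		rb_link_node(&new->node, parent, link);	/* not found: link it in */
		rb_insert_color(&new->node, root);	/* rebalance/recolour */
		return new;
	}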
@@ -625,8 +708,7 @@ void
625xfs_buf_readahead( 708xfs_buf_readahead(
626 xfs_buftarg_t *target, 709 xfs_buftarg_t *target,
627 xfs_off_t ioff, 710 xfs_off_t ioff,
628 size_t isize, 711 size_t isize)
629 xfs_buf_flags_t flags)
630{ 712{
631 struct backing_dev_info *bdi; 713 struct backing_dev_info *bdi;
632 714
@@ -634,8 +716,42 @@ xfs_buf_readahead(
634 if (bdi_read_congested(bdi)) 716 if (bdi_read_congested(bdi))
635 return; 717 return;
636 718
637 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 719 xfs_buf_read(target, ioff, isize,
638 xfs_buf_read(target, ioff, isize, flags); 720 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
721}
722
723/*
724 * Read an uncached buffer from disk. Allocates and returns a locked
725 * buffer containing the disk contents or nothing.
726 */
727struct xfs_buf *
728xfs_buf_read_uncached(
729 struct xfs_mount *mp,
730 struct xfs_buftarg *target,
731 xfs_daddr_t daddr,
732 size_t length,
733 int flags)
734{
735 xfs_buf_t *bp;
736 int error;
737
738 bp = xfs_buf_get_uncached(target, length, flags);
739 if (!bp)
740 return NULL;
741
742 /* set up the buffer for a read IO */
743 xfs_buf_lock(bp);
744 XFS_BUF_SET_ADDR(bp, daddr);
745 XFS_BUF_READ(bp);
746 XFS_BUF_BUSY(bp);
747
748 xfsbdstrat(mp, bp);
749 error = xfs_buf_iowait(bp);
750 if (error || bp->b_error) {
751 xfs_buf_relse(bp);
752 return NULL;
753 }
754 return bp;
639} 755}
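A hypothetical caller would use the new helper as in this sketch (daddr and len are illustrative; XFS uses positive errnos internally):

	/* Sketch: read one uncached metadata block and release it. */
	struct xfs_buf	*bp;

	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, daddr, len, 0);
	if (!bp)
		return EIO;	/* I/O error or allocation failure */
	/* ... examine bp->b_addr ... */
	xfs_buf_relse(bp);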
640 756
641xfs_buf_t * 757xfs_buf_t *
@@ -707,9 +823,10 @@ xfs_buf_associate_memory(
707} 823}
708 824
709xfs_buf_t * 825xfs_buf_t *
710xfs_buf_get_noaddr( 826xfs_buf_get_uncached(
827 struct xfs_buftarg *target,
711 size_t len, 828 size_t len,
712 xfs_buftarg_t *target) 829 int flags)
713{ 830{
714 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 831 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
715 int error, i; 832 int error, i;
@@ -725,7 +842,7 @@ xfs_buf_get_noaddr(
725 goto fail_free_buf; 842 goto fail_free_buf;
726 843
727 for (i = 0; i < page_count; i++) { 844 for (i = 0; i < page_count; i++) {
728 bp->b_pages[i] = alloc_page(GFP_KERNEL); 845 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
729 if (!bp->b_pages[i]) 846 if (!bp->b_pages[i])
730 goto fail_free_mem; 847 goto fail_free_mem;
731 } 848 }
@@ -740,7 +857,7 @@ xfs_buf_get_noaddr(
740 857
741 xfs_buf_unlock(bp); 858 xfs_buf_unlock(bp);
742 859
743 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 860 trace_xfs_buf_get_uncached(bp, _RET_IP_);
744 return bp; 861 return bp;
745 862
746 fail_free_mem: 863 fail_free_mem:
@@ -774,29 +891,32 @@ void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
-	xfs_bufhash_t		*hash = bp->b_hash;
+	struct xfs_perag	*pag = bp->b_pag;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (unlikely(!hash)) {
-		ASSERT(!bp->b_relse);
+	if (!pag) {
+		ASSERT(list_empty(&bp->b_lru));
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
 		return;
 	}
 
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
-		if (bp->b_relse) {
-			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
-			(*(bp->b_relse)) (bp);
-		} else if (bp->b_flags & XBF_FS_MANAGED) {
-			spin_unlock(&hash->bh_lock);
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
+		if (!(bp->b_flags & XBF_STALE) &&
+		    atomic_read(&bp->b_lru_ref)) {
+			xfs_buf_lru_add(bp);
+			spin_unlock(&pag->pag_buf_lock);
 		} else {
+			xfs_buf_lru_del(bp);
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
+			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+			spin_unlock(&pag->pag_buf_lock);
+			xfs_perag_put(pag);
 			xfs_buf_free(bp);
 		}
 	}
@@ -814,10 +934,18 @@ xfs_buf_rele(
814 */ 934 */
815 935
816/* 936/*
817 * Locks a buffer object, if it is not already locked. 937 * Locks a buffer object, if it is not already locked. Note that this in
818 * Note that this in no way locks the underlying pages, so it is only 938 * no way locks the underlying pages, so it is only useful for
819 * useful for synchronizing concurrent use of buffer objects, not for 939 * synchronizing concurrent use of buffer objects, not for synchronizing
820 * synchronizing independent access to the underlying pages. 940 * independent access to the underlying pages.
941 *
942 * If we come across a stale, pinned, locked buffer, we know that we are
943 * being asked to lock a buffer that has been reallocated. Because it is
944 * pinned, we know that the log has not been pushed to disk and hence it
945 * will still be locked. Rather than continuing to have trylock attempts
946 * fail until someone else pushes the log, push it ourselves before
947 * returning. This means that the xfsaild will not get stuck trying
948 * to push on stale inode buffers.
821 */ 949 */
822int 950int
823xfs_buf_cond_lock( 951xfs_buf_cond_lock(
@@ -828,6 +956,8 @@ xfs_buf_cond_lock(
828 locked = down_trylock(&bp->b_sema) == 0; 956 locked = down_trylock(&bp->b_sema) == 0;
829 if (locked) 957 if (locked)
830 XB_SET_OWNER(bp); 958 XB_SET_OWNER(bp);
959 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
960 xfs_log_force(bp->b_target->bt_mount, 0);
831 961
832 trace_xfs_buf_cond_lock(bp, _RET_IP_); 962 trace_xfs_buf_cond_lock(bp, _RET_IP_);
833 return locked ? 0 : -EBUSY; 963 return locked ? 0 : -EBUSY;
@@ -859,7 +989,7 @@ xfs_buf_lock(
859 trace_xfs_buf_lock(bp, _RET_IP_); 989 trace_xfs_buf_lock(bp, _RET_IP_);
860 990
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0); 992 xfs_log_force(bp->b_target->bt_mount, 0);
863 if (atomic_read(&bp->b_io_remaining)) 993 if (atomic_read(&bp->b_io_remaining))
864 blk_run_address_space(bp->b_target->bt_mapping); 994 blk_run_address_space(bp->b_target->bt_mapping);
865 down(&bp->b_sema); 995 down(&bp->b_sema);
@@ -924,19 +1054,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 1054 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 1055 container_of(work, xfs_buf_t, b_iodone_work);
926 1056
927 /* 1057 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 1058 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 1059 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 1060 xfs_buf_relse(bp);
@@ -982,7 +1100,6 @@ xfs_bwrite(
982{ 1100{
983 int error; 1101 int error;
984 1102
985 bp->b_mount = mp;
986 bp->b_flags |= XBF_WRITE; 1103 bp->b_flags |= XBF_WRITE;
987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1104 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
988 1105
@@ -1003,8 +1120,6 @@ xfs_bdwrite(
1003{ 1120{
1004 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1121 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1005 1122
1006 bp->b_mount = mp;
1007
1008 bp->b_flags &= ~XBF_READ; 1123 bp->b_flags &= ~XBF_READ;
1009 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1124 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1010 1125
@@ -1013,7 +1128,7 @@ xfs_bdwrite(
1013 1128
1014/* 1129/*
1015 * Called when we want to stop a buffer from getting written or read. 1130 * Called when we want to stop a buffer from getting written or read.
1016 * We attach the EIO error, muck with its flags, and call biodone 1131 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1017 * so that the proper iodone callbacks get called. 1132 * so that the proper iodone callbacks get called.
1018 */ 1133 */
1019STATIC int 1134STATIC int
@@ -1030,21 +1145,21 @@ xfs_bioerror(
1030 XFS_BUF_ERROR(bp, EIO); 1145 XFS_BUF_ERROR(bp, EIO);
1031 1146
1032 /* 1147 /*
1033 * We're calling biodone, so delete XBF_DONE flag. 1148 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1034 */ 1149 */
1035 XFS_BUF_UNREAD(bp); 1150 XFS_BUF_UNREAD(bp);
1036 XFS_BUF_UNDELAYWRITE(bp); 1151 XFS_BUF_UNDELAYWRITE(bp);
1037 XFS_BUF_UNDONE(bp); 1152 XFS_BUF_UNDONE(bp);
1038 XFS_BUF_STALE(bp); 1153 XFS_BUF_STALE(bp);
1039 1154
1040 xfs_biodone(bp); 1155 xfs_buf_ioend(bp, 0);
1041 1156
1042 return EIO; 1157 return EIO;
1043} 1158}
1044 1159
1045/* 1160/*
1046 * Same as xfs_bioerror, except that we are releasing the buffer 1161 * Same as xfs_bioerror, except that we are releasing the buffer
1047 * here ourselves, and avoiding the biodone call. 1162 * here ourselves, and avoiding the xfs_buf_ioend call.
1048 * This is meant for userdata errors; metadata bufs come with 1163 * This is meant for userdata errors; metadata bufs come with
1049 * iodone functions attached, so that we can track down errors. 1164 * iodone functions attached, so that we can track down errors.
1050 */ 1165 */
@@ -1093,7 +1208,7 @@ int
1093xfs_bdstrat_cb( 1208xfs_bdstrat_cb(
1094 struct xfs_buf *bp) 1209 struct xfs_buf *bp)
1095{ 1210{
1096 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1211 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1097 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1212 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1098 /* 1213 /*
1099 * Metadata write that didn't get logged but 1214 * Metadata write that didn't get logged but
@@ -1195,7 +1310,7 @@ _xfs_buf_ioapply(
1195 1310
1196 if (bp->b_flags & XBF_ORDERED) { 1311 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1312 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1313 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1314 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1315 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1316 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1394,89 +1509,84 @@ xfs_buf_iomove(
  */
 
 /*
- * Wait for any bufs with callbacks that have been submitted but
- * have not yet returned... walk the hash list for the target.
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
+	struct xfs_buftarg	*btp)
 {
-	xfs_buf_t	*bp, *n;
-	xfs_bufhash_t	*hash;
-	uint		i;
+	struct xfs_buf		*bp;
 
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-again:
-		spin_lock(&hash->bh_lock);
-		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-			ASSERT(btp == bp->b_target);
-			if (!(bp->b_flags & XBF_FS_MANAGED)) {
-				spin_unlock(&hash->bh_lock);
-				/*
-				 * Catch superblock reference count leaks
-				 * immediately
-				 */
-				BUG_ON(bp->b_bn == 0);
-				delay(100);
-				goto again;
-			}
+restart:
+	spin_lock(&btp->bt_lru_lock);
+	while (!list_empty(&btp->bt_lru)) {
+		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+		if (atomic_read(&bp->b_hold) > 1) {
+			spin_unlock(&btp->bt_lru_lock);
+			delay(100);
+			goto restart;
 		}
-		spin_unlock(&hash->bh_lock);
+		/*
+		 * clear the LRU reference count so the buffer doesn't get
+		 * ignored in xfs_buf_rele().
+		 */
+		atomic_set(&bp->b_lru_ref, 0);
+		spin_unlock(&btp->bt_lru_lock);
+		xfs_buf_rele(bp);
+		spin_lock(&btp->bt_lru_lock);
 	}
+	spin_unlock(&btp->bt_lru_lock);
 }
 
-/*
- * Allocate buffer hash table for a given target.
- * For devices containing metadata (i.e. not the log/realtime devices)
- * we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
+int
+xfs_buftarg_shrink(
+	struct shrinker		*shrink,
+	int			nr_to_scan,
+	gfp_t			mask)
 {
-	unsigned int		i;
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	struct xfs_buf		*bp;
+	LIST_HEAD(dispose);
 
-	btp->bt_hashshift = external ? 3 : 12;	/* 8 or 4096 buckets */
-	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-					 sizeof(xfs_bufhash_t));
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
-	}
-}
+	if (!nr_to_scan)
+		return btp->bt_lru_nr;
 
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free_large(btp->bt_hash);
-	btp->bt_hash = NULL;
-}
+	spin_lock(&btp->bt_lru_lock);
+	while (!list_empty(&btp->bt_lru)) {
+		if (nr_to_scan-- <= 0)
+			break;
 
-/*
- * buftarg list for delwrite queue processing
- */
-static LIST_HEAD(xfs_buftarg_list);
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
+		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
 
-STATIC void
-xfs_register_buftarg(
-	xfs_buftarg_t		*btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_add(&btp->bt_list, &xfs_buftarg_list);
-	spin_unlock(&xfs_buftarg_lock);
-}
+		/*
+		 * Decrement the b_lru_ref count unless the value is already
+		 * zero. If the value is already zero, we need to reclaim the
+		 * buffer, otherwise it gets another trip through the LRU.
+		 */
+		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+			list_move_tail(&bp->b_lru, &btp->bt_lru);
+			continue;
+		}
 
-STATIC void
-xfs_unregister_buftarg(
-	xfs_buftarg_t		*btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_del(&btp->bt_list);
-	spin_unlock(&xfs_buftarg_lock);
+		/*
+		 * remove the buffer from the LRU now to avoid needing another
+		 * lock round trip inside xfs_buf_rele().
+		 */
+		list_move(&bp->b_lru, &dispose);
+		btp->bt_lru_nr--;
+	}
+	spin_unlock(&btp->bt_lru_lock);
+
+	while (!list_empty(&dispose)) {
+		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+		list_del_init(&bp->b_lru);
+		xfs_buf_rele(bp);
+	}
+
+	return btp->bt_lru_nr;
 }
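This follows the shrinker contract of this kernel generation: nr_to_scan == 0 is a population query, a non-zero value asks the callback to age that many objects, and the return value is the remaining population. It is wired up per buftarg as in this sketch, mirroring the registration in xfs_alloc_buftarg() later in the patch:

	/* Per-buftarg shrinker registration (as done in xfs_alloc_buftarg). */
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	/* ... paired with unregister_shrinker() in xfs_free_buftarg(). */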
1481 1591
1482void 1592void
@@ -1484,18 +1594,14 @@ xfs_free_buftarg(
1484 struct xfs_mount *mp, 1594 struct xfs_mount *mp,
1485 struct xfs_buftarg *btp) 1595 struct xfs_buftarg *btp)
1486{ 1596{
1597 unregister_shrinker(&btp->bt_shrinker);
1598
1487 xfs_flush_buftarg(btp, 1); 1599 xfs_flush_buftarg(btp, 1);
1488 if (mp->m_flags & XFS_MOUNT_BARRIER) 1600 if (mp->m_flags & XFS_MOUNT_BARRIER)
1489 xfs_blkdev_issue_flush(btp); 1601 xfs_blkdev_issue_flush(btp);
1490 xfs_free_bufhash(btp);
1491 iput(btp->bt_mapping->host); 1602 iput(btp->bt_mapping->host);
1492 1603
1493 /* Unregister the buftarg first so that we don't get a
1494 * wakeup finding a non-existent task
1495 */
1496 xfs_unregister_buftarg(btp);
1497 kthread_stop(btp->bt_task); 1604 kthread_stop(btp->bt_task);
1498
1499 kmem_free(btp); 1605 kmem_free(btp);
1500} 1606}
1501 1607
@@ -1572,6 +1678,7 @@ xfs_mapping_buftarg(
1572 XFS_BUFTARG_NAME(btp)); 1678 XFS_BUFTARG_NAME(btp));
1573 return ENOMEM; 1679 return ENOMEM;
1574 } 1680 }
1681 inode->i_ino = get_next_ino();
1575 inode->i_mode = S_IFBLK; 1682 inode->i_mode = S_IFBLK;
1576 inode->i_bdev = bdev; 1683 inode->i_bdev = bdev;
1577 inode->i_rdev = bdev->bd_dev; 1684 inode->i_rdev = bdev->bd_dev;
@@ -1591,24 +1698,18 @@ xfs_alloc_delwrite_queue(
1591 xfs_buftarg_t *btp, 1698 xfs_buftarg_t *btp,
1592 const char *fsname) 1699 const char *fsname)
1593{ 1700{
1594 int error = 0;
1595
1596 INIT_LIST_HEAD(&btp->bt_list);
1597 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1701 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1598 spin_lock_init(&btp->bt_delwrite_lock); 1702 spin_lock_init(&btp->bt_delwrite_lock);
1599 btp->bt_flags = 0; 1703 btp->bt_flags = 0;
1600 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1704 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1601 if (IS_ERR(btp->bt_task)) { 1705 if (IS_ERR(btp->bt_task))
1602 error = PTR_ERR(btp->bt_task); 1706 return PTR_ERR(btp->bt_task);
1603 goto out_error; 1707 return 0;
1604 }
1605 xfs_register_buftarg(btp);
1606out_error:
1607 return error;
1608} 1708}
1609 1709
1610xfs_buftarg_t * 1710xfs_buftarg_t *
1611xfs_alloc_buftarg( 1711xfs_alloc_buftarg(
1712 struct xfs_mount *mp,
1612 struct block_device *bdev, 1713 struct block_device *bdev,
1613 int external, 1714 int external,
1614 const char *fsname) 1715 const char *fsname)
@@ -1617,15 +1718,20 @@ xfs_alloc_buftarg(
1617 1718
1618 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1719 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1619 1720
1721 btp->bt_mount = mp;
1620 btp->bt_dev = bdev->bd_dev; 1722 btp->bt_dev = bdev->bd_dev;
1621 btp->bt_bdev = bdev; 1723 btp->bt_bdev = bdev;
1724 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock);
1622 if (xfs_setsize_buftarg_early(btp, bdev)) 1726 if (xfs_setsize_buftarg_early(btp, bdev))
1623 goto error; 1727 goto error;
1624 if (xfs_mapping_buftarg(btp, bdev)) 1728 if (xfs_mapping_buftarg(btp, bdev))
1625 goto error; 1729 goto error;
1626 if (xfs_alloc_delwrite_queue(btp, fsname)) 1730 if (xfs_alloc_delwrite_queue(btp, fsname))
1627 goto error; 1731 goto error;
1628 xfs_alloc_bufhash(btp, external); 1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1733 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1734 register_shrinker(&btp->bt_shrinker);
1629 return btp; 1735 return btp;
1630 1736
1631error: 1737error:
@@ -1730,27 +1836,6 @@ xfs_buf_runall_queues(
1730 flush_workqueue(queue); 1836 flush_workqueue(queue);
1731} 1837}
1732 1838
1733STATIC int
1734xfsbufd_wakeup(
1735 struct shrinker *shrink,
1736 int priority,
1737 gfp_t mask)
1738{
1739 xfs_buftarg_t *btp;
1740
1741 spin_lock(&xfs_buftarg_lock);
1742 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1743 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1744 continue;
1745 if (list_empty(&btp->bt_delwrite_queue))
1746 continue;
1747 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1748 wake_up_process(btp->bt_task);
1749 }
1750 spin_unlock(&xfs_buftarg_lock);
1751 return 0;
1752}
1753
1754/* 1839/*
1755 * Move as many buffers as specified to the supplied list 1840 * Move as many buffers as specified to the supplied list
1756 * idicating if we skipped any buffers to prevent deadlocks. 1841 * idicating if we skipped any buffers to prevent deadlocks.
@@ -1771,7 +1856,6 @@ xfs_buf_delwri_split(
1771 INIT_LIST_HEAD(list); 1856 INIT_LIST_HEAD(list);
1772 spin_lock(dwlk); 1857 spin_lock(dwlk);
1773 list_for_each_entry_safe(bp, n, dwq, b_list) { 1858 list_for_each_entry_safe(bp, n, dwq, b_list) {
1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1775 ASSERT(bp->b_flags & XBF_DELWRI); 1859 ASSERT(bp->b_flags & XBF_DELWRI);
1776 1860
1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1861 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1785,6 +1869,7 @@ xfs_buf_delwri_split(
1785 _XBF_RUN_QUEUES); 1869 _XBF_RUN_QUEUES);
1786 bp->b_flags |= XBF_WRITE; 1870 bp->b_flags |= XBF_WRITE;
1787 list_move_tail(&bp->b_list, list); 1871 list_move_tail(&bp->b_list, list);
1872 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1788 } else 1873 } else
1789 skipped++; 1874 skipped++;
1790 } 1875 }
@@ -1916,7 +2001,7 @@ xfs_flush_buftarg(
1916 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2001 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1917 2002
1918 list_del_init(&bp->b_list); 2003 list_del_init(&bp->b_list);
1919 xfs_iowait(bp); 2004 xfs_buf_iowait(bp);
1920 xfs_buf_relse(bp); 2005 xfs_buf_relse(bp);
1921 } 2006 }
1922 } 2007 }
@@ -1933,7 +2018,7 @@ xfs_buf_init(void)
1933 goto out; 2018 goto out;
1934 2019
1935 xfslogd_workqueue = alloc_workqueue("xfslogd", 2020 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1); 2021 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1937 if (!xfslogd_workqueue) 2022 if (!xfslogd_workqueue)
1938 goto out_free_buf_zone; 2023 goto out_free_buf_zone;
1939 2024
@@ -1945,7 +2030,6 @@ xfs_buf_init(void)
1945 if (!xfsconvertd_workqueue) 2030 if (!xfsconvertd_workqueue)
1946 goto out_destroy_xfsdatad_workqueue; 2031 goto out_destroy_xfsdatad_workqueue;
1947 2032
1948 register_shrinker(&xfs_buf_shake);
1949 return 0; 2033 return 0;
1950 2034
1951 out_destroy_xfsdatad_workqueue: 2035 out_destroy_xfsdatad_workqueue:
@@ -1961,7 +2045,6 @@ xfs_buf_init(void)
1961void 2045void
1962xfs_buf_terminate(void) 2046xfs_buf_terminate(void)
1963{ 2047{
1964 unregister_shrinker(&xfs_buf_shake);
1965 destroy_workqueue(xfsconvertd_workqueue); 2048 destroy_workqueue(xfsconvertd_workqueue);
1966 destroy_workqueue(xfsdatad_workqueue); 2049 destroy_workqueue(xfsdatad_workqueue);
1967 destroy_workqueue(xfslogd_workqueue); 2050 destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55#define XBF_ORDERED (1 << 11)/* use ordered writes */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -86,14 +85,6 @@ typedef enum {
86 */ 85 */
87#define _XBF_PAGE_LOCKED (1 << 22) 86#define _XBF_PAGE_LOCKED (1 << 22)
88 87
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 88typedef unsigned int xfs_buf_flags_t;
98 89
99#define XFS_BUF_FLAGS \ 90#define XFS_BUF_FLAGS \
@@ -104,7 +95,6 @@ typedef unsigned int xfs_buf_flags_t;
104 { XBF_DONE, "DONE" }, \ 95 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 96 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 97 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 98 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 99 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 100 { XBF_LOCK, "LOCK" }, /* should never be set */\
@@ -114,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t;
114 { _XBF_PAGES, "PAGES" }, \ 104 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119 108
120 109
121typedef enum { 110typedef enum {
@@ -132,20 +121,22 @@ typedef struct xfs_buftarg {
132 dev_t bt_dev; 121 dev_t bt_dev;
133 struct block_device *bt_bdev; 122 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 123 struct address_space *bt_mapping;
124 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 125 unsigned int bt_bsize;
136 unsigned int bt_sshift; 126 unsigned int bt_sshift;
137 size_t bt_smask; 127 size_t bt_smask;
138 128
139 /* per device buffer hash table */
140 uint bt_hashshift;
141 xfs_bufhash_t *bt_hash;
142
143 /* per device delwri queue */ 129 /* per device delwri queue */
144 struct task_struct *bt_task; 130 struct task_struct *bt_task;
145 struct list_head bt_list;
146 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
147 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
148 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
149} xfs_buftarg_t; 140} xfs_buftarg_t;
150 141
151/* 142/*
@@ -161,40 +152,46 @@ typedef struct xfs_buftarg {
 
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
 #define XB_PAGES	2
 
 typedef struct xfs_buf {
+	/*
+	 * first cacheline holds all the fields needed for an uncontended cache
+	 * hit to be fully processed. The semaphore straddles the cacheline
+	 * boundary, but the counter and lock sits on the first cacheline,
+	 * which is the only bit that is touched if we hit the semaphore
+	 * fast-path on locking.
+	 */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	xfs_off_t		b_file_offset;	/* offset in file */
+	size_t			b_buffer_length;/* size of buffer in bytes */
+	atomic_t		b_hold;		/* reference count */
+	atomic_t		b_lru_ref;	/* lru reclaim ref count */
+	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
-	unsigned long		b_queuetime;	/* time buffer was queued */
-	atomic_t		b_pin_count;	/* pin count */
+
+	struct list_head	b_lru;		/* lru list */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
-	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	struct xfs_perag	*b_pag;		/* contains rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
-	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
-	xfs_off_t		b_file_offset;	/* offset in file */
-	size_t			b_buffer_length;/* size of buffer in bytes */
 	size_t			b_count_desired;/* desired transfer size */
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
-	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
-	xfs_buf_relse_t		b_relse;	/* releasing function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
-	struct xfs_mount	*b_mount;
-	unsigned short		b_error;	/* error code on I/O */
-	unsigned int		b_page_count;	/* size of page array */
-	unsigned int		b_offset;	/* page offset in first page */
 	struct page		**b_pages;	/* array of page pointers */
 	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+	unsigned long		b_queuetime;	/* time buffer was queued */
+	atomic_t		b_pin_count;	/* pin count */
+	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
+	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_offset;	/* page offset in first page */
+	unsigned short		b_error;	/* error code on I/O */
 #ifdef XFS_BUF_LOCK_TRACKING
 	int			b_last_holder;
 #endif
@@ -213,11 +210,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
213 xfs_buf_flags_t); 210 xfs_buf_flags_t);
214 211
215extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 212extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
216extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 213extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
217extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 214extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
218extern void xfs_buf_hold(xfs_buf_t *); 215extern void xfs_buf_hold(xfs_buf_t *);
219extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 216extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
220 xfs_buf_flags_t); 217struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
218 struct xfs_buftarg *target,
219 xfs_daddr_t daddr, size_t length, int flags);
221 220
222/* Releasing Buffers */ 221/* Releasing Buffers */
223extern void xfs_buf_free(xfs_buf_t *); 222extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +241,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
242extern int xfs_buf_iowait(xfs_buf_t *); 241extern int xfs_buf_iowait(xfs_buf_t *);
243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 242extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
244 xfs_buf_rw_t); 243 xfs_buf_rw_t);
244#define xfs_buf_zero(bp, off, len) \
245 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
245 246
246static inline int xfs_buf_geterror(xfs_buf_t *bp) 247static inline int xfs_buf_geterror(xfs_buf_t *bp)
247{ 248{
@@ -267,7 +268,8 @@ extern void xfs_buf_terminate(void);
267#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 268#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
268 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 269 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
269 270
270#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 271void xfs_buf_stale(struct xfs_buf *bp);
272#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
271#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 273#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
272#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 274#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
273#define XFS_BUF_SUPER_STALE(bp) do { \ 275#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -276,8 +278,6 @@ extern void xfs_buf_terminate(void);
276 XFS_BUF_DONE(bp); \ 278 XFS_BUF_DONE(bp); \
277 } while (0) 279 } while (0)
278 280
279#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
280
281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -320,7 +320,6 @@ extern void xfs_buf_terminate(void);
320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
322#define XFS_BUF_SET_START(bp) do { } while (0) 322#define XFS_BUF_SET_START(bp) do { } while (0)
323#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
324 323
325#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 324#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
326#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 325#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -333,9 +332,15 @@ extern void xfs_buf_terminate(void);
333#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 332#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
334#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 333#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
335 334
336#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 335static inline void
336xfs_buf_set_ref(
337 struct xfs_buf *bp,
338 int lru_ref)
339{
340 atomic_set(&bp->b_lru_ref, lru_ref);
341}
342#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
337#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 343#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
338#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
339 344
340#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 345#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
341 346
@@ -351,30 +356,15 @@ extern void xfs_buf_terminate(void);
351 356
352static inline void xfs_buf_relse(xfs_buf_t *bp) 357static inline void xfs_buf_relse(xfs_buf_t *bp)
353{ 358{
354 if (!bp->b_relse) 359 xfs_buf_unlock(bp);
355 xfs_buf_unlock(bp);
356 xfs_buf_rele(bp); 360 xfs_buf_rele(bp);
357} 361}
358 362
359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
360
361#define xfs_biomove(bp, off, len, data, rw) \
362 xfs_buf_iomove((bp), (off), (len), (data), \
363 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
364
365#define xfs_biozero(bp, off, len) \
366 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
367
368#define xfs_iowait(bp) xfs_buf_iowait(bp)
369
370#define xfs_baread(target, rablkno, ralen) \
371 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
372
373
374/* 363/*
375 * Handling of buftargs. 364 * Handling of buftargs.
376 */ 365 */
377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 366extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
367 struct block_device *, int, const char *);
378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 368extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
379extern void xfs_wait_buftarg(xfs_buftarg_t *); 369extern void xfs_wait_buftarg(xfs_buftarg_t *);
380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 370extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT);
157
158 /*
159 * Truncating down the len isn't actually quite correct, but using
160 * XFS_B_TO_FSB would mean we trivially get overflows for values
161 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
162 * used by the fstrim application. In the end it really doesn't
163 * matter as trimming blocks is an advisory interface.
164 */
165 start = XFS_B_TO_FSBT(mp, range.start);
166 len = XFS_B_TO_FSBT(mp, range.len);
167 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
168
169 start_agno = XFS_FSB_TO_AGNO(mp, start);
170 if (start_agno >= mp->m_sb.sb_agcount)
171 return -XFS_ERROR(EINVAL);
172
173 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
174 if (end_agno >= mp->m_sb.sb_agcount)
175 end_agno = mp->m_sb.sb_agcount - 1;
176
177 for (agno = start_agno; agno <= end_agno; agno++) {
178 error = -xfs_trim_extents(mp, agno, start, len, minlen,
179 &blocks_trimmed);
180 if (error)
181 last_error = error;
182 }
183
184 if (last_error)
185 return last_error;
186
187 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
188 if (copy_to_user(urange, &range, sizeof(range)))
189 return -XFS_ERROR(EFAULT);
190 return 0;
191}
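xfs_ioc_trim() backs the generic FITRIM ioctl, so userspace reaches it as in this hedged sketch (error handling trimmed for brevity):

	/* Userspace sketch: trim all free space on a mounted XFS filesystem. */
	#include <fcntl.h>
	#include <limits.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* struct fstrim_range, FITRIM */

	static int
	trim_fs(int fd)
	{
		struct fstrim_range range = {
			.start	= 0,
			.len	= ULLONG_MAX,	/* whole device, as fstrim does */
			.minlen	= 0,	/* kernel raises this to the discard granularity */
		};

		if (ioctl(fd, FITRIM, &range) < 0)
			return -1;
		/* on success, range.len holds the number of bytes trimmed */
		return 0;
	}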
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..fc0114da7fdd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits, any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
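The helpers keep the i_mutex-before-ilock ordering in one place. The direct I/O read path below uses the exclusive-then-demote pattern, sketched here in isolation (illustrative, mirroring the hunk that follows):

	/* Sketch: exclusive flush, then demote for the actual I/O. */
	xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);	/* takes i_mutex, then the iolock */
	/* ... flush and invalidate the page cache ... */
	xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); /* drops i_mutex, keeps shared iolock */
	/* ... perform the read ... */
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);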
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_lock(&inode->i_mutex);
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
 	if (unlikely(ioflags & IO_ISDIRECT)) {
+		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
 		if (inode->i_mapping->nrpages) {
 			ret = -xfs_flushinval_pages(ip,
 					(iocb->ki_pos & PAGE_CACHE_MASK),
 					-1, FI_REMAPF_LOCKED);
+			if (ret) {
+				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+				return ret;
+			}
 		}
-		mutex_unlock(&inode->i_mutex);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return ret;
-		}
-	}
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+	} else
+		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 
 	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
@@ -285,7 +319,7 @@ xfs_file_aio_read(
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
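
The factored-out xfs_aio_write_isize_update() grows the in-core size with a check/lock/recheck pattern. Reduced to a sketch, with a pthread mutex standing in for the ILOCK (the kernel tolerates the unlocked read of ip->i_size; portable userspace code would want an atomic):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
static long long i_size;    /* stands in for ip->i_size */

/* Cheap unlocked test so the common non-extending write never takes
 * the lock; recheck under the lock to sort out racing extenders. */
static void update_isize(long long pos)
{
	if (pos > i_size) {
		pthread_mutex_lock(&ilock);
		if (pos > i_size)
			i_size = pos;
		pthread_mutex_unlock(&ilock);
	}
}

int main(void)
{
	update_isize(4096);
	update_isize(1024);                 /* no-op: not extending */
	printf("i_size = %lld\n", i_size);  /* 4096 */
	return 0;
}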
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held as indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * If we just got an ENOSPC, flush the inode now that we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
791 * disk before the error occured. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
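
The heart of the new direct IO path is the iolock-mode decision described in the long comment above xfs_file_dio_aio_write(). A standalone sketch of just that decision (the struct fields are illustrative stand-ins for the inode and mount state the kernel consults):

#include <stdio.h>

struct dio {
	unsigned long long pos, count;
	unsigned long long blockmask;   /* fs block size - 1 */
	int nrpages;                    /* cached pages present? */
	int extending;                  /* pos > ip->i_size? */
};

/* Shared iolock for the common aligned case; exclusive when sub-block
 * zeroing, cached pages or EOF extension demand serialisation. */
static int needs_excl_iolock(const struct dio *io)
{
	int unaligned = (io->pos & io->blockmask) ||
			((io->pos + io->count) & io->blockmask);

	return unaligned || io->nrpages || io->extending;
}

int main(void)
{
	struct dio io = { 4096, 4096, 4095, 0, 0 };
	printf("%s\n", needs_excl_iolock(&io) ? "EXCL" : "SHARED");
	return 0;
}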
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
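
With .fallocate now wired into xfs_file_operations, userspace reaches the XFS_IOC_RESVSP/XFS_IOC_UNRESVSP paths above through the plain fallocate(2) system call. A minimal caller (exact hole-punch semantics vary by kernel version):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* preallocate 1MiB without changing i_size -> XFS_IOC_RESVSP */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");

	/* punch a hole in the first 64KiB -> XFS_IOC_UNRESVSP */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 64 << 10) < 0)
		perror("fallocate punch");

	close(fd);
	return 0;
}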
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
32 xfs_off_t last, 32 xfs_off_t last,
33 int fiopt) 33 int fiopt)
34{ 34{
35 struct address_space *mapping = VFS_I(ip)->i_mapping; 35 /* can't toss partial tail pages, so mask them out */
36 36 last &= ~(PAGE_SIZE - 1);
37 if (mapping->nrpages) 37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38 truncate_inode_pages(mapping, first);
39} 38}
40 39
41int 40int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
50 49
51 trace_xfs_pagecache_inval(ip, first, last); 50 trace_xfs_pagecache_inval(ip, first, last);
52 51
53 if (mapping->nrpages) { 52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
54 xfs_iflags_clear(ip, XFS_ITRUNCATED); 53 ret = filemap_write_and_wait_range(mapping, first,
55 ret = filemap_write_and_wait(mapping); 54 last == -1 ? LLONG_MAX : last);
56 if (!ret) 55 if (!ret)
57 truncate_inode_pages(mapping, first); 56 truncate_inode_pages_range(mapping, first, last);
58 }
59 return -ret; 57 return -ret;
60} 58}
61 59
@@ -71,10 +69,9 @@ xfs_flush_pages(
71 int ret = 0; 69 int ret = 0;
72 int ret2; 70 int ret2;
73 71
74 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
75 xfs_iflags_clear(ip, XFS_ITRUNCATED); 73 ret = -filemap_fdatawrite_range(mapping, first,
76 ret = -filemap_fdatawrite(mapping); 74 last == -1 ? LLONG_MAX : last);
77 }
78 if (flags & XBF_ASYNC) 75 if (flags & XBF_ASYNC)
79 return ret; 76 return ret;
80 ret2 = xfs_wait_on_pages(ip, first, last); 77 ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
91{ 88{
92 struct address_space *mapping = VFS_I(ip)->i_mapping; 89 struct address_space *mapping = VFS_I(ip)->i_mapping;
93 90
94 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
95 return -filemap_fdatawait(mapping); 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
96 return 0; 95 return 0;
97} 96}
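
All three helpers now share the convention that last == -1 means "to end of file", which the filemap range APIs express as LLONG_MAX, and xfs_tosspages() additionally rounds the end down to a page boundary. A toy illustration (the PAGE_SIZE value is illustrative):

#include <limits.h>
#include <stdio.h>

#define PAGE_SIZE 4096LL   /* illustrative */

/* -1 -> "to end of file" for the filemap range interfaces */
static long long range_end(long long last)
{
	return last == -1 ? LLONG_MAX : last;
}

int main(void)
{
	long long last = 10000;

	/* xfs_tosspages(): partial tail pages can't be tossed, so
	 * mask the end down to a page boundary first. */
	last &= ~(PAGE_SIZE - 1);
	printf("toss up to %lld, flush up to %lld\n",
	       last - 1, range_end(-1));
	return 0;
}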
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_cred.h"
20#include "xfs_sysctl.h" 19#include "xfs_sysctl.h"
21 20
22/* 21/*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_GLOBALS_H__
19#define __XFS_GLOBALS_H__
20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22
23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..f5e2a19e0f8e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 417 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 418 return PTR_ERR(dentry);
418 419
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 420 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 421 if (!kbuf)
421 goto out_dput; 422 goto out_dput;
422 423
@@ -790,7 +791,7 @@ xfs_ioc_fsgetxattr(
790 xfs_ilock(ip, XFS_ILOCK_SHARED); 791 xfs_ilock(ip, XFS_ILOCK_SHARED);
791 fa.fsx_xflags = xfs_ip2xflags(ip); 792 fa.fsx_xflags = xfs_ip2xflags(ip);
792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 793 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
793 fa.fsx_projid = ip->i_d.di_projid; 794 fa.fsx_projid = xfs_get_projid(ip);
794 795
795 if (attr) { 796 if (attr) {
796 if (ip->i_afp) { 797 if (ip->i_afp) {
@@ -909,10 +910,10 @@ xfs_ioctl_setattr(
909 return XFS_ERROR(EIO); 910 return XFS_ERROR(EIO);
910 911
911 /* 912 /*
912 * Disallow 32bit project ids because on-disk structure 913 * Disallow 32bit project ids when projid32bit feature is not enabled.
913 * is 16bit only.
914 */ 914 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) 915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
916 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
916 return XFS_ERROR(EINVAL); 917 return XFS_ERROR(EINVAL);
917 918
918 /* 919 /*
@@ -961,7 +962,7 @@ xfs_ioctl_setattr(
961 if (mask & FSX_PROJID) { 962 if (mask & FSX_PROJID) {
962 if (XFS_IS_QUOTA_RUNNING(mp) && 963 if (XFS_IS_QUOTA_RUNNING(mp) &&
963 XFS_IS_PQUOTA_ON(mp) && 964 XFS_IS_PQUOTA_ON(mp) &&
964 ip->i_d.di_projid != fa->fsx_projid) { 965 xfs_get_projid(ip) != fa->fsx_projid) {
965 ASSERT(tp); 966 ASSERT(tp);
966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 967 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
967 capable(CAP_FOWNER) ? 968 capable(CAP_FOWNER) ?
@@ -984,10 +985,22 @@ xfs_ioctl_setattr(
984 985
985 /* 986 /*
986 * Extent size must be a multiple of the appropriate block 987 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 988 * size, if set at all. It must also be smaller than the
989 * maximum extent size supported by the filesystem.
990 *
991 * Also, for non-realtime files, limit the extent size hint to
992 * half the size of the AGs in the filesystem so alignment
993 * doesn't result in extents larger than an AG.
988 */ 994 */
989 if (fa->fsx_extsize != 0) { 995 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 996 xfs_extlen_t size;
997 xfs_fsblock_t extsize_fsb;
998
999 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1000 if (extsize_fsb > MAXEXTLEN) {
1001 code = XFS_ERROR(EINVAL);
1002 goto error_return;
1003 }
991 1004
992 if (XFS_IS_REALTIME_INODE(ip) || 1005 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1006 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1009,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1009 mp->m_sb.sb_blocklog;
997 } else { 1010 } else {
998 size = mp->m_sb.sb_blocksize; 1011 size = mp->m_sb.sb_blocksize;
1012 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1013 code = XFS_ERROR(EINVAL);
1014 goto error_return;
1015 }
999 } 1016 }
1000 1017
1001 if (fa->fsx_extsize % size) { 1018 if (fa->fsx_extsize % size) {
@@ -1063,12 +1080,12 @@ xfs_ioctl_setattr(
1063 * Change the ownerships and register quota modifications 1080 * Change the ownerships and register quota modifications
1064 * in the transaction. 1081 * in the transaction.
1065 */ 1082 */
1066 if (ip->i_d.di_projid != fa->fsx_projid) { 1083 if (xfs_get_projid(ip) != fa->fsx_projid) {
1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1084 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1068 olddquot = xfs_qm_vop_chown(tp, ip, 1085 olddquot = xfs_qm_vop_chown(tp, ip,
1069 &ip->i_gdquot, gdqp); 1086 &ip->i_gdquot, gdqp);
1070 } 1087 }
1071 ip->i_d.di_projid = fa->fsx_projid; 1088 xfs_set_projid(ip, fa->fsx_projid);
1072 1089
1073 /* 1090 /*
1074 * We may have to rev the inode as well as 1091 * We may have to rev the inode as well as
@@ -1088,8 +1105,8 @@ xfs_ioctl_setattr(
1088 xfs_diflags_to_linux(ip); 1105 xfs_diflags_to_linux(ip);
1089 } 1106 }
1090 1107
1108 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1091 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1109 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1092 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1093 1110
1094 XFS_STATS_INC(xs_ig_attrchg); 1111 XFS_STATS_INC(xs_ig_attrchg);
1095 1112
@@ -1294,6 +1311,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1311 trace_xfs_file_ioctl(ip);
1295 1312
1296 switch (cmd) { 1313 switch (cmd) {
1314 case FITRIM:
1315 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1316 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1317 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1318 case XFS_IOC_RESVSP:
@@ -1301,7 +1320,8 @@ xfs_file_ioctl(
1301 case XFS_IOC_ALLOCSP64: 1320 case XFS_IOC_ALLOCSP64:
1302 case XFS_IOC_FREESP64: 1321 case XFS_IOC_FREESP64:
1303 case XFS_IOC_RESVSP64: 1322 case XFS_IOC_RESVSP64:
1304 case XFS_IOC_UNRESVSP64: { 1323 case XFS_IOC_UNRESVSP64:
1324 case XFS_IOC_ZERO_RANGE: {
1305 xfs_flock64_t bf; 1325 xfs_flock64_t bf;
1306 1326
1307 if (copy_from_user(&bf, arg, sizeof(bf))) 1327 if (copy_from_user(&bf, arg, sizeof(bf)))
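
The new extent size hint checks in xfs_ioctl_setattr() can be collected into one predicate. A simplified sketch: it divides where the kernel uses the round-up XFS_B_TO_FSB conversion, ignores the realtime extent size, and the MAXEXTLEN value mirrors the on-disk 21-bit extent length limit:

#include <stdint.h>

#define MAXEXTLEN 2097151   /* 2^21 - 1 blocks */

static int extsize_valid(uint32_t extsize_bytes, uint32_t blocksize,
			 uint32_t agblocks, int realtime)
{
	uint64_t extsize_fsb = extsize_bytes / blocksize;

	if (extsize_bytes == 0)
		return 1;                 /* no hint: nothing to check */
	if (extsize_fsb > MAXEXTLEN)
		return 0;                 /* larger than any extent */
	if (!realtime && extsize_fsb > agblocks / 2)
		return 0;                 /* must fit inside an AG */
	return (extsize_bytes % blocksize) == 0;  /* whole blocks only */
}

int main(void)
{
	/* 1MiB hint on a 4KiB-block filesystem with 1M-block AGs */
	return !extsize_valid(1 << 20, 4096, 1 << 20, 0);
}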
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..b3486dfa5520 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) || 164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
165 get_user(bstat->bs_extents, &bstat32->bs_extents) || 165 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
166 get_user(bstat->bs_gen, &bstat32->bs_gen) || 166 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
167 get_user(bstat->bs_projid, &bstat32->bs_projid) || 167 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
168 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
168 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
169 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
170 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
219 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
220 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
222 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
221 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 223 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
222 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 224 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
223 put_user(buffer->bs_aextents, &p32->bs_aextents)) 225 put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
574 case XFS_IOC_FSGEOMETRY_V1: 576 case XFS_IOC_FSGEOMETRY_V1:
575 case XFS_IOC_FSGROWFSDATA: 577 case XFS_IOC_FSGROWFSDATA:
576 case XFS_IOC_FSGROWFSRT: 578 case XFS_IOC_FSGROWFSRT:
579 case XFS_IOC_ZERO_RANGE:
577 return xfs_file_ioctl(filp, cmd, p); 580 return xfs_file_ioctl(filp, cmd, p);
578#else 581#else
579 case XFS_IOC_ALLOCSP_32: 582 case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..08b605792a99 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
65 __s32 bs_extsize; /* extent size */ 65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */ 66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69 unsigned char bs_pad[14]; /* pad space, unused */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */ 72 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */ 73 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */ 74 __u16 bs_aextents; /* attribute number of extents */
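
Splitting bs_projid into lo/hi halves keeps old binaries working (the low word sits exactly where the whole id used to be) while extending project ids to 32 bits. A sketch of how the halves combine, in the spirit of the xfs_get_projid()/xfs_set_projid() helpers referenced in the xfs_ioctl.c hunks above (this is a stand-in, not the kernel source):

#include <stdint.h>

typedef uint32_t prid_t;

struct di_projid {
	uint16_t di_projid_lo;  /* was the whole di_projid */
	uint16_t di_projid_hi;  /* stolen from the old pad space */
};

static prid_t get_projid(const struct di_projid *d)
{
	return (prid_t)d->di_projid_hi << 16 | d->di_projid_lo;
}

static void set_projid(struct di_projid *d, prid_t projid)
{
	d->di_projid_hi = (uint16_t)(projid >> 16);
	d->di_projid_lo = (uint16_t)(projid & 0xffff);
}

int main(void)
{
	struct di_projid d;

	set_projid(&d, 0x00012345);
	return get_projid(&d) == 0x00012345 ? 0 : 1;
}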
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -95,41 +94,6 @@ xfs_mark_inode_dirty(
95} 94}
96 95
97/* 96/*
98 * Change the requested timestamp in the given inode.
99 * We don't lock across timestamp updates, and we don't log them but
100 * we do record the fact that there is dirty information in core.
101 */
102void
103xfs_ichgtime(
104 xfs_inode_t *ip,
105 int flags)
106{
107 struct inode *inode = VFS_I(ip);
108 timespec_t tv;
109 int sync_it = 0;
110
111 tv = current_fs_time(inode->i_sb);
112
113 if ((flags & XFS_ICHGTIME_MOD) &&
114 !timespec_equal(&inode->i_mtime, &tv)) {
115 inode->i_mtime = tv;
116 sync_it = 1;
117 }
118 if ((flags & XFS_ICHGTIME_CHG) &&
119 !timespec_equal(&inode->i_ctime, &tv)) {
120 inode->i_ctime = tv;
121 sync_it = 1;
122 }
123
124 /*
125 * Update complete - now make sure everyone knows that the inode
126 * is dirty.
127 */
128 if (sync_it)
129 xfs_mark_inode_dirty_sync(ip);
130}
131
132/*
133 * Hook in SELinux. This is not quite correct yet, what we really need 97 * Hook in SELinux. This is not quite correct yet, what we really need
134 * here (as we do for default ACLs) is a mechanism by which creation of 98 * here (as we do for default ACLs) is a mechanism by which creation of
135 * these attrs can be journalled at inode creation time (along with the 99 * these attrs can be journalled at inode creation time (along with the
@@ -224,7 +188,7 @@ xfs_vn_mknod(
224 } 188 }
225 189
226 xfs_dentry_to_name(&name, dentry); 190 xfs_dentry_to_name(&name, dentry);
227 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 191 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
228 if (unlikely(error)) 192 if (unlikely(error))
229 goto out_free_acl; 193 goto out_free_acl;
230 194
@@ -352,7 +316,7 @@ xfs_vn_link(
352 if (unlikely(error)) 316 if (unlikely(error))
353 return -error; 317 return -error;
354 318
355 atomic_inc(&inode->i_count); 319 ihold(inode);
356 d_instantiate(dentry, inode); 320 d_instantiate(dentry, inode);
357 return 0; 321 return 0;
358} 322}
@@ -397,7 +361,7 @@ xfs_vn_symlink(
397 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 361 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
398 xfs_dentry_to_name(&name, dentry); 362 xfs_dentry_to_name(&name, dentry);
399 363
400 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 364 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
401 if (unlikely(error)) 365 if (unlikely(error))
402 goto out; 366 goto out;
403 367
@@ -540,58 +504,6 @@ xfs_vn_setattr(
540 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
541} 505}
542 506
543STATIC long
544xfs_vn_fallocate(
545 struct inode *inode,
546 int mode,
547 loff_t offset,
548 loff_t len)
549{
550 long error;
551 loff_t new_size = 0;
552 xfs_flock64_t bf;
553 xfs_inode_t *ip = XFS_I(inode);
554
555 /* preallocation on directories not yet supported */
556 error = -ENODEV;
557 if (S_ISDIR(inode->i_mode))
558 goto out_error;
559
560 bf.l_whence = 0;
561 bf.l_start = offset;
562 bf.l_len = len;
563
564 xfs_ilock(ip, XFS_IOLOCK_EXCL);
565
566 /* check the new inode size is valid before allocating */
567 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
568 offset + len > i_size_read(inode)) {
569 new_size = offset + len;
570 error = inode_newsize_ok(inode, new_size);
571 if (error)
572 goto out_unlock;
573 }
574
575 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
576 0, XFS_ATTR_NOLOCK);
577 if (error)
578 goto out_unlock;
579
580 /* Change file size if needed */
581 if (new_size) {
582 struct iattr iattr;
583
584 iattr.ia_valid = ATTR_SIZE;
585 iattr.ia_size = new_size;
586 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
587 }
588
589out_unlock:
590 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
591out_error:
592 return error;
593}
594
595#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
596 508
597/* 509/*
@@ -685,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
685 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
686 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
687 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
688 .fallocate = xfs_vn_fallocate,
689 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
690}; 601};
691 602
@@ -795,7 +706,10 @@ xfs_setup_inode(
795 706
796 inode->i_ino = ip->i_ino; 707 inode->i_ino = ip->i_ino;
797 inode->i_state = I_NEW; 708 inode->i_state = I_NEW;
798 inode_add_to_lists(ip->i_mount->m_super, inode); 709
710 inode_sb_list_add(inode);
711 /* make the inode look hashed for the writeback code */
712 hlist_add_fake(&inode->i_hash);
799 713
800 inode->i_mode = ip->i_d.di_mode; 714 inode->i_mode = ip->i_d.di_mode;
801 inode->i_nlink = ip->i_d.di_nlink; 715 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..096494997747 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
@@ -71,6 +70,7 @@
71#include <linux/random.h> 70#include <linux/random.h>
72#include <linux/ctype.h> 71#include <linux/ctype.h>
73#include <linux/writeback.h> 72#include <linux/writeback.h>
73#include <linux/capability.h>
74 74
75#include <asm/page.h> 75#include <asm/page.h>
76#include <asm/div64.h> 76#include <asm/div64.h>
@@ -79,14 +79,12 @@
79#include <asm/byteorder.h> 79#include <asm/byteorder.h>
80#include <asm/unaligned.h> 80#include <asm/unaligned.h>
81 81
82#include <xfs_cred.h>
83#include <xfs_vnode.h> 82#include <xfs_vnode.h>
84#include <xfs_stats.h> 83#include <xfs_stats.h>
85#include <xfs_sysctl.h> 84#include <xfs_sysctl.h>
86#include <xfs_iops.h> 85#include <xfs_iops.h>
87#include <xfs_aops.h> 86#include <xfs_aops.h>
88#include <xfs_super.h> 87#include <xfs_super.h>
89#include <xfs_globals.h>
90#include <xfs_buf.h> 88#include <xfs_buf.h>
91 89
92/* 90/*
@@ -144,7 +142,7 @@
144#define SYNCHRONIZE() barrier() 142#define SYNCHRONIZE() barrier()
145#define __return_address __builtin_return_address(0) 143#define __return_address __builtin_return_address(0)
146 144
147#define dfltprid 0 145#define XFS_PROJID_DEFAULT 0
148#define MAXPATHLEN 1024 146#define MAXPATHLEN 1024
149 147
150#define MIN(a,b) (min(a,b)) 148#define MIN(a,b) (min(a,b))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
44#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
45#include "xfs_utils.h" 45#include "xfs_utils.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_version.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_trans_priv.h" 48#include "xfs_trans_priv.h"
50#include "xfs_filestream.h" 49#include "xfs_filestream.h"
@@ -354,9 +353,6 @@ xfs_parseargs(
354 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 353 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
356 mp->m_flags |= XFS_MOUNT_DELAYLOG; 355 mp->m_flags |= XFS_MOUNT_DELAYLOG;
357 cmn_err(CE_WARN,
358 "Enabling EXPERIMENTAL delayed logging feature "
359 "- use at your own risk.\n");
360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
362 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
@@ -577,7 +573,7 @@ xfs_max_file_offset(
577 573
578 /* Figure out maximum filesize, on Linux this can depend on 574 /* Figure out maximum filesize, on Linux this can depend on
579 * the filesystem blocksize (on 32 bit platforms). 575 * the filesystem blocksize (on 32 bit platforms).
580 * __block_prepare_write does this in an [unsigned] long... 576 * __block_write_begin does this in an [unsigned] long...
581 * page->index << (PAGE_CACHE_SHIFT - bbits) 577 * page->index << (PAGE_CACHE_SHIFT - bbits)
582 * So, for page sized blocks (4K on 32 bit platforms), 578 * So, for page sized blocks (4K on 32 bit platforms),
583 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 579 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -610,7 +606,8 @@ xfs_blkdev_get(
610{ 606{
611 int error = 0; 607 int error = 0;
612 608
613 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 609 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
610 mp);
614 if (IS_ERR(*bdevp)) { 611 if (IS_ERR(*bdevp)) {
615 error = PTR_ERR(*bdevp); 612 error = PTR_ERR(*bdevp);
616 printk("XFS: Invalid device [%s], error=%d\n", name, error); 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -624,7 +621,7 @@ xfs_blkdev_put(
624 struct block_device *bdev) 621 struct block_device *bdev)
625{ 622{
626 if (bdev) 623 if (bdev)
627 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 624 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
628} 625}
629 626
630/* 627/*
@@ -645,7 +642,7 @@ xfs_barrier_test(
645 XFS_BUF_ORDERED(sbp); 642 XFS_BUF_ORDERED(sbp);
646 643
647 xfsbdstrat(mp, sbp); 644 xfsbdstrat(mp, sbp);
648 error = xfs_iowait(sbp); 645 error = xfs_buf_iowait(sbp);
649 646
650 /* 647 /*
651 * Clear all the flags we set and possible error state in the 648 * Clear all the flags we set and possible error state in the
@@ -693,8 +690,7 @@ void
693xfs_blkdev_issue_flush( 690xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 691 xfs_buftarg_t *buftarg)
695{ 692{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 693 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 694}
699 695
700STATIC void 696STATIC void
@@ -758,18 +754,20 @@ xfs_open_devices(
758 * Setup xfs_mount buffer target pointers 754 * Setup xfs_mount buffer target pointers
759 */ 755 */
760 error = ENOMEM; 756 error = ENOMEM;
761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); 757 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
762 if (!mp->m_ddev_targp) 758 if (!mp->m_ddev_targp)
763 goto out_close_rtdev; 759 goto out_close_rtdev;
764 760
765 if (rtdev) { 761 if (rtdev) {
766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); 762 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
763 mp->m_fsname);
767 if (!mp->m_rtdev_targp) 764 if (!mp->m_rtdev_targp)
768 goto out_free_ddev_targ; 765 goto out_free_ddev_targ;
769 } 766 }
770 767
771 if (logdev && logdev != ddev) { 768 if (logdev && logdev != ddev) {
772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); 769 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
770 mp->m_fsname);
773 if (!mp->m_logdev_targp) 771 if (!mp->m_logdev_targp)
774 goto out_free_rtdev_targ; 772 goto out_free_rtdev_targ;
775 } else { 773 } else {
@@ -837,8 +835,11 @@ xfsaild_wakeup(
837 struct xfs_ail *ailp, 835 struct xfs_ail *ailp,
838 xfs_lsn_t threshold_lsn) 836 xfs_lsn_t threshold_lsn)
839{ 837{
840 ailp->xa_target = threshold_lsn; 838 /* only ever move the target forwards */
841 wake_up_process(ailp->xa_task); 839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
842} 843}
843 844
844STATIC int 845STATIC int
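
The xfsaild_wakeup() change is a monotonic-target rule: a caller presenting an older LSN must not drag the push target backwards. Sketch, with XFS_LSN_CMP reduced to a plain signed compare:

#include <stdint.h>
#include <stdio.h>

static int64_t xa_target;

/* only ever move the target forwards */
static int push_target(int64_t threshold_lsn)
{
	if (threshold_lsn > xa_target) {
		xa_target = threshold_lsn;
		return 1;   /* caller wakes the aild thread */
	}
	return 0;           /* stale wakeup: nothing to do */
}

int main(void)
{
	printf("%d %d\n", push_target(100), push_target(50)); /* 1 0 */
	return 0;
}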
@@ -850,8 +851,17 @@ xfsaild(
850 long tout = 0; /* milliseconds */ 851 long tout = 0; /* milliseconds */
851 852
852 while (!kthread_should_stop()) { 853 while (!kthread_should_stop()) {
853 schedule_timeout_interruptible(tout ? 854 /*
854 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
855 865
856 /* swsusp */ 866 /* swsusp */
857 try_to_freeze(); 867 try_to_freeze();
@@ -938,7 +948,7 @@ out_reclaim:
938 * Slab object creation initialisation for the XFS inode. 948 * Slab object creation initialisation for the XFS inode.
939 * This covers only the idempotent fields in the XFS inode; 949 * This covers only the idempotent fields in the XFS inode;
940 * all other fields need to be initialised on allocation 950 * all other fields need to be initialised on allocation
941 * from the slab. This avoids the need to repeatedly intialise 951 * from the slab. This avoids the need to repeatedly initialise
942 * fields in the xfs inode that are left in the initialise state 952 * fields in the xfs inode that are left in the initialise state
943 * when freeing the inode. 953 * when freeing the inode.
944 */ 954 */
@@ -972,12 +982,7 @@ xfs_fs_inode_init_once(
972 982
973/* 983/*
974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 984 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
975 * we catch unlogged VFS level updates to the inode. Care must be taken 985 * we catch unlogged VFS level updates to the inode.
976 * here - the transaction code calls mark_inode_dirty_sync() to mark the
977 * VFS inode dirty in a transaction and clears the i_update_core field;
978 * it must clear the field after calling mark_inode_dirty_sync() to
979 * correctly indicate that the dirty state has been propagated into the
980 * inode log item.
981 * 986 *
982 * We need the barrier() to maintain correct ordering between unlogged 987 * We need the barrier() to maintain correct ordering between unlogged
983 * updates and the transaction commit code that clears the i_update_core 988 * updates and the transaction commit code that clears the i_update_core
@@ -1126,6 +1131,8 @@ xfs_fs_evict_inode(
1126 */ 1131 */
1127 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1132 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1128 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1133 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1134 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1135 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1129 1136
1130 xfs_inactive(ip); 1137 xfs_inactive(ip);
1131} 1138}
@@ -1407,7 +1414,7 @@ xfs_fs_freeze(
1407 1414
1408 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1409 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1411} 1418}
1412 1419
1413STATIC int 1420STATIC int
@@ -1521,8 +1528,9 @@ xfs_fs_fill_super(
1521 if (error) 1528 if (error)
1522 goto out_free_fsname; 1529 goto out_free_fsname;
1523 1530
1524 if (xfs_icsb_init_counters(mp)) 1531 error = xfs_icsb_init_counters(mp);
1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1532 if (error)
1533 goto out_close_devices;
1526 1534
1527 error = xfs_readsb(mp, flags); 1535 error = xfs_readsb(mp, flags);
1528 if (error) 1536 if (error)
@@ -1583,6 +1591,7 @@ xfs_fs_fill_super(
1583 xfs_freesb(mp); 1591 xfs_freesb(mp);
1584 out_destroy_counters: 1592 out_destroy_counters:
1585 xfs_icsb_destroy_counters(mp); 1593 xfs_icsb_destroy_counters(mp);
1594 out_close_devices:
1586 xfs_close_devices(mp); 1595 xfs_close_devices(mp);
1587 out_free_fsname: 1596 out_free_fsname:
1588 xfs_free_fsname(mp); 1597 xfs_free_fsname(mp);
@@ -1612,16 +1621,14 @@ xfs_fs_fill_super(
1612 goto out_free_sb; 1621 goto out_free_sb;
1613} 1622}
1614 1623
1615STATIC int 1624STATIC struct dentry *
1616xfs_fs_get_sb( 1625xfs_fs_mount(
1617 struct file_system_type *fs_type, 1626 struct file_system_type *fs_type,
1618 int flags, 1627 int flags,
1619 const char *dev_name, 1628 const char *dev_name,
1620 void *data, 1629 void *data)
1621 struct vfsmount *mnt)
1622{ 1630{
1623 return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, 1631 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
1624 mnt);
1625} 1632}
1626 1633
1627static const struct super_operations xfs_super_operations = { 1634static const struct super_operations xfs_super_operations = {
@@ -1642,7 +1649,7 @@ static const struct super_operations xfs_super_operations = {
1642static struct file_system_type xfs_fs_type = { 1649static struct file_system_type xfs_fs_type = {
1643 .owner = THIS_MODULE, 1650 .owner = THIS_MODULE,
1644 .name = "xfs", 1651 .name = "xfs",
1645 .get_sb = xfs_fs_get_sb, 1652 .mount = xfs_fs_mount,
1646 .kill_sb = kill_block_super, 1653 .kill_sb = kill_block_super,
1647 .fs_flags = FS_REQUIRES_DEV, 1654 .fs_flags = FS_REQUIRES_DEV,
1648}; 1655};
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
62# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
63#endif 63#endif
64 64
65#define XFS_VERSION_STRING "SGI XFS"
65#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
66 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
67 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,42 +39,59 @@
39#include <linux/kthread.h> 39#include <linux/kthread.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41 41
42/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between
45 * lookup reduction and stack usage. This is in the reclaim path, so we can't
46 * be too greedy.
47 */
48#define XFS_LOOKUP_BATCH 32
42 49
43STATIC xfs_inode_t * 50STATIC int
44xfs_inode_ag_lookup( 51xfs_inode_ag_walk_grab(
45 struct xfs_mount *mp, 52 struct xfs_inode *ip)
46 struct xfs_perag *pag,
47 uint32_t *first_index,
48 int tag)
49{ 53{
50 int nr_found; 54 struct inode *inode = VFS_I(ip);
51 struct xfs_inode *ip; 55
56 ASSERT(rcu_read_lock_held());
52 57
53 /* 58 /*
54 * use a gang lookup to find the next inode in the tree 59 * check for stale RCU freed inode
55 * as the tree is sparse and a gang lookup walks to find 60 *
56 * the number of objects requested. 61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and still being
65 * initialised, the XFS_INEW check below will catch it.
57 */ 66 */
58 if (tag == XFS_ICI_NO_TAG) { 67 spin_lock(&ip->i_flags_lock);
59 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 68 if (!ip->i_ino)
60 (void **)&ip, *first_index, 1); 69 goto out_unlock_noent;
61 } else { 70
62 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
63 (void **)&ip, *first_index, 1, tag); 72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
76 /* nothing to sync during shutdown */
77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
78 return EFSCORRUPTED;
79
80 /* If we can't grab the inode, it must be on its way to reclaim. */
81 if (!igrab(inode))
82 return ENOENT;
83
84 if (is_bad_inode(inode)) {
85 IRELE(ip);
86 return ENOENT;
64 } 87 }
65 if (!nr_found)
66 return NULL;
67 88
68 /* 89 /* inode is valid */
69 * Update the index for the next lookup. Catch overflows 90 return 0;
70 * into the next AG range which can occur if we have inodes 91
71 * in the last block of the AG and we are currently 92out_unlock_noent:
72 * pointing to the last inode. 93 spin_unlock(&ip->i_flags_lock);
73 */ 94 return ENOENT;
74 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
75 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
76 return NULL;
77 return ip;
78} 95}
79 96
80STATIC int 97STATIC int
@@ -83,49 +100,83 @@ xfs_inode_ag_walk(
83 struct xfs_perag *pag, 100 struct xfs_perag *pag,
84 int (*execute)(struct xfs_inode *ip, 101 int (*execute)(struct xfs_inode *ip,
85 struct xfs_perag *pag, int flags), 102 struct xfs_perag *pag, int flags),
86 int flags, 103 int flags)
87 int tag,
88 int exclusive,
89 int *nr_to_scan)
90{ 104{
91 uint32_t first_index; 105 uint32_t first_index;
92 int last_error = 0; 106 int last_error = 0;
93 int skipped; 107 int skipped;
108 int done;
109 int nr_found;
94 110
95restart: 111restart:
112 done = 0;
96 skipped = 0; 113 skipped = 0;
97 first_index = 0; 114 first_index = 0;
115 nr_found = 0;
98 do { 116 do {
117 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
99 int error = 0; 118 int error = 0;
100 xfs_inode_t *ip; 119 int i;
101 120
102 if (exclusive) 121 rcu_read_lock();
103 write_lock(&pag->pag_ici_lock); 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
104 else 123 (void **)batch, first_index,
105 read_lock(&pag->pag_ici_lock); 124 XFS_LOOKUP_BATCH);
106 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 125 if (!nr_found) {
107 if (!ip) { 126 rcu_read_unlock();
108 if (exclusive)
109 write_unlock(&pag->pag_ici_lock);
110 else
111 read_unlock(&pag->pag_ici_lock);
112 break; 127 break;
113 } 128 }
114 129
115 /* execute releases pag->pag_ici_lock */ 130 /*
116 error = execute(ip, pag, flags); 131 * Grab the inodes before we drop the lock. if we found
117 if (error == EAGAIN) { 132 * nothing, nr == 0 and the loop will be skipped.
118 skipped++; 133 */
119 continue; 134 for (i = 0; i < nr_found; i++) {
135 struct xfs_inode *ip = batch[i];
136
137 if (done || xfs_inode_ag_walk_grab(ip))
138 batch[i] = NULL;
139
140 /*
141 * Update the index for the next lookup. Catch
142 * overflows into the next AG range which can occur if
143 * we have inodes in the last block of the AG and we
144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
156 done = 1;
157 }
158
159 /* unlock now we've grabbed the inodes. */
160 rcu_read_unlock();
161
162 for (i = 0; i < nr_found; i++) {
163 if (!batch[i])
164 continue;
165 error = execute(batch[i], pag, flags);
166 IRELE(batch[i]);
167 if (error == EAGAIN) {
168 skipped++;
169 continue;
170 }
171 if (error && last_error != EFSCORRUPTED)
172 last_error = error;
120 } 173 }
121 if (error)
122 last_error = error;
123 174
124 /* bail out if the filesystem is corrupted. */ 175 /* bail out if the filesystem is corrupted. */
125 if (error == EFSCORRUPTED) 176 if (error == EFSCORRUPTED)
126 break; 177 break;
127 178
128 } while ((*nr_to_scan)--); 179 } while (nr_found && !done);
129 180
130 if (skipped) { 181 if (skipped) {
131 delay(1); 182 delay(1);
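
The rewritten walk batches radix tree lookups under rcu_read_lock(), grabs references while still under it, then drops the lock before doing any real work. A compilable toy of just that control flow (a plain array stands in for the radix tree, and the RCU calls are no-op stand-ins; reference counting and the wrong-AG check are elided):

#include <stdio.h>

#define LOOKUP_BATCH 4
#define NOBJ 10

#define rcu_read_lock()    ((void)0)   /* stand-in */
#define rcu_read_unlock()  ((void)0)   /* stand-in */

/* "gang lookup": fill batch[] with up to max indices >= first */
static int lookup_batch(int *batch, unsigned first, int max)
{
	int nr = 0;

	for (unsigned i = first; i < NOBJ && nr < max; i++)
		batch[nr++] = i;
	return nr;
}

int main(void)
{
	unsigned first_index = 0;
	int nr;

	do {
		int batch[LOOKUP_BATCH], i;

		rcu_read_lock();
		nr = lookup_batch(batch, first_index, LOOKUP_BATCH);
		if (!nr) {
			rcu_read_unlock();
			break;
		}
		/* advance the cursor while still "under RCU" */
		first_index = batch[nr - 1] + 1;
		rcu_read_unlock();

		for (i = 0; i < nr; i++)   /* work done outside the lock */
			printf("processing inode %d\n", batch[i]);
	} while (nr);
	return 0;
}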
@@ -134,110 +185,32 @@ restart:
134 return last_error; 185 return last_error;
135} 186}
136 187
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
172int 188int
173xfs_inode_ag_iterator( 189xfs_inode_ag_iterator(
174 struct xfs_mount *mp, 190 struct xfs_mount *mp,
175 int (*execute)(struct xfs_inode *ip, 191 int (*execute)(struct xfs_inode *ip,
176 struct xfs_perag *pag, int flags), 192 struct xfs_perag *pag, int flags),
177 int flags, 193 int flags)
178 int tag,
179 int exclusive,
180 int *nr_to_scan)
181{ 194{
182 struct xfs_perag *pag; 195 struct xfs_perag *pag;
183 int error = 0; 196 int error = 0;
184 int last_error = 0; 197 int last_error = 0;
185 xfs_agnumber_t ag; 198 xfs_agnumber_t ag;
186 int nr;
187 199
188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 ag = 0; 200 ag = 0;
190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 201 while ((pag = xfs_perag_get(mp, ag))) {
191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 202 ag = pag->pag_agno + 1;
192 exclusive, &nr); 203 error = xfs_inode_ag_walk(mp, pag, execute, flags);
193 xfs_perag_put(pag); 204 xfs_perag_put(pag);
194 if (error) { 205 if (error) {
195 last_error = error; 206 last_error = error;
196 if (error == EFSCORRUPTED) 207 if (error == EFSCORRUPTED)
197 break; 208 break;
198 } 209 }
199 if (nr <= 0)
200 break;
201 } 210 }
202 if (nr_to_scan)
203 *nr_to_scan = nr;
204 return XFS_ERROR(last_error); 211 return XFS_ERROR(last_error);
205} 212}
206 213
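With the tag, exclusivity and scan-count parameters gone, the iterator above reduces to a plain cursor walk: fetch the perag at the cursor, advance the cursor past it, run the callback, release. The same idiom in miniature (get_entry() and put_entry() are hypothetical stand-ins for xfs_perag_get()/xfs_perag_put(); get_entry() is assumed to return NULL once the index runs past the last entry):

    struct pag { unsigned int agno; /* ... */ };

    struct pag *get_entry(unsigned int index);      /* NULL past the end */
    void put_entry(struct pag *p);

    static void walk_all(void)
    {
            struct pag *p;
            unsigned int cursor = 0;

            while ((p = get_entry(cursor))) {
                    cursor = p->agno + 1;   /* advance past this entry */
                    /* ... operate on p ... */
                    put_entry(p);
            }
    }
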
207/* must be called with pag_ici_lock held and releases it */
208int
209xfs_sync_inode_valid(
210 struct xfs_inode *ip,
211 struct xfs_perag *pag)
212{
213 struct inode *inode = VFS_I(ip);
214 int error = EFSCORRUPTED;
215
216 /* nothing to sync during shutdown */
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 goto out_unlock;
219
220 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
221 error = ENOENT;
222 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
223 goto out_unlock;
224
225 /* If we can't grab the inode, it must be on its way to reclaim. */
226 if (!igrab(inode))
227 goto out_unlock;
228
229 if (is_bad_inode(inode)) {
230 IRELE(ip);
231 goto out_unlock;
232 }
233
234 /* inode is valid */
235 error = 0;
236out_unlock:
237 read_unlock(&pag->pag_ici_lock);
238 return error;
239}
240
241STATIC int 214STATIC int
242xfs_sync_inode_data( 215xfs_sync_inode_data(
243 struct xfs_inode *ip, 216 struct xfs_inode *ip,
@@ -248,10 +221,6 @@ xfs_sync_inode_data(
248 struct address_space *mapping = inode->i_mapping; 221 struct address_space *mapping = inode->i_mapping;
249 int error = 0; 222 int error = 0;
250 223
251 error = xfs_sync_inode_valid(ip, pag);
252 if (error)
253 return error;
254
255 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 224 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
256 goto out_wait; 225 goto out_wait;
257 226
@@ -268,7 +237,6 @@ xfs_sync_inode_data(
268 out_wait: 237 out_wait:
269 if (flags & SYNC_WAIT) 238 if (flags & SYNC_WAIT)
270 xfs_ioend_wait(ip); 239 xfs_ioend_wait(ip);
271 IRELE(ip);
272 return error; 240 return error;
273} 241}
274 242
@@ -280,10 +248,6 @@ xfs_sync_inode_attr(
280{ 248{
281 int error = 0; 249 int error = 0;
282 250
283 error = xfs_sync_inode_valid(ip, pag);
284 if (error)
285 return error;
286
287 xfs_ilock(ip, XFS_ILOCK_SHARED); 251 xfs_ilock(ip, XFS_ILOCK_SHARED);
288 if (xfs_inode_clean(ip)) 252 if (xfs_inode_clean(ip))
289 goto out_unlock; 253 goto out_unlock;
@@ -302,7 +266,6 @@ xfs_sync_inode_attr(
302 266
303 out_unlock: 267 out_unlock:
304 xfs_iunlock(ip, XFS_ILOCK_SHARED); 268 xfs_iunlock(ip, XFS_ILOCK_SHARED);
305 IRELE(ip);
306 return error; 269 return error;
307} 270}
308 271
@@ -318,8 +281,7 @@ xfs_sync_data(
318 281
319 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 282 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320 283
321 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 284 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
322 XFS_ICI_NO_TAG, 0, NULL);
323 if (error) 285 if (error)
324 return XFS_ERROR(error); 286 return XFS_ERROR(error);
325 287
@@ -337,8 +299,7 @@ xfs_sync_attr(
337{ 299{
338 ASSERT((flags & ~SYNC_WAIT) == 0); 300 ASSERT((flags & ~SYNC_WAIT) == 0);
339 301
340 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 302 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
341 XFS_ICI_NO_TAG, 0, NULL);
342} 303}
343 304
344STATIC int 305STATIC int
@@ -401,7 +362,7 @@ xfs_quiesce_data(
401 362
402 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
403 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
405 366
406 /* flush data-only devices */ 367 /* flush data-only devices */
407 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -542,13 +503,14 @@ xfs_sync_worker(
542 int error; 503 int error;
543 504
544 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
545 xfs_log_force(mp, 0);
546 xfs_reclaim_inodes(mp, 0);
547 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
549 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
550 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
552 } 514 }
553 mp->m_sync_seq++; 515 mp->m_sync_seq++;
554 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
@@ -659,12 +621,12 @@ xfs_inode_set_reclaim_tag(
659 struct xfs_perag *pag; 621 struct xfs_perag *pag;
660 622
661 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
662 write_lock(&pag->pag_ici_lock); 624 spin_lock(&pag->pag_ici_lock);
663 spin_lock(&ip->i_flags_lock); 625 spin_lock(&ip->i_flags_lock);
664 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_inode_set_reclaim_tag(pag, ip);
665 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
666 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&ip->i_flags_lock);
667 write_unlock(&pag->pag_ici_lock); 629 spin_unlock(&pag->pag_ici_lock);
668 xfs_perag_put(pag); 630 xfs_perag_put(pag);
669} 631}
670 632
@@ -698,6 +660,53 @@ __xfs_inode_clear_reclaim_tag(
698} 660}
699 661
700/* 662/*
663 * Grab the inode for reclaim exclusively.
664 * Return 0 if we grabbed it, non-zero otherwise.
665 */
666STATIC int
667xfs_reclaim_inode_grab(
668 struct xfs_inode *ip,
669 int flags)
670{
671 ASSERT(rcu_read_lock_held());
672
673 /* quick check for stale RCU freed inode */
674 if (!ip->i_ino)
675 return 1;
676
677 /*
678 * Do some unlocked checks first to avoid unnecessary lock traffic.
679 * The first is a flush lock check, the second is an already-in-reclaim
680 * check. Only do these checks if we are not going to block on locks.
681 */
682 if ((flags & SYNC_TRYLOCK) &&
683 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
684 return 1;
685 }
686
687 /*
688 * The radix tree lock here protects a thread in xfs_iget from racing
689 * with us starting reclaim on the inode. Once we have the
690 * XFS_IRECLAIM flag set it will not touch us.
691 *
692 * Due to RCU lookup, we may find inodes that have been freed and only
693 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
694 * aren't candidates for reclaim at all, so we must check that
695 * XFS_IRECLAIMABLE is set before proceeding to reclaim.
696 */
697 spin_lock(&ip->i_flags_lock);
698 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
699 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
700 /* not a reclaim candidate. */
701 spin_unlock(&ip->i_flags_lock);
702 return 1;
703 }
704 __xfs_iflags_set(ip, XFS_IRECLAIM);
705 spin_unlock(&ip->i_flags_lock);
706 return 0;
707}
708
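xfs_reclaim_inode_grab() is an instance of the unlocked-check-then-locked-recheck pattern: cheap tests without the lock weed out obvious non-candidates, and the verdict is revalidated under i_flags_lock before XFS_IRECLAIM is set. A userspace sketch of the same shape, with pthreads standing in for the kernel spinlock (types, flag names and the racy fast-path test are all hypothetical):

    #include <pthread.h>

    #define OBJ_CLAIMABLE   0x1
    #define OBJ_CLAIMED     0x2

    struct obj {
            pthread_mutex_t lock;
            int flags;
    };

    /* Return 0 if we claimed the object exclusively, nonzero otherwise. */
    static int try_claim(struct obj *o)
    {
            /* unlocked fast path: skip obvious non-candidates cheaply */
            if (o->flags & OBJ_CLAIMED)
                    return 1;

            pthread_mutex_lock(&o->lock);
            /* recheck under the lock; the unlocked test may have raced */
            if (!(o->flags & OBJ_CLAIMABLE) || (o->flags & OBJ_CLAIMED)) {
                    pthread_mutex_unlock(&o->lock);
                    return 1;
            }
            o->flags |= OBJ_CLAIMED;        /* now exclusive */
            pthread_mutex_unlock(&o->lock);
            return 0;
    }
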
709/*
701 * Inodes in different states need to be treated differently, and the return 710 * Inodes in different states need to be treated differently, and the return
702 * value of xfs_iflush is not sufficient to get this right. The following table 711 * value of xfs_iflush is not sufficient to get this right. The following table
703 * lists the inode states and the reclaim actions necessary for non-blocking 712 * lists the inode states and the reclaim actions necessary for non-blocking
@@ -755,23 +764,6 @@ xfs_reclaim_inode(
755{ 764{
756 int error = 0; 765 int error = 0;
757 766
758 /*
759 * The radix tree lock here protects a thread in xfs_iget from racing
760 * with us starting reclaim on the inode. Once we have the
761 * XFS_IRECLAIM flag set it will not touch us.
762 */
763 spin_lock(&ip->i_flags_lock);
764 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
765 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
766 /* ignore as it is already under reclaim */
767 spin_unlock(&ip->i_flags_lock);
768 write_unlock(&pag->pag_ici_lock);
769 return 0;
770 }
771 __xfs_iflags_set(ip, XFS_IRECLAIM);
772 spin_unlock(&ip->i_flags_lock);
773 write_unlock(&pag->pag_ici_lock);
774
775 xfs_ilock(ip, XFS_ILOCK_EXCL); 767 xfs_ilock(ip, XFS_ILOCK_EXCL);
776 if (!xfs_iflock_nowait(ip)) { 768 if (!xfs_iflock_nowait(ip)) {
777 if (!(sync_mode & SYNC_WAIT)) 769 if (!(sync_mode & SYNC_WAIT))
@@ -842,12 +834,12 @@ reclaim:
842 * added to the tree assert that it's been there before to catch 834 * added to the tree assert that it's been there before to catch
843 * problems with the inode life time early on. 835 * problems with the inode life time early on.
844 */ 836 */
845 write_lock(&pag->pag_ici_lock); 837 spin_lock(&pag->pag_ici_lock);
846 if (!radix_tree_delete(&pag->pag_ici_root, 838 if (!radix_tree_delete(&pag->pag_ici_root,
847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
848 ASSERT(0); 840 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip); 841 __xfs_inode_clear_reclaim(pag, ip);
850 write_unlock(&pag->pag_ici_lock); 842 spin_unlock(&pag->pag_ici_lock);
851 843
852 /* 844 /*
853 * Here we do an (almost) spurious inode lock in order to coordinate 845 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -868,13 +860,137 @@ reclaim:
868 860
869} 861}
870 862
863/*
864 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
865 * corrupted, we still want to try to reclaim all the inodes. If we don't,
866 * then a shutdown during the unmount-time reclaim walk would leak all the
867 * unreclaimed inodes.
868 */
869int
870xfs_reclaim_inodes_ag(
871 struct xfs_mount *mp,
872 int flags,
873 int *nr_to_scan)
874{
875 struct xfs_perag *pag;
876 int error = 0;
877 int last_error = 0;
878 xfs_agnumber_t ag;
879 int trylock = flags & SYNC_TRYLOCK;
880 int skipped;
881
882restart:
883 ag = 0;
884 skipped = 0;
885 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
886 unsigned long first_index = 0;
887 int done = 0;
888 int nr_found = 0;
889
890 ag = pag->pag_agno + 1;
891
892 if (trylock) {
893 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
894 skipped++;
895 xfs_perag_put(pag);
896 continue;
897 }
898 first_index = pag->pag_ici_reclaim_cursor;
899 } else
900 mutex_lock(&pag->pag_ici_reclaim_lock);
901
902 do {
903 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
904 int i;
905
906 rcu_read_lock();
907 nr_found = radix_tree_gang_lookup_tag(
908 &pag->pag_ici_root,
909 (void **)batch, first_index,
910 XFS_LOOKUP_BATCH,
911 XFS_ICI_RECLAIM_TAG);
912 if (!nr_found) {
913 rcu_read_unlock();
914 break;
915 }
916
917 /*
918 * Grab the inodes before we drop the lock. If we found
919 * nothing, nr_found == 0 and the loop will be skipped.
920 */
921 for (i = 0; i < nr_found; i++) {
922 struct xfs_inode *ip = batch[i];
923
924 if (done || xfs_reclaim_inode_grab(ip, flags))
925 batch[i] = NULL;
926
927 /*
928 * Update the index for the next lookup. Catch
929 * overflows into the next AG range which can
930 * occur if we have inodes in the last block of
931 * the AG and we are currently pointing to the
932 * last inode.
933 *
934 * Because we may see inodes that are from the
935 * wrong AG due to RCU freeing and
936 * reallocation, only update the index if it
937 * lies in this AG. It was a race that led us
938 * to see this inode, so another lookup from
939 * the same index will not find it again.
940 */
941 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
942 pag->pag_agno)
943 continue;
944 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
945 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
946 done = 1;
947 }
948
949 /* unlock now that we've grabbed the inodes. */
950 rcu_read_unlock();
951
952 for (i = 0; i < nr_found; i++) {
953 if (!batch[i])
954 continue;
955 error = xfs_reclaim_inode(batch[i], pag, flags);
956 if (error && last_error != EFSCORRUPTED)
957 last_error = error;
958 }
959
960 *nr_to_scan -= XFS_LOOKUP_BATCH;
961
962 } while (nr_found && !done && *nr_to_scan > 0);
963
964 if (trylock && !done)
965 pag->pag_ici_reclaim_cursor = first_index;
966 else
967 pag->pag_ici_reclaim_cursor = 0;
968 mutex_unlock(&pag->pag_ici_reclaim_lock);
969 xfs_perag_put(pag);
970 }
971
972 /*
973 * If we skipped any AG and we still have scan count remaining, do
974 * another pass, this time using blocking reclaim semantics (i.e.
975 * waiting on the reclaim locks and ignoring the reclaim cursors). This
976 * ensures that when we have more reclaimers than AGs we block rather
977 * than spin trying to execute reclaim.
978 */
979 if (trylock && skipped && *nr_to_scan > 0) {
980 trylock = 0;
981 goto restart;
982 }
983 return XFS_ERROR(last_error);
984}
985
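The restart logic at the bottom of xfs_reclaim_inodes_ag() degrades from trylock to blocking semantics once a non-blocking pass has skipped AGs while scan budget remains. Stripped of the per-AG details, the control flow is roughly (struct mount and walk_ags() are hypothetical stand-ins, not the kernel's):

    struct mount;

    /* walk every AG once; returns the number of AGs skipped on this pass */
    int walk_ags(struct mount *mp, int trylock, int *nr_to_scan);

    static void reclaim_two_pass(struct mount *mp, int *nr_to_scan)
    {
            int trylock = 1;        /* pass 1: skip contended AGs */
            int skipped;

    restart:
            skipped = walk_ags(mp, trylock, nr_to_scan);
            if (trylock && skipped && *nr_to_scan > 0) {
                    trylock = 0;    /* pass 2: block on the AG locks */
                    goto restart;
            }
    }
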
871int 986int
872xfs_reclaim_inodes( 987xfs_reclaim_inodes(
873 xfs_mount_t *mp, 988 xfs_mount_t *mp,
874 int mode) 989 int mode)
875{ 990{
876 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 991 int nr_to_scan = INT_MAX;
877 XFS_ICI_RECLAIM_TAG, 1, NULL); 992
993 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
878} 994}
879 995
880/* 996/*
@@ -896,17 +1012,16 @@ xfs_reclaim_inode_shrink(
896 if (!(gfp_mask & __GFP_FS)) 1012 if (!(gfp_mask & __GFP_FS))
897 return -1; 1013 return -1;
898 1014
899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 1015 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 1016 /* terminate if we don't exhaust the scan */
901 /* if we don't exhaust the scan, don't bother coming back */
902 if (nr_to_scan > 0) 1017 if (nr_to_scan > 0)
903 return -1; 1018 return -1;
904 } 1019 }
905 1020
906 reclaimable = 0; 1021 reclaimable = 0;
907 ag = 0; 1022 ag = 0;
908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 1023 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
909 XFS_ICI_RECLAIM_TAG))) { 1024 ag = pag->pag_agno + 1;
910 reclaimable += pag->pag_ici_reclaimable; 1025 reclaimable += pag->pag_ici_reclaimable;
911 xfs_perag_put(pag); 1026 xfs_perag_put(pag);
912 } 1027 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..32ba6628290c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
48 struct xfs_inode *ip); 48 struct xfs_inode *ip);
49 49
50int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 50int xfs_sync_inode_grab(struct xfs_inode *ip);
51int xfs_inode_ag_iterator(struct xfs_mount *mp, 51int xfs_inode_ag_iterator(struct xfs_mount *mp,
52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
53 int flags, int tag, int write_lock, int *nr_to_scan); 53 int flags);
54 54
55void xfs_inode_shrinker_register(struct xfs_mount *mp); 55void xfs_inode_shrinker_register(struct xfs_mount *mp);
56void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 56void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
332DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
333DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
334DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
335DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
336DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
@@ -767,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
767 __field(int, curr_res) 766 __field(int, curr_res)
768 __field(int, unit_res) 767 __field(int, unit_res)
769 __field(unsigned int, flags) 768 __field(unsigned int, flags)
770 __field(void *, reserve_headq) 769 __field(int, reserveq)
771 __field(void *, write_headq) 770 __field(int, writeq)
772 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
773 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
774 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -785,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
785 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
786 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
787 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
788 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
789 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
790 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
791 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
792 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
793 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
794 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
795 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
796 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
797 ), 798 ),
798 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
799 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
800 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
801 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
802 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
803 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -808,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
808 __entry->curr_res, 809 __entry->curr_res,
809 __entry->unit_res, 810 __entry->unit_res,
810 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
811 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
812 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
813 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
814 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
815 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -836,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
838DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -843,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
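The loggrant events above now read each grant head with xlog_crack_grant_head() instead of separate cycle/bytes fields. A sketch of the presumed packing — cycle in the upper 32 bits, byte offset in the lower 32 — consistent with the head being a single atomic64 that TP_fast_assign splits into two ints (this layout is an assumption about the helper, not a quotation of it):

    #include <stdint.h>

    static void crack_grant_head(int64_t head, int *cycle, int *bytes)
    {
            *cycle = (int)(head >> 32);             /* high word: log cycle */
            *bytes = (int)(head & 0xffffffff);      /* low word: byte offset */
    }
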
@@ -936,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
936DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
937DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
938 941
939DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
940 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
941 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
942 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
943 TP_STRUCT__entry( 946 TP_STRUCT__entry(
944 __field(dev_t, dev) 947 __field(dev_t, dev)
945 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -947,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
947 __field(loff_t, new_size) 950 __field(loff_t, new_size)
948 __field(loff_t, offset) 951 __field(loff_t, offset)
949 __field(size_t, count) 952 __field(size_t, count)
950 __field(int, flags) 953 __field(int, type)
951 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
952 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
953 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -959,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
959 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
960 __entry->offset = offset; 963 __entry->offset = offset;
961 __entry->count = count; 964 __entry->count = count;
962 __entry->flags = flags; 965 __entry->type = type;
963 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
964 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
965 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
966 ), 969 ),
967 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
968 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
969 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
970 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
971 __entry->ino, 974 __entry->ino,
@@ -973,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
973 __entry->new_size, 976 __entry->new_size,
974 __entry->offset, 977 __entry->offset,
975 __entry->count, 978 __entry->count,
976 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
977 __entry->startoff, 980 __entry->startoff,
978 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
979 __entry->blockcount) 982 __entry->blockcount)
980) 983)
981 984
982#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
983DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
984 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
985 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
986 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
987DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
988DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
989DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
990 994
991DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
992 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1023,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1023 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1024DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1025DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1026 1031
1027 1032
1028TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1421,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1421 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1422 TP_ARGS(args)) 1427 TP_ARGS(args))
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1424DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1426DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1753,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1754DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1755 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1756#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1757 1796
1758#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..d22aa3103106 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
@@ -463,87 +462,68 @@ xfs_qm_dqtobp(
463 uint flags) 462 uint flags)
464{ 463{
465 xfs_bmbt_irec_t map; 464 xfs_bmbt_irec_t map;
466 int nmaps, error; 465 int nmaps = 1, error;
467 xfs_buf_t *bp; 466 xfs_buf_t *bp;
468 xfs_inode_t *quotip; 467 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
469 xfs_mount_t *mp; 468 xfs_mount_t *mp = dqp->q_mount;
470 xfs_disk_dquot_t *ddq; 469 xfs_disk_dquot_t *ddq;
471 xfs_dqid_t id; 470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
472 boolean_t newdquot;
473 xfs_trans_t *tp = (tpp ? *tpp : NULL); 471 xfs_trans_t *tp = (tpp ? *tpp : NULL);
474 472
475 mp = dqp->q_mount; 473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
476 id = be32_to_cpu(dqp->q_core.d_id);
477 nmaps = 1;
478 newdquot = B_FALSE;
479 474
480 /* 475 xfs_ilock(quotip, XFS_ILOCK_SHARED);
481 * If we don't know where the dquot lives, find out. 476 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
482 */
483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
484 /* We use the id as an index */
485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
487 nmaps = 1;
488 quotip = XFS_DQ_TO_QIP(dqp);
489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
490 /* 477 /*
491 * Return if this type of quotas is turned off while we didn't 478 * Return if this type of quota was turned off while we
492 * have an inode lock 479 * didn't have the quota inode lock.
493 */ 480 */
494 if (XFS_IS_THIS_QUOTA_OFF(dqp)) { 481 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
495 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 482 return ESRCH;
496 return (ESRCH); 483 }
497 } 484
485 /*
486 * Find the block map; no allocations yet
487 */
488 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
489 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
490 NULL, 0, &map, &nmaps, NULL);
491
492 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
493 if (error)
494 return error;
495
496 ASSERT(nmaps == 1);
497 ASSERT(map.br_blockcount == 1);
498
499 /*
500 * Offset of the dquot in the (fixed-size) dquot chunk.
501 */
502 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
503 sizeof(xfs_dqblk_t);
504
505 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
506 if (map.br_startblock == HOLESTARTBLOCK) {
498 /* 507 /*
499 * Find the block map; no allocations yet 508 * We don't allocate unless we're asked to
500 */ 509 */
501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 510 if (!(flags & XFS_QMOPT_DQALLOC))
502 XFS_DQUOT_CLUSTER_SIZE_FSB, 511 return ENOENT;
503 XFS_BMAPI_METADATA,
504 NULL, 0, &map, &nmaps, NULL);
505 512
506 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 513 ASSERT(tp);
514 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
515 dqp->q_fileoffset, &bp);
507 if (error) 516 if (error)
508 return (error); 517 return error;
509 ASSERT(nmaps == 1); 518 tp = *tpp;
510 ASSERT(map.br_blockcount == 1); 519 } else {
520 trace_xfs_dqtobp_read(dqp);
511 521
512 /* 522 /*
513 * offset of dquot in the (fixed sized) dquot chunk. 523 * store the blkno etc so that we don't have to do the
524 * mapping all the time
514 */ 525 */
515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 526 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
516 sizeof(xfs_dqblk_t);
517 if (map.br_startblock == HOLESTARTBLOCK) {
518 /*
519 * We don't allocate unless we're asked to
520 */
521 if (!(flags & XFS_QMOPT_DQALLOC))
522 return (ENOENT);
523
524 ASSERT(tp);
525 if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
526 dqp->q_fileoffset, &bp)))
527 return (error);
528 tp = *tpp;
529 newdquot = B_TRUE;
530 } else {
531 /*
532 * store the blkno etc so that we don't have to do the
533 * mapping all the time
534 */
535 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
536 }
537 }
538 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
539 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
540
541 /*
542 * Read in the buffer, unless we've just done the allocation
543 * (in which case we already have the buf).
544 */
545 if (!newdquot) {
546 trace_xfs_dqtobp_read(dqp);
547 527
548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 528 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
549 dqp->q_blkno, 529 dqp->q_blkno,
@@ -552,13 +532,14 @@ xfs_qm_dqtobp(
552 if (error || !bp) 532 if (error || !bp)
553 return XFS_ERROR(error); 533 return XFS_ERROR(error);
554 } 534 }
535
555 ASSERT(XFS_BUF_ISBUSY(bp)); 536 ASSERT(XFS_BUF_ISBUSY(bp));
556 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 537 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
557 538
558 /* 539 /*
559 * calculate the location of the dquot inside the buffer. 540 * calculate the location of the dquot inside the buffer.
560 */ 541 */
561 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); 542 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
562 543
563 /* 544 /*
564 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
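The rewritten xfs_qm_dqtobp() derives the dquot's location purely from its id: the quota-file block is id / qi_dqperchunk, and the byte offset within that chunk is (id % qi_dqperchunk) * sizeof(xfs_dqblk_t). Schematically, with the per-mount values passed in as parameters (this helper is illustrative, not kernel code):

    #include <stddef.h>
    #include <stdint.h>

    static void dquot_location(uint32_t id, uint32_t dqperchunk,
                               size_t dqblk_size,
                               uint64_t *fileoffset, size_t *bufoffset)
    {
            *fileoffset = id / dqperchunk;                  /* chunk index */
            *bufoffset = (id % dqperchunk) * dqblk_size;    /* offset in chunk */
    }

For example, assuming a dqperchunk of 30, id 65 lands in chunk 2 as record 5 (zero-based) within that chunk.
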
@@ -1176,18 +1157,18 @@ xfs_qm_dqflush(
1176 xfs_dquot_t *dqp, 1157 xfs_dquot_t *dqp,
1177 uint flags) 1158 uint flags)
1178{ 1159{
1179 xfs_mount_t *mp; 1160 struct xfs_mount *mp = dqp->q_mount;
1180 xfs_buf_t *bp; 1161 struct xfs_buf *bp;
1181 xfs_disk_dquot_t *ddqp; 1162 struct xfs_disk_dquot *ddqp;
1182 int error; 1163 int error;
1183 1164
1184 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1165 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1185 ASSERT(!completion_done(&dqp->q_flush)); 1166 ASSERT(!completion_done(&dqp->q_flush));
1167
1186 trace_xfs_dqflush(dqp); 1168 trace_xfs_dqflush(dqp);
1187 1169
1188 /* 1170 /*
1189 * If not dirty, or it's pinned and we are not supposed to 1171 * If not dirty, or it's pinned and we are not supposed to block, nada.
1190 * block, nada.
1191 */ 1172 */
1192 if (!XFS_DQ_IS_DIRTY(dqp) || 1173 if (!XFS_DQ_IS_DIRTY(dqp) ||
1193 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { 1174 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1182,46 @@ xfs_qm_dqflush(
1201 * down forcibly. If that's the case we must not write this dquot 1182 * down forcibly. If that's the case we must not write this dquot
1202 * to disk, because the log record didn't make it to disk! 1183 * to disk, because the log record didn't make it to disk!
1203 */ 1184 */
1204 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { 1185 if (XFS_FORCED_SHUTDOWN(mp)) {
1205 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1186 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1206 xfs_dqfunlock(dqp); 1187 xfs_dqfunlock(dqp);
1207 return XFS_ERROR(EIO); 1188 return XFS_ERROR(EIO);
1208 } 1189 }
1209 1190
1210 /* 1191 /*
1211 * Get the buffer containing the on-disk dquot 1192 * Get the buffer containing the on-disk dquot
1212 * We don't need a transaction envelope because we know that the
1213 * ondisk-dquot has already been allocated for.
1214 */ 1193 */
1215 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1194 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1195 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1196 if (error) {
1216 ASSERT(error != ENOENT); 1197 ASSERT(error != ENOENT);
1217 /*
1218 * Quotas could have gotten turned off (ESRCH)
1219 */
1220 xfs_dqfunlock(dqp); 1198 xfs_dqfunlock(dqp);
1221 return (error); 1199 return error;
1222 } 1200 }
1223 1201
1224 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 1202 /*
1225 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1203 * Calculate the location of the dquot inside the buffer.
1226 xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); 1204 */
1205 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
1206
1207 /*
1208 * A simple sanity check in case we got a corrupted dquot...
1209 */
1210 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1211 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
1212 xfs_buf_relse(bp);
1213 xfs_dqfunlock(dqp);
1214 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1227 return XFS_ERROR(EIO); 1215 return XFS_ERROR(EIO);
1228 } 1216 }
1229 1217
1230 /* This is the only portion of data that needs to persist */ 1218 /* This is the only portion of data that needs to persist */
1231 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); 1219 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1232 1220
1233 /* 1221 /*
1234 * Clear the dirty field and remember the flush lsn for later use. 1222 * Clear the dirty field and remember the flush lsn for later use.
1235 */ 1223 */
1236 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1224 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1237 mp = dqp->q_mount;
1238 1225
1239 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1226 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1240 &dqp->q_logitem.qli_item.li_lsn); 1227 &dqp->q_logitem.qli_item.li_lsn);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..206a2815ced6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,8 +55,6 @@ uint ndquot;
55kmem_zone_t *qm_dqzone; 55kmem_zone_t *qm_dqzone;
56kmem_zone_t *qm_dqtrxzone; 56kmem_zone_t *qm_dqtrxzone;
57 57
58static cred_t xfs_zerocr;
59
60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 58STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 59STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
62 60
@@ -837,7 +835,7 @@ xfs_qm_dqattach_locked(
837 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 835 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
838 flags & XFS_QMOPT_DQALLOC, 836 flags & XFS_QMOPT_DQALLOC,
839 ip->i_udquot, &ip->i_gdquot) : 837 ip->i_udquot, &ip->i_gdquot) :
840 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 838 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
841 flags & XFS_QMOPT_DQALLOC, 839 flags & XFS_QMOPT_DQALLOC,
842 ip->i_udquot, &ip->i_gdquot); 840 ip->i_udquot, &ip->i_gdquot);
843 /* 841 /*
@@ -1199,87 +1197,6 @@ xfs_qm_list_destroy(
1199 mutex_destroy(&(list->qh_lock)); 1197 mutex_destroy(&(list->qh_lock));
1200} 1198}
1201 1199
1202
1203/*
1204 * Stripped down version of dqattach. This doesn't attach, or even look at the
1205 * dquots attached to the inode. The rationale is that there won't be any
1206 * attached at the time this is called from quotacheck.
1207 */
1208STATIC int
1209xfs_qm_dqget_noattach(
1210 xfs_inode_t *ip,
1211 xfs_dquot_t **O_udqpp,
1212 xfs_dquot_t **O_gdqpp)
1213{
1214 int error;
1215 xfs_mount_t *mp;
1216 xfs_dquot_t *udqp, *gdqp;
1217
1218 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1219 mp = ip->i_mount;
1220 udqp = NULL;
1221 gdqp = NULL;
1222
1223 if (XFS_IS_UQUOTA_ON(mp)) {
1224 ASSERT(ip->i_udquot == NULL);
1225 /*
1226 * We want the dquot allocated if it doesn't exist.
1227 */
1228 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1229 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1230 &udqp))) {
1231 /*
1232 * Shouldn't be able to turn off quotas here.
1233 */
1234 ASSERT(error != ESRCH);
1235 ASSERT(error != ENOENT);
1236 return error;
1237 }
1238 ASSERT(udqp);
1239 }
1240
1241 if (XFS_IS_OQUOTA_ON(mp)) {
1242 ASSERT(ip->i_gdquot == NULL);
1243 if (udqp)
1244 xfs_dqunlock(udqp);
1245 error = XFS_IS_GQUOTA_ON(mp) ?
1246 xfs_qm_dqget(mp, ip,
1247 ip->i_d.di_gid, XFS_DQ_GROUP,
1248 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1249 &gdqp) :
1250 xfs_qm_dqget(mp, ip,
1251 ip->i_d.di_projid, XFS_DQ_PROJ,
1252 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1253 &gdqp);
1254 if (error) {
1255 if (udqp)
1256 xfs_qm_dqrele(udqp);
1257 ASSERT(error != ESRCH);
1258 ASSERT(error != ENOENT);
1259 return error;
1260 }
1261 ASSERT(gdqp);
1262
1263 /* Reacquire the locks in the right order */
1264 if (udqp) {
1265 if (! xfs_qm_dqlock_nowait(udqp)) {
1266 xfs_dqunlock(gdqp);
1267 xfs_dqlock(udqp);
1268 xfs_dqlock(gdqp);
1269 }
1270 }
1271 }
1272
1273 *O_udqpp = udqp;
1274 *O_gdqpp = gdqp;
1275
1276#ifdef QUOTADEBUG
1277 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1278 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1279#endif
1280 return 0;
1281}
1282
1283/* 1200/*
1284 * Create an inode and return with a reference already taken, but unlocked 1201 * Create an inode and return with a reference already taken, but unlocked
1285 * This is how we create quota inodes 1202 * This is how we create quota inodes
@@ -1305,8 +1222,8 @@ xfs_qm_qino_alloc(
1305 return error; 1222 return error;
1306 } 1223 }
1307 1224
1308 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 1225 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
1309 &xfs_zerocr, 0, 1, ip, &committed))) { 1226 if (error) {
1310 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1227 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1311 XFS_TRANS_ABORT); 1228 XFS_TRANS_ABORT);
1312 return error; 1229 return error;
@@ -1516,7 +1433,7 @@ xfs_qm_dqiterate(
1516 rablkcnt = map[i+1].br_blockcount; 1433 rablkcnt = map[i+1].br_blockcount;
1517 rablkno = map[i+1].br_startblock; 1434 rablkno = map[i+1].br_startblock;
1518 while (rablkcnt--) { 1435 while (rablkcnt--) {
1519 xfs_baread(mp->m_ddev_targp, 1436 xfs_buf_readahead(mp->m_ddev_targp,
1520 XFS_FSB_TO_DADDR(mp, rablkno), 1437 XFS_FSB_TO_DADDR(mp, rablkno),
1521 mp->m_quotainfo->qi_dqchunklen); 1438 mp->m_quotainfo->qi_dqchunklen);
1522 rablkno++; 1439 rablkno++;
@@ -1546,18 +1463,34 @@ xfs_qm_dqiterate(
1546 1463
1547/* 1464/*
1548 * Called by dqusage_adjust in doing a quotacheck. 1465 * Called by dqusage_adjust in doing a quotacheck.
1549 * Given the inode, and a dquot (either USR or GRP, doesn't matter), 1466 *
1550 * this updates its incore copy as well as the buffer copy. This is 1467 * Given the inode and a dquot id, this updates both the incore dquot as well
1551 * so that once the quotacheck is done, we can just log all the buffers, 1468 * as the buffer copy. This is so that once the quotacheck is done, we can
1552 * as opposed to logging numerous updates to individual dquots. 1469 * just log all the buffers, as opposed to logging numerous updates to
1470 * individual dquots.
1553 */ 1471 */
1554STATIC void 1472STATIC int
1555xfs_qm_quotacheck_dqadjust( 1473xfs_qm_quotacheck_dqadjust(
1556 xfs_dquot_t *dqp, 1474 struct xfs_inode *ip,
1475 xfs_dqid_t id,
1476 uint type,
1557 xfs_qcnt_t nblks, 1477 xfs_qcnt_t nblks,
1558 xfs_qcnt_t rtblks) 1478 xfs_qcnt_t rtblks)
1559{ 1479{
1560 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1480 struct xfs_mount *mp = ip->i_mount;
1481 struct xfs_dquot *dqp;
1482 int error;
1483
1484 error = xfs_qm_dqget(mp, ip, id, type,
1485 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1486 if (error) {
1487 /*
1488 * Shouldn't be able to turn off quotas here.
1489 */
1490 ASSERT(error != ESRCH);
1491 ASSERT(error != ENOENT);
1492 return error;
1493 }
1561 1494
1562 trace_xfs_dqadjust(dqp); 1495 trace_xfs_dqadjust(dqp);
1563 1496
@@ -1582,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
1582 * There are no timers for the default values set in the root dquot. 1515 * There are no timers for the default values set in the root dquot.
1583 */ 1516 */
1584 if (dqp->q_core.d_id) { 1517 if (dqp->q_core.d_id) {
1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1518 xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1519 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1587 } 1520 }
1588 1521
1589 dqp->dq_flags |= XFS_DQ_DIRTY; 1522 dqp->dq_flags |= XFS_DQ_DIRTY;
1523 xfs_qm_dqput(dqp);
1524 return 0;
1590} 1525}
1591 1526
1592STATIC int 1527STATIC int
@@ -1629,8 +1564,7 @@ xfs_qm_dqusage_adjust(
1629 int *res) /* result code value */ 1564 int *res) /* result code value */
1630{ 1565{
1631 xfs_inode_t *ip; 1566 xfs_inode_t *ip;
1632 xfs_dquot_t *udqp, *gdqp; 1567 xfs_qcnt_t nblks, rtblks = 0;
1633 xfs_qcnt_t nblks, rtblks;
1634 int error; 1568 int error;
1635 1569
1636 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1570 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1584,24 @@ xfs_qm_dqusage_adjust(
1650 * the case in all other instances. It's OK that we do this because 1584 * the case in all other instances. It's OK that we do this because
1651 * quotacheck is done only at mount time. 1585 * quotacheck is done only at mount time.
1652 */ 1586 */
1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { 1587 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1588 if (error) {
1654 *res = BULKSTAT_RV_NOTHING; 1589 *res = BULKSTAT_RV_NOTHING;
1655 return error; 1590 return error;
1656 } 1591 }
1657 1592
1658 /* 1593 ASSERT(ip->i_delayed_blks == 0);
1659 * Obtain the locked dquots. In case of an error (eg. allocation
1660 * fails for ENOSPC), we return the negative of the error number
1661 * to bulkstat, so that it can get propagated to quotacheck() and
1662 * making us disable quotas for the file system.
1663 */
1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1667 *res = BULKSTAT_RV_GIVEUP;
1668 return error;
1669 }
1670 1594
1671 rtblks = 0; 1595 if (XFS_IS_REALTIME_INODE(ip)) {
1672 if (! XFS_IS_REALTIME_INODE(ip)) {
1673 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1674 } else {
1675 /* 1596 /*
1676 * Walk thru the extent list and count the realtime blocks. 1597 * Walk thru the extent list and count the realtime blocks.
1677 */ 1598 */
1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1599 error = xfs_qm_get_rtblks(ip, &rtblks);
1679 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1600 if (error)
1680 IRELE(ip); 1601 goto error0;
1681 if (udqp)
1682 xfs_qm_dqput(udqp);
1683 if (gdqp)
1684 xfs_qm_dqput(gdqp);
1685 *res = BULKSTAT_RV_GIVEUP;
1686 return error;
1687 }
1688 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1689 } 1602 }
1690 ASSERT(ip->i_delayed_blks == 0);
1691 1603
1692 /* 1604 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1693 * We can't release the inode while holding its dquot locks.
1694 * The inode can go into inactive and might try to acquire the dquotlocks.
1695 * So, just unlock here and do a vn_rele at the end.
1696 */
1697 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1698 1605
1699 /* 1606 /*
1700 * Add the (disk blocks and inode) resources occupied by this 1607 * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1616,36 @@ xfs_qm_dqusage_adjust(
1709 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1616 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1710 */ 1617 */
1711 if (XFS_IS_UQUOTA_ON(mp)) { 1618 if (XFS_IS_UQUOTA_ON(mp)) {
1712 ASSERT(udqp); 1619 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1713 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); 1620 XFS_DQ_USER, nblks, rtblks);
1714 xfs_qm_dqput(udqp); 1621 if (error)
1622 goto error0;
1715 } 1623 }
1716 if (XFS_IS_OQUOTA_ON(mp)) { 1624
1717 ASSERT(gdqp); 1625 if (XFS_IS_GQUOTA_ON(mp)) {
1718 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); 1626 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1719 xfs_qm_dqput(gdqp); 1627 XFS_DQ_GROUP, nblks, rtblks);
1628 if (error)
1629 goto error0;
1720 } 1630 }
1721 /*
1722 * Now release the inode. This will send it to 'inactive', and
1723 * possibly even free blocks.
1724 */
1725 IRELE(ip);
1726 1631
1727 /* 1632 if (XFS_IS_PQUOTA_ON(mp)) {
1728 * Goto next inode. 1633 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1729 */ 1634 XFS_DQ_PROJ, nblks, rtblks);
1635 if (error)
1636 goto error0;
1637 }
1638
1639 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1640 IRELE(ip);
1730 *res = BULKSTAT_RV_DIDONE; 1641 *res = BULKSTAT_RV_DIDONE;
1731 return 0; 1642 return 0;
1643
1644error0:
1645 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1646 IRELE(ip);
1647 *res = BULKSTAT_RV_GIVEUP;
1648 return error;
1732} 1649}
1733 1650
1734/* 1651/*
@@ -1946,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
1946 xfs_dquot_t *dqpout; 1863 xfs_dquot_t *dqpout;
1947 xfs_dquot_t *dqp; 1864 xfs_dquot_t *dqp;
1948 int restarts; 1865 int restarts;
1866 int startagain;
1949 1867
1950 restarts = 0; 1868 restarts = 0;
1951 dqpout = NULL; 1869 dqpout = NULL;
1952 1870
1953 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1871 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954startagain: 1872again:
1873 startagain = 0;
1955 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1874 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 1875
1957 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1876 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1968,13 +1887,10 @@ startagain:
1968 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1887 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1969 1888
1970 trace_xfs_dqreclaim_want(dqp); 1889 trace_xfs_dqreclaim_want(dqp);
1971
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1890 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1977 goto startagain; 1891 restarts++;
1892 startagain = 1;
1893 goto dqunlock;
1978 } 1894 }
1979 1895
1980 /* 1896 /*
@@ -1989,23 +1905,20 @@ startagain:
1989 ASSERT(list_empty(&dqp->q_mplist)); 1905 ASSERT(list_empty(&dqp->q_mplist));
1990 list_del_init(&dqp->q_freelist); 1906 list_del_init(&dqp->q_freelist);
1991 xfs_Gqm->qm_dqfrlist_cnt--; 1907 xfs_Gqm->qm_dqfrlist_cnt--;
1992 xfs_dqunlock(dqp);
1993 dqpout = dqp; 1908 dqpout = dqp;
1994 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1909 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1995 break; 1910 goto dqunlock;
1996 } 1911 }
1997 1912
1998 ASSERT(dqp->q_hash); 1913 ASSERT(dqp->q_hash);
1999 ASSERT(!list_empty(&dqp->q_mplist)); 1914 ASSERT(!list_empty(&dqp->q_mplist));
2000 1915
2001 /* 1916 /*
2002 * Try to grab the flush lock. If this dquot is in the process of 1917 * Try to grab the flush lock. If this dquot is in the process
2003 * getting flushed to disk, we don't want to reclaim it. 1918 * of getting flushed to disk, we don't want to reclaim it.
2004 */ 1919 */
2005 if (!xfs_dqflock_nowait(dqp)) { 1920 if (!xfs_dqflock_nowait(dqp))
2006 xfs_dqunlock(dqp); 1921 goto dqunlock;
2007 continue;
2008 }
2009 1922
2010 /* 1923 /*
2011 * We have the flush lock so we know that this is not in the 1924 * We have the flush lock so we know that this is not in the
@@ -2027,8 +1940,7 @@ startagain:
2027 xfs_fs_cmn_err(CE_WARN, mp, 1940 xfs_fs_cmn_err(CE_WARN, mp,
2028 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2029 } 1942 }
2030 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1943 goto dqunlock;
2031 continue;
2032 } 1944 }
2033 1945
2034 /* 1946 /*
@@ -2050,13 +1962,8 @@ startagain:
2050 */ 1962 */
2051 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1963 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2052 restarts++; 1964 restarts++;
2053 mutex_unlock(&dqp->q_hash->qh_lock); 1965 startagain = 1;
2054 xfs_dqfunlock(dqp); 1966 goto qhunlock;
2055 xfs_dqunlock(dqp);
2056 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2057 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2058 return NULL;
2059 goto startagain;
2060 } 1967 }
2061 1968
2062 ASSERT(dqp->q_nrefs == 0); 1969 ASSERT(dqp->q_nrefs == 0);
@@ -2069,14 +1976,20 @@ startagain:
2069 xfs_Gqm->qm_dqfrlist_cnt--; 1976 xfs_Gqm->qm_dqfrlist_cnt--;
2070 dqpout = dqp; 1977 dqpout = dqp;
2071 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1978 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1979qhunlock:
2072 mutex_unlock(&dqp->q_hash->qh_lock); 1980 mutex_unlock(&dqp->q_hash->qh_lock);
2073dqfunlock: 1981dqfunlock:
2074 xfs_dqfunlock(dqp); 1982 xfs_dqfunlock(dqp);
1983dqunlock:
2075 xfs_dqunlock(dqp); 1984 xfs_dqunlock(dqp);
2076 if (dqpout) 1985 if (dqpout)
2077 break; 1986 break;
2078 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1987 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2079 return NULL; 1988 break;
1989 if (startagain) {
1990 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1991 goto again;
1992 }
2080 } 1993 }
2081 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1994 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2082 return dqpout; 1995 return dqpout;
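The reclaim hunks above collapse four duplicated unlock sequences into a single exit path: the qhunlock, dqfunlock and dqunlock labels release locks in reverse acquisition order, and the restart decision moves after the unlocks. A minimal userspace sketch of that stacked-label pattern, with illustrative lock and function names (not the XFS ones):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t item_lock = PTHREAD_MUTEX_INITIALIZER;

int reclaim_one(bool need_hash, bool want_item)
{
	int ret = -1;

	pthread_mutex_lock(&list_lock);
	pthread_mutex_lock(&item_lock);

	if (!want_item)
		goto out_item;		/* unwind from here down */

	if (need_hash) {
		pthread_mutex_lock(&hash_lock);
		ret = 0;		/* claimed the item */
		goto out_hash;		/* unwind all three locks */
	}

	ret = 0;
	goto out_item;

out_hash:
	pthread_mutex_unlock(&hash_lock);
out_item:
	pthread_mutex_unlock(&item_lock);
	pthread_mutex_unlock(&list_lock);
	return ret;
}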
@@ -2224,7 +2137,7 @@ xfs_qm_write_sb_changes(
2224 2137
2225 2138
2226/* 2139/*
2227 * Given an inode, a uid and gid (from cred_t) make sure that we have 2140 * Given an inode, a uid, gid and prid make sure that we have
2228 * allocated relevant dquot(s) on disk, and that we won't exceed inode 2141 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2229 * quotas by creating this file. 2142 * quotas by creating this file.
2230 * This also attaches dquot(s) to the given inode after locking it, 2143 * This also attaches dquot(s) to the given inode after locking it,
@@ -2332,7 +2245,7 @@ xfs_qm_vop_dqalloc(
2332 xfs_dqunlock(gq); 2245 xfs_dqunlock(gq);
2333 } 2246 }
2334 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 2247 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
2335 if (ip->i_d.di_projid != prid) { 2248 if (xfs_get_projid(ip) != prid) {
2336 xfs_iunlock(ip, lockflags); 2249 xfs_iunlock(ip, lockflags);
2337 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 2250 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
2338 XFS_DQ_PROJ, 2251 XFS_DQ_PROJ,
@@ -2454,7 +2367,7 @@ xfs_qm_vop_chown_reserve(
2454 } 2367 }
2455 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 2368 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
2456 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 2369 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
2457 ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) 2370 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
2458 prjflags = XFS_QMOPT_ENOSPC; 2371 prjflags = XFS_QMOPT_ENOSPC;
2459 2372
2460 if (prjflags || 2373 if (prjflags ||
@@ -2558,7 +2471,7 @@ xfs_qm_vop_create_dqattach(
2558 ip->i_gdquot = gdqp; 2471 ip->i_gdquot = gdqp;
2559 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2472 ASSERT(XFS_IS_OQUOTA_ON(mp));
2560 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2473 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2561 ip->i_d.di_gid : ip->i_d.di_projid) == 2474 ip->i_d.di_gid : xfs_get_projid(ip)) ==
2562 be32_to_cpu(gdqp->q_core.d_id)); 2475 be32_to_cpu(gdqp->q_core.d_id));
2563 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2476 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2564 } 2477 }
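Several hunks in this file replace direct ip->i_d.di_projid reads with xfs_get_projid(ip). The accessor reassembles a 32-bit project ID from two 16-bit on-disk halves, so callers stop depending on the raw field layout. A sketch of the idea, with illustrative struct and field names:

#include <stdint.h>

struct demo_icdinode {
	uint16_t	di_projid_hi;	/* high 16 bits of project id */
	uint16_t	di_projid_lo;	/* low 16 bits of project id */
};

static inline uint32_t demo_get_projid(const struct demo_icdinode *d)
{
	return ((uint32_t)d->di_projid_hi << 16) | d->di_projid_lo;
}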
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..45b5cb1788ab 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
81 xfs_mount_t *mp = ip->i_mount; 81 xfs_mount_t *mp = ip->i_mount;
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..bdebc183223e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281 281
282out_unlock: 282out_unlock:
@@ -875,21 +875,14 @@ xfs_dqrele_inode(
875 struct xfs_perag *pag, 875 struct xfs_perag *pag,
876 int flags) 876 int flags)
877{ 877{
878 int error;
879
880 /* skip quota inodes */ 878 /* skip quota inodes */
881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 879 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 880 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
883 ASSERT(ip->i_udquot == NULL); 881 ASSERT(ip->i_udquot == NULL);
884 ASSERT(ip->i_gdquot == NULL); 882 ASSERT(ip->i_gdquot == NULL);
885 read_unlock(&pag->pag_ici_lock);
886 return 0; 883 return 0;
887 } 884 }
888 885
889 error = xfs_sync_inode_valid(ip, pag);
890 if (error)
891 return error;
892
893 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
894 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 887 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
895 xfs_qm_dqrele(ip->i_udquot); 888 xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +893,6 @@ xfs_dqrele_inode(
900 ip->i_gdquot = NULL; 893 ip->i_gdquot = NULL;
901 } 894 }
902 xfs_iunlock(ip, XFS_ILOCK_EXCL); 895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
903
904 IRELE(ip);
905 return 0; 896 return 0;
906} 897}
907 898
@@ -918,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
918 uint flags) 909 uint flags)
919{ 910{
920 ASSERT(mp->m_quotainfo); 911 ASSERT(mp->m_quotainfo);
921 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, 912 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
922 XFS_ICI_NO_TAG, 0, NULL);
923} 913}
924 914
925/*------------------------------------------------------------------------*/ 915/*------------------------------------------------------------------------*/
@@ -1175,7 +1165,7 @@ xfs_qm_internalqcheck_adjust(
1175 } 1165 }
1176 xfs_qm_internalqcheck_get_dquots(mp, 1166 xfs_qm_internalqcheck_get_dquots(mp,
1177 (xfs_dqid_t) ip->i_d.di_uid, 1167 (xfs_dqid_t) ip->i_d.di_uid,
1178 (xfs_dqid_t) ip->i_d.di_projid, 1168 (xfs_dqid_t) xfs_get_projid(ip),
1179 (xfs_dqid_t) ip->i_d.di_gid, 1169 (xfs_dqid_t) ip->i_d.di_gid,
1180 &ud, &gd); 1170 &ud, &gd);
1181 if (XFS_IS_UQUOTA_ON(mp)) { 1171 if (XFS_IS_UQUOTA_ON(mp)) {
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
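The rewritten cmn_err() above forwards the caller's format and arguments to printk() in one call via struct va_format and the kernel's %pV extension, dropping the static buffer and spinlock. A userspace approximation of the same forwarding, using vfprintf() since plain C has no %pV:

#include <stdarg.h>
#include <stdio.h>

static void cmn_err_sketch(const char *lvl, const char *fmt, ...)
{
	va_list args;

	fputs(lvl, stderr);		/* e.g. "<3>" for an error level */
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);	/* single pass, no fixed-size buffer */
	va_end(args);
}

int main(void)
{
	cmn_err_sketch("<4>", "dquot %p flush failed\n", (void *)0);
	return 0;
}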
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
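The prototypes above keep (and extend) the printf-style format attribute, so the compiler type-checks arguments against the format string at every call site. A small sketch of how the attribute is applied and what it buys; index 2 names the format-string parameter, index 3 the first checked argument:

#include <stdarg.h>
#include <stdio.h>

void log_msg(const char *lvl, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

void log_msg(const char *lvl, const char *fmt, ...)
{
	va_list ap;

	(void)lvl;
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
}

/* log_msg("<6>", "%s", 42); -- would now draw a -Wformat warning */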
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
44#ifdef CONFIG_XFS_POSIX_ACL 44#ifdef CONFIG_XFS_POSIX_ACL
45extern int xfs_check_acl(struct inode *inode, int mask); 45extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
48extern int xfs_acl_chmod(struct inode *inode); 48extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,9 +227,18 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
234 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
235
236 /* buffer cache index */
237 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
238 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
239
240 /* for rcu-safe freeing */
241 struct rcu_head rcu_head;
233#endif 242#endif
234 int pagb_count; /* pagb slots in use */ 243 int pagb_count; /* pagb slots in use */
235} xfs_perag_t; 244} xfs_perag_t;
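Among the new fields, the rcu_head lets a perag be freed only after concurrent RCU readers are done with it. A kernel-style sketch of that deferred-free pattern; the struct and helper names here are made up, while call_rcu() and container_of() are the real APIs:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_pag {
	int			agno;
	struct rcu_head		rcu_head;	/* for rcu-safe freeing */
};

static void demo_pag_free_rcu(struct rcu_head *head)
{
	struct demo_pag *pag = container_of(head, struct demo_pag, rcu_head);

	kfree(pag);
}

static void demo_pag_put(struct demo_pag *pag)
{
	/* readers may still hold an rcu_read_lock()ed pointer to pag */
	call_rcu(&pag->rcu_head, demo_pag_free_rcu);
}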
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 573 xfs_extlen_t rlen; /* length of returned extent */
578 574
579 ASSERT(args->alignment == 1); 575 ASSERT(args->alignment == 1);
576
580 /* 577 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 578 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 579 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 580 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 581 args->agno, XFS_BTNUM_BNO);
582
585 /* 583 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 584 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 585 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 586 * if any free block does.
589 */ 587 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 588 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
589 if (error)
591 goto error0; 590 goto error0;
592 if (!i) { 591 if (!i)
593 /* 592 goto not_found;
594 * Didn't find it, return null. 593
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 594 /*
601 * Grab the freespace record. 595 * Grab the freespace record.
602 */ 596 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 597 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
598 if (error)
604 goto error0; 599 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 601 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 602 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 603 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 604 fend = fbno + flen;
605
610 /* 606 /*
611 * Give up if the freespace isn't long enough for the minimum request. 607 * Give up if the freespace isn't long enough for the minimum request.
612 */ 608 */
613 if (fend < minend) { 609 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 610 goto not_found;
615 args->agbno = NULLAGBLOCK; 611
616 return 0;
617 }
618 /* 612 /*
619 * End of extent will be smaller of the freespace end and the 613 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 614 * maximal requested end.
621 */ 615 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 616 * Fix the length according to mod and prod if given.
625 */ 617 */
618 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 619 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 620 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 621 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 622 goto not_found;
630 return 0; 623
631 }
632 rlen = args->len; 624 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 625 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 626 end = args->agbno + rlen;
627
635 /* 628 /*
636 * We are allocating agbno for rlen [agbno .. end] 629 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 630 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 633 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 634 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 635 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 636 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 637 args->len, XFSA_FIXUP_BNO_OK);
638 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 639 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 640 goto error0;
647 } 641 }
642
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 643 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 644 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 645
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 646 args->wasfromfl = 0;
647 trace_xfs_alloc_exact_done(args);
648 return 0;
649
650not_found:
651 /* Didn't find it, return null. */
652 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
653 args->agbno = NULLAGBLOCK;
654 trace_xfs_alloc_exact_notfound(args);
653 return 0; 655 return 0;
654 656
655error0: 657error0:
@@ -659,6 +661,95 @@ error0:
659} 661}
660 662
661/* 663/*
664 * Search the btree in a given direction via the search cursor and compare
665 * the records found against the good extent we've already found.
666 */
667STATIC int
668xfs_alloc_find_best_extent(
669 struct xfs_alloc_arg *args, /* allocation argument structure */
670 struct xfs_btree_cur **gcur, /* good cursor */
671 struct xfs_btree_cur **scur, /* searching cursor */
672 xfs_agblock_t gdiff, /* difference for search comparison */
673 xfs_agblock_t *sbno, /* extent found by search */
674 xfs_extlen_t *slen,
675 xfs_extlen_t *slena, /* aligned length */
676 int dir) /* 0 = search right, 1 = search left */
677{
678 xfs_agblock_t bno;
679 xfs_agblock_t new;
680 xfs_agblock_t sdiff;
681 int error;
682 int i;
683
684 /* The good extent is perfect, no need to search. */
685 if (!gdiff)
686 goto out_use_good;
687
688 /*
689 * Look until we find a better one, run out of space or run off the end.
690 */
691 do {
692 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
693 if (error)
694 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
697 args->minlen, &bno, slena);
698
699 /*
700 * The good extent is closer than this one.
701 */
702 if (!dir) {
703 if (bno >= args->agbno + gdiff)
704 goto out_use_good;
705 } else {
706 if (bno <= args->agbno - gdiff)
707 goto out_use_good;
708 }
709
710 /*
711 * Same distance, compare length and pick the best.
712 */
713 if (*slena >= args->minlen) {
714 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
715 xfs_alloc_fix_len(args);
716
717 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
718 args->alignment, *sbno,
719 *slen, &new);
720
721 /*
722 * Choose closer size and invalidate other cursor.
723 */
724 if (sdiff < gdiff)
725 goto out_use_search;
726 goto out_use_good;
727 }
728
729 if (!dir)
730 error = xfs_btree_increment(*scur, 0, &i);
731 else
732 error = xfs_btree_decrement(*scur, 0, &i);
733 if (error)
734 goto error0;
735 } while (i);
736
737out_use_good:
738 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
739 *scur = NULL;
740 return 0;
741
742out_use_search:
743 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
744 *gcur = NULL;
745 return 0;
746
747error0:
748 /* caller invalidates cursors */
749 return error;
750}
751
752/*
662 * Allocate a variable extent near bno in the allocation group agno. 753 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 754 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 755 * and of the form k * prod + mod unless there's nothing that large.
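The new xfs_alloc_find_best_extent() above factors the duplicated left/right searches into one helper: walk the by-bno btree away from the target until a candidate beats the extent already in hand, or provably cannot. A simplified userspace sketch over a sorted array, with illustrative names:

#include <stdlib.h>

/* returns the index of a better candidate, or -1 to keep the current best */
static int find_best(const long *bno, const long *len, int n,
		     int start, long target, long minlen,
		     long best_diff, int dir /* +1 right, -1 left */)
{
	for (int i = start; i >= 0 && i < n; i += dir) {
		long diff = labs(bno[i] - target);

		if (diff >= best_diff)	/* sorted: nothing closer ahead */
			return -1;	/* keep the extent we already have */
		if (len[i] >= minlen)
			return i;	/* closer and long enough: take it */
		/* closer but too short: keep walking */
	}
	return -1;			/* ran off the end */
}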
@@ -675,7 +766,7 @@ xfs_alloc_ag_vextent_near(
675 xfs_agblock_t gtbnoa; /* aligned ... */ 766 xfs_agblock_t gtbnoa; /* aligned ... */
676 xfs_extlen_t gtdiff; /* difference to right side entry */ 767 xfs_extlen_t gtdiff; /* difference to right side entry */
677 xfs_extlen_t gtlen; /* length of right side entry */ 768 xfs_extlen_t gtlen; /* length of right side entry */
678 xfs_extlen_t gtlena; /* aligned ... */ 769 xfs_extlen_t gtlena = 0; /* aligned ... */
679 xfs_agblock_t gtnew; /* useful start bno of right side */ 770 xfs_agblock_t gtnew; /* useful start bno of right side */
680 int error; /* error code */ 771 int error; /* error code */
681 int i; /* result code, temporary */ 772 int i; /* result code, temporary */
@@ -684,7 +775,7 @@ xfs_alloc_ag_vextent_near(
684 xfs_agblock_t ltbnoa; /* aligned ... */ 775 xfs_agblock_t ltbnoa; /* aligned ... */
685 xfs_extlen_t ltdiff; /* difference to left side entry */ 776 xfs_extlen_t ltdiff; /* difference to left side entry */
686 xfs_extlen_t ltlen; /* length of left side entry */ 777 xfs_extlen_t ltlen; /* length of left side entry */
687 xfs_extlen_t ltlena; /* aligned ... */ 778 xfs_extlen_t ltlena = 0; /* aligned ... */
688 xfs_agblock_t ltnew; /* useful start bno of left side */ 779 xfs_agblock_t ltnew; /* useful start bno of left side */
689 xfs_extlen_t rlen; /* length of returned extent */ 780 xfs_extlen_t rlen; /* length of returned extent */
690#if defined(DEBUG) && defined(__KERNEL__) 781#if defined(DEBUG) && defined(__KERNEL__)
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
925 } 1016 }
926 } 1017 }
927 } while (bno_cur_lt || bno_cur_gt); 1018 } while (bno_cur_lt || bno_cur_gt);
1019
928 /* 1020 /*
929 * Got both cursors still active, need to find better entry. 1021 * Got both cursors still active, need to find better entry.
930 */ 1022 */
931 if (bno_cur_lt && bno_cur_gt) { 1023 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1024 if (ltlena >= args->minlen) {
936 /* 1025 /*
937 * Fix up the length. 1026 * Left side is good, look for a right side entry.
938 */ 1027 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1028 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1029 xfs_alloc_fix_len(args);
941 rlen = args->len; 1030 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1031 args->alignment, ltbno, ltlen, &ltnew);
1032
1033 error = xfs_alloc_find_best_extent(args,
1034 &bno_cur_lt, &bno_cur_gt,
1035 ltdiff, &gtbno, &gtlen, &gtlena,
1036 0 /* search right */);
1037 } else {
1038 ASSERT(gtlena >= args->minlen);
1039
944 /* 1040 /*
945 * Not perfect. 1041 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1042 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1043 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1044 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1045 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1046 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1047
1042 * Right side entry isn't perfect. 1048 error = xfs_alloc_find_best_extent(args,
1043 */ 1049 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1050 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1051 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1052 }
1053
1054 if (error)
1055 goto error0;
1124 } 1056 }
1057
1125 /* 1058 /*
1126 * If we couldn't get anything, give up. 1059 * If we couldn't get anything, give up.
1127 */ 1060 */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1063 args->agbno = NULLAGBLOCK;
1131 return 0; 1064 return 0;
1132 } 1065 }
1066
1133 /* 1067 /*
1134 * At this point we have selected a freespace entry, either to the 1068 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1069 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1080 j = 1;
1147 } else 1081 } else
1148 j = 0; 1082 j = 0;
1083
1149 /* 1084 /*
1150 * Fix up the length and compute the useful address. 1085 * Fix up the length and compute the useful address.
1151 */ 1086 */
@@ -2676,7 +2611,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2678 */ 2613 */
2679static int 2614int
2680xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
79 * allocation maximum size to the size the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
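A toy computation of the reservation the new macro encodes, assuming 512-byte sectors and 4096-byte blocks: the four sector-sized AG headers round up to one filesystem block, plus seven single-block structures (the bno, cnt and ino btree roots and four AGFL blocks). The geometry values are examples only:

#include <stdio.h>

int main(void)
{
	unsigned int agblocks   = 1048576;	/* blocks per AG */
	unsigned int sectsize   = 512;
	unsigned int blocksize  = 4096;
	/* four sector-sized headers, rounded up to whole fs blocks */
	unsigned int hdr_blocks = (4 * sectsize + blocksize - 1) / blocksize;

	printf("max usable blocks: %u\n", agblocks - hdr_blocks - 7);
	return 0;
}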
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..3916925e2584 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -280,38 +280,6 @@ xfs_allocbt_key_diff(
280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
281} 281}
282 282
283STATIC int
284xfs_allocbt_kill_root(
285 struct xfs_btree_cur *cur,
286 struct xfs_buf *bp,
287 int level,
288 union xfs_btree_ptr *newroot)
289{
290 int error;
291
292 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
293 XFS_BTREE_STATS_INC(cur, killroot);
294
295 /*
296 * Update the root pointer, decreasing the level by 1 and then
297 * free the old root.
298 */
299 xfs_allocbt_set_root(cur, newroot, -1);
300 error = xfs_allocbt_free_block(cur, bp);
301 if (error) {
302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
303 return error;
304 }
305
306 XFS_BTREE_STATS_INC(cur, free);
307
308 xfs_btree_setbuf(cur, level, NULL);
309 cur->bc_nlevels--;
310
311 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
312 return 0;
313}
314
315#ifdef DEBUG 283#ifdef DEBUG
316STATIC int 284STATIC int
317xfs_allocbt_keys_inorder( 285xfs_allocbt_keys_inorder(
@@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
423 391
424 .dup_cursor = xfs_allocbt_dup_cursor, 392 .dup_cursor = xfs_allocbt_dup_cursor,
425 .set_root = xfs_allocbt_set_root, 393 .set_root = xfs_allocbt_set_root,
426 .kill_root = xfs_allocbt_kill_root,
427 .alloc_block = xfs_allocbt_alloc_block, 394 .alloc_block = xfs_allocbt_alloc_block,
428 .free_block = xfs_allocbt_free_block, 395 .free_block = xfs_allocbt_free_block,
429 .update_lastrec = xfs_allocbt_update_lastrec, 396 .update_lastrec = xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..c86375378810 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
355 if (mp->m_flags & XFS_MOUNT_WSYNC) { 355 if (mp->m_flags & XFS_MOUNT_WSYNC) {
356 xfs_trans_set_sync(args.trans); 356 xfs_trans_set_sync(args.trans);
357 } 357 }
358
359 if (!error && (flags & ATTR_KERNOTIME) == 0) {
360 xfs_trans_ichgtime(args.trans, dp,
361 XFS_ICHGTIME_CHG);
362 }
358 err2 = xfs_trans_commit(args.trans, 363 err2 = xfs_trans_commit(args.trans,
359 XFS_TRANS_RELEASE_LOG_RES); 364 XFS_TRANS_RELEASE_LOG_RES);
360 xfs_iunlock(dp, XFS_ILOCK_EXCL); 365 xfs_iunlock(dp, XFS_ILOCK_EXCL);
361 366
362 /*
363 * Hit the inode change time.
364 */
365 if (!error && (flags & ATTR_KERNOTIME) == 0) {
366 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
367 }
368 return(error == 0 ? err2 : error); 367 return(error == 0 ? err2 : error);
369 } 368 }
370 369
@@ -420,6 +419,9 @@ xfs_attr_set_int(
420 xfs_trans_set_sync(args.trans); 419 xfs_trans_set_sync(args.trans);
421 } 420 }
422 421
422 if ((flags & ATTR_KERNOTIME) == 0)
423 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
424
423 /* 425 /*
424 * Commit the last in the sequence of transactions. 426 * Commit the last in the sequence of transactions.
425 */ 427 */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
427 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 429 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
428 xfs_iunlock(dp, XFS_ILOCK_EXCL); 430 xfs_iunlock(dp, XFS_ILOCK_EXCL);
429 431
430 /*
431 * Hit the inode change time.
432 */
433 if (!error && (flags & ATTR_KERNOTIME) == 0) {
434 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
435 }
436
437 return(error); 432 return(error);
438 433
439out: 434out:
@@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
567 xfs_trans_set_sync(args.trans); 562 xfs_trans_set_sync(args.trans);
568 } 563 }
569 564
565 if ((flags & ATTR_KERNOTIME) == 0)
566 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
567
570 /* 568 /*
571 * Commit the last in the sequence of transactions. 569 * Commit the last in the sequence of transactions.
572 */ 570 */
@@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
574 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 572 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
575 xfs_iunlock(dp, XFS_ILOCK_EXCL); 573 xfs_iunlock(dp, XFS_ILOCK_EXCL);
576 574
577 /*
578 * Hit the inode change time.
579 */
580 if (!error && (flags & ATTR_KERNOTIME) == 0) {
581 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
582 }
583
584 return(error); 575 return(error);
585 576
586out: 577out:
@@ -1995,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1995 1986
1996 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1987 tmp = (valuelen < XFS_BUF_SIZE(bp))
1997 ? valuelen : XFS_BUF_SIZE(bp); 1988 ? valuelen : XFS_BUF_SIZE(bp);
1998 xfs_biomove(bp, 0, tmp, dst, XBF_READ); 1989 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
1999 xfs_buf_relse(bp); 1990 xfs_buf_relse(bp);
2000 dst += tmp; 1991 dst += tmp;
2001 valuelen -= tmp; 1992 valuelen -= tmp;
@@ -2125,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2125 2116
2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2117 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2127 XFS_BUF_SIZE(bp); 2118 XFS_BUF_SIZE(bp);
2128 xfs_biomove(bp, 0, tmp, src, XBF_WRITE); 2119 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2129 if (tmp < XFS_BUF_SIZE(bp)) 2120 if (tmp < XFS_BUF_SIZE(bp))
2130 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2121 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2131 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2122 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2132 return (error); 2123 return (error);
2133 } 2124 }
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
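Both hunks above add KM_NOFS to allocations made while holding inode locks in the attribute-list path; without it, a KM_SLEEP allocation could recurse into filesystem reclaim and deadlock on those same locks. The equivalent raw-allocator spelling, as a kernel-style sketch:

#include <linux/slab.h>

static void *attr_value_buf(size_t valuelen)
{
	/* may sleep, but the allocator will not recurse into fs reclaim */
	return kmalloc(valuelen, GFP_NOFS);
}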
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..dc3afd7739ff 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -614,7 +614,7 @@ xfs_bmap_add_extent(
614 nblks += cur->bc_private.b.allocated; 614 nblks += cur->bc_private.b.allocated;
615 ASSERT(nblks <= da_old); 615 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 616 if (nblks < da_old)
617 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 618 (int64_t)(da_old - nblks), rsvd);
619 } 619 }
620 /* 620 /*
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
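A standalone sketch of the three-way split pictured in the new comment above: the real extent is carved out of the middle of the delayed extent, the left remainder truncates PREV in place, and the right remainder is inserted after the new record. Plain structs and illustrative values, not the XFS types:

#include <assert.h>
#include <stdio.h>

struct ext { long off, len; };

int main(void)
{
	struct ext prev = { 100, 50 };	/* delayed extent */
	struct ext new  = { 120, 10 };	/* real allocation inside it */
	struct ext left, right;

	assert(new.off > prev.off &&
	       new.off + new.len < prev.off + prev.len);

	left.off  = prev.off;
	left.len  = new.off - prev.off;			/* 20 */
	right.off = new.off + new.len;			/* 130 */
	right.len = prev.off + prev.len - right.off;	/* 20 */

	printf("left [%ld,%ld) new [%ld,%ld) right [%ld,%ld)\n",
	       left.off, left.off + left.len,
	       new.off, new.off + new.len,
	       right.off, right.off + right.len);
	return 0;
}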
@@ -1079,7 +1096,8 @@ xfs_bmap_add_extent_delay_real(
1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 1096 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1080 (cur ? cur->bc_private.b.allocated : 0)); 1097 (cur ? cur->bc_private.b.allocated : 0));
1081 if (diff > 0 && 1098 if (diff > 0 &&
1082 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1099 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1100 -((int64_t)diff), rsvd)) {
1083 /* 1101 /*
1084 * Ick gross gag me with a spoon. 1102 * Ick gross gag me with a spoon.
1085 */ 1103 */
@@ -1089,16 +1107,18 @@ xfs_bmap_add_extent_delay_real(
1089 temp--; 1107 temp--;
1090 diff--; 1108 diff--;
1091 if (!diff || 1109 if (!diff ||
1092 !xfs_mod_incore_sb(ip->i_mount, 1110 !xfs_icsb_modify_counters(ip->i_mount,
1093 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1111 XFS_SBS_FDBLOCKS,
1112 -((int64_t)diff), rsvd))
1094 break; 1113 break;
1095 } 1114 }
1096 if (temp2) { 1115 if (temp2) {
1097 temp2--; 1116 temp2--;
1098 diff--; 1117 diff--;
1099 if (!diff || 1118 if (!diff ||
1100 !xfs_mod_incore_sb(ip->i_mount, 1119 !xfs_icsb_modify_counters(ip->i_mount,
1101 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1120 XFS_SBS_FDBLOCKS,
1121 -((int64_t)diff), rsvd))
1102 break; 1122 break;
1103 } 1123 }
1104 } 1124 }
@@ -1766,7 +1786,7 @@ xfs_bmap_add_extent_hole_delay(
1766 } 1786 }
1767 if (oldlen != newlen) { 1787 if (oldlen != newlen) {
1768 ASSERT(oldlen > newlen); 1788 ASSERT(oldlen > newlen);
1769 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 1789 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1770 (int64_t)(oldlen - newlen), rsvd); 1790 (int64_t)(oldlen - newlen), rsvd);
1771 /* 1791 /*
1772 * Nothing to do for disk quota accounting here. 1792 * Nothing to do for disk quota accounting here.
@@ -2427,7 +2447,7 @@ xfs_bmap_btalloc_nullfb(
2427 startag = ag = 0; 2447 startag = ag = 0;
2428 2448
2429 pag = xfs_perag_get(mp, ag); 2449 pag = xfs_perag_get(mp, ag);
2430 while (*blen < ap->alen) { 2450 while (*blen < args->maxlen) {
2431 if (!pag->pagf_init) { 2451 if (!pag->pagf_init) {
2432 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2452 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2433 XFS_ALLOC_FLAG_TRYLOCK); 2453 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2449,7 +2469,7 @@ xfs_bmap_btalloc_nullfb(
2449 notinit = 1; 2469 notinit = 1;
2450 2470
2451 if (xfs_inode_is_filestream(ap->ip)) { 2471 if (xfs_inode_is_filestream(ap->ip)) {
2452 if (*blen >= ap->alen) 2472 if (*blen >= args->maxlen)
2453 break; 2473 break;
2454 2474
2455 if (ap->userdata) { 2475 if (ap->userdata) {
@@ -2495,14 +2515,14 @@ xfs_bmap_btalloc_nullfb(
2495 * If the best seen length is less than the request 2515 * If the best seen length is less than the request
2496 * length, use the best as the minimum. 2516 * length, use the best as the minimum.
2497 */ 2517 */
2498 else if (*blen < ap->alen) 2518 else if (*blen < args->maxlen)
2499 args->minlen = *blen; 2519 args->minlen = *blen;
2500 /* 2520 /*
2501 * Otherwise we've seen an extent as big as alen, 2521 * Otherwise we've seen an extent as big as maxlen,
2502 * use that as the minimum. 2522 * use that as the minimum.
2503 */ 2523 */
2504 else 2524 else
2505 args->minlen = ap->alen; 2525 args->minlen = args->maxlen;
2506 2526
2507 /* 2527 /*
2508 * set the failure fallback case to look in the selected 2528 * set the failure fallback case to look in the selected
@@ -2570,7 +2590,9 @@ xfs_bmap_btalloc(
2570 args.tp = ap->tp; 2590 args.tp = ap->tp;
2571 args.mp = mp; 2591 args.mp = mp;
2572 args.fsbno = ap->rval; 2592 args.fsbno = ap->rval;
2573 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2593
2594 /* Trim the allocation back to the maximum an AG can fit. */
2595 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2574 args.firstblock = ap->firstblock; 2596 args.firstblock = ap->firstblock;
2575 blen = 0; 2597 blen = 0;
2576 if (nullfb) { 2598 if (nullfb) {
@@ -2618,7 +2640,7 @@ xfs_bmap_btalloc(
2618 /* 2640 /*
2619 * Adjust for alignment 2641 * Adjust for alignment
2620 */ 2642 */
2621 if (blen > args.alignment && blen <= ap->alen) 2643 if (blen > args.alignment && blen <= args.maxlen)
2622 args.minlen = blen - args.alignment; 2644 args.minlen = blen - args.alignment;
2623 args.minalignslop = 0; 2645 args.minalignslop = 0;
2624 } else { 2646 } else {
@@ -2637,7 +2659,7 @@ xfs_bmap_btalloc(
2637 * of minlen+alignment+slop doesn't go up 2659 * of minlen+alignment+slop doesn't go up
2638 * between the calls. 2660 * between the calls.
2639 */ 2661 */
2640 if (blen > mp->m_dalign && blen <= ap->alen) 2662 if (blen > mp->m_dalign && blen <= args.maxlen)
2641 nextminlen = blen - mp->m_dalign; 2663 nextminlen = blen - mp->m_dalign;
2642 else 2664 else
2643 nextminlen = args.minlen; 2665 nextminlen = args.minlen;
@@ -3111,9 +3133,10 @@ xfs_bmap_del_extent(
3111 * Nothing to do for disk quota accounting here. 3133 * Nothing to do for disk quota accounting here.
3112 */ 3134 */
3113 ASSERT(da_old >= da_new); 3135 ASSERT(da_old >= da_new);
3114 if (da_old > da_new) 3136 if (da_old > da_new) {
3115 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), 3137 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3116 rsvd); 3138 (int64_t)(da_old - da_new), rsvd);
3139 }
3117done: 3140done:
3118 *logflagsp = flags; 3141 *logflagsp = flags;
3119 return error; 3142 return error;
@@ -4481,6 +4504,16 @@ xfs_bmapi(
4481 /* Figure out the extent size, adjust alen */ 4504 /* Figure out the extent size, adjust alen */
4482 extsz = xfs_get_extsz_hint(ip); 4505 extsz = xfs_get_extsz_hint(ip);
4483 if (extsz) { 4506 if (extsz) {
4507 /*
4508 * make sure we don't exceed a single
4509 * extent length when we align the
 4510 * extent by reducing the length we
 4511 * are going to allocate by the maximum
 4512 * amount extent size alignment may
4513 * require.
4514 */
4515 alen = XFS_FILBLKS_MIN(len,
4516 MAXEXTLEN - (2 * extsz - 1));
4484 error = xfs_bmap_extsize_align(mp, 4517 error = xfs_bmap_extsize_align(mp,
4485 &got, &prev, extsz, 4518 &got, &prev, extsz,
4486 rt, eof, 4519 rt, eof,
@@ -4526,13 +4559,13 @@ xfs_bmapi(
4526 -((int64_t)extsz), (flags & 4559 -((int64_t)extsz), (flags &
4527 XFS_BMAPI_RSVBLOCKS)); 4560 XFS_BMAPI_RSVBLOCKS));
4528 } else { 4561 } else {
4529 error = xfs_mod_incore_sb(mp, 4562 error = xfs_icsb_modify_counters(mp,
4530 XFS_SBS_FDBLOCKS, 4563 XFS_SBS_FDBLOCKS,
4531 -((int64_t)alen), (flags & 4564 -((int64_t)alen), (flags &
4532 XFS_BMAPI_RSVBLOCKS)); 4565 XFS_BMAPI_RSVBLOCKS));
4533 } 4566 }
4534 if (!error) { 4567 if (!error) {
4535 error = xfs_mod_incore_sb(mp, 4568 error = xfs_icsb_modify_counters(mp,
4536 XFS_SBS_FDBLOCKS, 4569 XFS_SBS_FDBLOCKS,
4537 -((int64_t)indlen), (flags & 4570 -((int64_t)indlen), (flags &
4538 XFS_BMAPI_RSVBLOCKS)); 4571 XFS_BMAPI_RSVBLOCKS));
@@ -4542,7 +4575,7 @@ xfs_bmapi(
4542 (int64_t)extsz, (flags & 4575 (int64_t)extsz, (flags &
4543 XFS_BMAPI_RSVBLOCKS)); 4576 XFS_BMAPI_RSVBLOCKS));
4544 else if (error) 4577 else if (error)
4545 xfs_mod_incore_sb(mp, 4578 xfs_icsb_modify_counters(mp,
4546 XFS_SBS_FDBLOCKS, 4579 XFS_SBS_FDBLOCKS,
4547 (int64_t)alen, (flags & 4580 (int64_t)alen, (flags &
4548 XFS_BMAPI_RSVBLOCKS)); 4581 XFS_BMAPI_RSVBLOCKS));
@@ -4744,8 +4777,12 @@ xfs_bmapi(
4744 * Check if writing previously allocated but 4777 * Check if writing previously allocated but
4745 * unwritten extents. 4778 * unwritten extents.
4746 */ 4779 */
4747 if (wr && mval->br_state == XFS_EXT_UNWRITTEN && 4780 if (wr &&
4748 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { 4781 ((mval->br_state == XFS_EXT_UNWRITTEN &&
4782 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
4783 (mval->br_state == XFS_EXT_NORM &&
4784 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
4785 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
4749 /* 4786 /*
4750 * Modify (by adding) the state flag, if writing. 4787 * Modify (by adding) the state flag, if writing.
4751 */ 4788 */
@@ -4757,7 +4794,9 @@ xfs_bmapi(
4757 *firstblock; 4794 *firstblock;
4758 cur->bc_private.b.flist = flist; 4795 cur->bc_private.b.flist = flist;
4759 } 4796 }
4760 mval->br_state = XFS_EXT_NORM; 4797 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4798 ? XFS_EXT_NORM
4799 : XFS_EXT_UNWRITTEN;
4761 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4800 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4762 firstblock, flist, &tmp_logflags, 4801 firstblock, flist, &tmp_logflags,
4763 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4802 whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
@@ -5200,7 +5239,7 @@ xfs_bunmapi(
5200 ip, -((long)del.br_blockcount), 0, 5239 ip, -((long)del.br_blockcount), 0,
5201 XFS_QMOPT_RES_RTBLKS); 5240 XFS_QMOPT_RES_RTBLKS);
5202 } else { 5241 } else {
5203 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5242 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5204 (int64_t)del.br_blockcount, rsvd); 5243 (int64_t)del.br_blockcount, rsvd);
5205 (void)xfs_trans_reserve_quota_nblks(NULL, 5244 (void)xfs_trans_reserve_quota_nblks(NULL,
5206 ip, -((long)del.br_blockcount), 0, 5245 ip, -((long)del.br_blockcount), 0,
@@ -5461,8 +5500,13 @@ xfs_getbmap(
5461 if (error) 5500 if (error)
5462 goto out_unlock_iolock; 5501 goto out_unlock_iolock;
5463 } 5502 }
5464 5503 /*
5465 ASSERT(ip->i_delayed_blks == 0); 5504 * even after flushing the inode, there can still be delalloc
5505 * blocks on the inode beyond EOF due to speculative
5506 * preallocation. These are not removed until the release
5507 * function is called or the inode is inactivated. Hence we
5508 * cannot assert here that ip->i_delayed_blks == 0.
5509 */
5466 } 5510 }
5467 5511
5468 lock = xfs_ilock_map_shared(ip); 5512 lock = xfs_ilock_map_shared(ip);
@@ -6060,3 +6104,79 @@ xfs_bmap_disk_count_leaves(
6060 *count += xfs_bmbt_disk_get_blockcount(frp); 6104 *count += xfs_bmbt_disk_get_blockcount(frp);
6061 } 6105 }
6062} 6106}
6107
6108/*
 6109 * dead simple method of punching delayed allocation blocks from a range in
 6110 * the inode. Walks a block at a time so it will be slow, but is only executed
 6111 * in rare error cases so the overhead is not critical. This will always punch
6112 * both the start and end blocks, even if the ranges only partially overlap
6113 * them, so it is up to the caller to ensure that partial blocks are not
6114 * passed in.
6115 */
6116int
6117xfs_bmap_punch_delalloc_range(
6118 struct xfs_inode *ip,
6119 xfs_fileoff_t start_fsb,
6120 xfs_fileoff_t length)
6121{
6122 xfs_fileoff_t remaining = length;
6123 int error = 0;
6124
6125 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6126
6127 do {
6128 int done;
6129 xfs_bmbt_irec_t imap;
6130 int nimaps = 1;
6131 xfs_fsblock_t firstblock;
6132 xfs_bmap_free_t flist;
6133
6134 /*
6135 * Map the range first and check that it is a delalloc extent
6136 * before trying to unmap the range. Otherwise we will be
6137 * trying to remove a real extent (which requires a
6138 * transaction) or a hole, which is probably a bad idea...
6139 */
6140 error = xfs_bmapi(NULL, ip, start_fsb, 1,
6141 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
6142 &nimaps, NULL);
6143
6144 if (error) {
6145 /* something screwed, just bail */
6146 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6147 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
6148 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6149 ip->i_ino, start_fsb);
6150 }
6151 break;
6152 }
6153 if (!nimaps) {
6154 /* nothing there */
6155 goto next_block;
6156 }
6157 if (imap.br_startblock != DELAYSTARTBLOCK) {
6158 /* been converted, ignore */
6159 goto next_block;
6160 }
6161 WARN_ON(imap.br_blockcount == 0);
6162
6163 /*
6164 * Note: while we initialise the firstblock/flist pair, they
6165 * should never be used because blocks should never be
6166 * allocated or freed for a delalloc extent and hence we need
6167 * don't cancel or finish them after the xfs_bunmapi() call.
6168 */
6169 xfs_bmap_init(&flist, &firstblock);
6170 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6171 &flist, &done);
6172 if (error)
6173 break;
6174
6175 ASSERT(!flist.xbf_count && !flist.xbf_first);
6176next_block:
6177 start_fsb++;
6178 remaining--;
6179 } while(remaining > 0);
6180
6181 return error;
6182}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..3651191daea1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -74,9 +74,12 @@ typedef struct xfs_bmap_free
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 75 /* combine contig. space */
76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ 77/*
78 /* need write cache flushing and no */ 78 * unwritten extent conversion - this needs write cache flushing and no additional
79 /* additional allocation alignments */ 79 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
80 * from written to unwritten, otherwise convert from unwritten to written.
81 */
82#define XFS_BMAPI_CONVERT 0x200
80 83
81#define XFS_BMAPI_FLAGS \ 84#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_WRITE, "WRITE" }, \ 85 { XFS_BMAPI_WRITE, "WRITE" }, \
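A small sketch of the conversion-direction decode that the xfs_bmapi() hunk earlier in this diff performs with this flag: CONVERT together with PREALLOC means written to unwritten, while an unwritten extent with neither PREALLOC nor DELAY set converts to written. The flag values below are illustrative:

#include <stdio.h>

#define DELAY    0x002
#define PREALLOC 0x040
#define CONVERT  0x200

static const char *conv_dir(unsigned flags, int is_unwritten)
{
	if (is_unwritten && !(flags & (PREALLOC | DELAY)))
		return "unwritten -> written";
	if (!is_unwritten &&
	    (flags & (PREALLOC | CONVERT)) == (PREALLOC | CONVERT))
		return "written -> unwritten";
	return "no conversion";
}

int main(void)
{
	printf("%s\n", conv_dir(PREALLOC | CONVERT, 0));
	printf("%s\n", conv_dir(0, 1));
	return 0;
}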
@@ -391,6 +394,11 @@ xfs_bmap_count_blocks(
391 int whichfork, 394 int whichfork,
392 int *count); 395 int *count);
393 396
397int
398xfs_bmap_punch_delalloc_range(
399 struct xfs_inode *ip,
400 xfs_fileoff_t start_fsb,
401 xfs_fileoff_t length);
394#endif /* __KERNEL__ */ 402#endif /* __KERNEL__ */
395 403
396#endif /* __XFS_BMAP_H__ */ 404#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
217 */ 217 */
218 for (i = 0; i < cur->bc_nlevels; i++) { 218 for (i = 0; i < cur->bc_nlevels; i++) {
219 if (cur->bc_bufs[i]) 219 if (cur->bc_bufs[i])
220 xfs_btree_setbuf(cur, i, NULL); 220 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
221 else if (!error) 221 else if (!error)
222 break; 222 break;
223 } 223 }
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -656,7 +655,7 @@ xfs_btree_reada_bufl(
656 655
657 ASSERT(fsbno != NULLFSBLOCK); 656 ASSERT(fsbno != NULLFSBLOCK);
658 d = XFS_FSB_TO_DADDR(mp, fsbno); 657 d = XFS_FSB_TO_DADDR(mp, fsbno);
659 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 658 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
660} 659}
661 660
662/* 661/*
@@ -676,7 +675,7 @@ xfs_btree_reada_bufs(
676 ASSERT(agno != NULLAGNUMBER); 675 ASSERT(agno != NULLAGNUMBER);
677 ASSERT(agbno != NULLAGBLOCK); 676 ASSERT(agbno != NULLAGBLOCK);
678 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 677 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 678 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
680} 679}
681 680
682STATIC int 681STATIC int
@@ -763,22 +762,19 @@ xfs_btree_readahead(
763 * Set the buffer for level "lev" in the cursor to bp, releasing 762 * Set the buffer for level "lev" in the cursor to bp, releasing
764 * any previous buffer. 763 * any previous buffer.
765 */ 764 */
766void 765STATIC void
767xfs_btree_setbuf( 766xfs_btree_setbuf(
768 xfs_btree_cur_t *cur, /* btree cursor */ 767 xfs_btree_cur_t *cur, /* btree cursor */
769 int lev, /* level in btree */ 768 int lev, /* level in btree */
770 xfs_buf_t *bp) /* new buffer to set */ 769 xfs_buf_t *bp) /* new buffer to set */
771{ 770{
772 struct xfs_btree_block *b; /* btree block */ 771 struct xfs_btree_block *b; /* btree block */
773 xfs_buf_t *obp; /* old buffer pointer */
774 772
775 obp = cur->bc_bufs[lev]; 773 if (cur->bc_bufs[lev])
776 if (obp) 774 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
777 xfs_trans_brelse(cur->bc_tp, obp);
778 cur->bc_bufs[lev] = bp; 775 cur->bc_bufs[lev] = bp;
779 cur->bc_ra[lev] = 0; 776 cur->bc_ra[lev] = 0;
780 if (!bp) 777
781 return;
782 b = XFS_BUF_TO_BLOCK(bp); 778 b = XFS_BUF_TO_BLOCK(bp);
783 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 779 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
784 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 780 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -947,13 +943,13 @@ xfs_btree_set_refs(
947 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
948 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
949 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
951 break; 947 break;
952 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
954 break; 950 break;
955 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
956 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
957 break; 953 break;
958 default: 954 default:
959 ASSERT(0); 955 ASSERT(0);
@@ -3011,6 +3007,43 @@ out0:
3011 return 0; 3007 return 0;
3012} 3008}
3013 3009
3010/*
 3011 * Kill the current root node, and replace it with its only child node.
3012 */
3013STATIC int
3014xfs_btree_kill_root(
3015 struct xfs_btree_cur *cur,
3016 struct xfs_buf *bp,
3017 int level,
3018 union xfs_btree_ptr *newroot)
3019{
3020 int error;
3021
3022 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3023 XFS_BTREE_STATS_INC(cur, killroot);
3024
3025 /*
3026 * Update the root pointer, decreasing the level by 1 and then
3027 * free the old root.
3028 */
3029 cur->bc_ops->set_root(cur, newroot, -1);
3030
3031 error = cur->bc_ops->free_block(cur, bp);
3032 if (error) {
3033 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3034 return error;
3035 }
3036
3037 XFS_BTREE_STATS_INC(cur, free);
3038
3039 cur->bc_bufs[level] = NULL;
3040 cur->bc_ra[level] = 0;
3041 cur->bc_nlevels--;
3042
3043 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3044 return 0;
3045}
3046
3014STATIC int 3047STATIC int
3015xfs_btree_dec_cursor( 3048xfs_btree_dec_cursor(
3016 struct xfs_btree_cur *cur, 3049 struct xfs_btree_cur *cur,
@@ -3195,7 +3228,7 @@ xfs_btree_delrec(
3195 * Make it the new root of the btree. 3228 * Make it the new root of the btree.
3196 */ 3229 */
3197 pp = xfs_btree_ptr_addr(cur, 1, block); 3230 pp = xfs_btree_ptr_addr(cur, 1, block);
3198 error = cur->bc_ops->kill_root(cur, bp, level, pp); 3231 error = xfs_btree_kill_root(cur, bp, level, pp);
3199 if (error) 3232 if (error)
3200 goto error0; 3233 goto error0;
3201 } else if (level > 0) { 3234 } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
152 152
153 /* update btree root pointer */ 153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur, 154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change); 155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158 156
159 /* block allocation / freeing */ 157 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur, 158 int (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
399 xfs_agblock_t agbno, /* allocation group block number */ 397 xfs_agblock_t agbno, /* allocation group block number */
400 xfs_extlen_t count); /* count of filesystem blocks */ 398 xfs_extlen_t count); /* count of filesystem blocks */
401 399
402/*
403 * Set the buffer for level "lev" in the cursor to bp, releasing
404 * any previous buffer.
405 */
406void
407xfs_btree_setbuf(
408 xfs_btree_cur_t *cur, /* btree cursor */
409 int lev, /* level in btree */
410 struct xfs_buf *bp); /* new buffer to set */
411
412 400
413/* 401/*
414 * Common btree core entry points. 402 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..6f8c21ce0d6d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 145
147/* 146/*
148 * This returns the number of log iovecs needed to log the 147 * This returns the number of log iovecs needed to log the
@@ -428,13 +427,15 @@ xfs_buf_item_unpin(
428 427
429 if (remove) { 428 if (remove) {
430 /* 429 /*
431 * We have to remove the log item from the transaction 430 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 431 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 432 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 433 * don't, the unlock that occurs later in
434 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 435 * buffer which we no longer have a hold on.
436 */ 436 */
437 xfs_trans_del_item(lip); 437 if (lip->li_desc)
438 xfs_trans_del_item(lip);
438 439
439 /* 440 /*
440 * Since the transaction no longer refers to the buffer, 441 * Since the transaction no longer refers to the buffer,
@@ -450,7 +451,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 451 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 452 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 453 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 454 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 455 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 456 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 457 } else {
@@ -692,8 +693,7 @@ xfs_buf_item_init(
692 * the first. If we do already have one, there is 693 * the first. If we do already have one, there is
693 * nothing to do here so return. 694 * nothing to do here so return.
694 */ 695 */
695 if (bp->b_mount != mp) 696 ASSERT(bp->b_target->bt_mount == mp);
696 bp->b_mount = mp;
697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
699 if (lip->li_type == XFS_LI_BUF) { 699 if (lip->li_type == XFS_LI_BUF) {
@@ -919,15 +919,26 @@ xfs_buf_attach_iodone(
919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
920} 920}
921 921
922/*
923 * We can have many callbacks on a buffer. Running the callbacks individually
924 * can cause a lot of contention on the AIL lock, so we allow for a single
 925 * callback to scan the remaining lip->li_bio_list for other items with
 926 * the same type and callback, and process them all in the first call.
927 *
928 * As a result, the loop walking the callback list below will also modify the
 929 * list. It removes the first item from the list and then runs the callback.
930 * The loop then restarts from the new head of the list. This allows the
931 * callback to scan and modify the list attached to the buffer and we don't
932 * have to care about maintaining a next item pointer.
933 */
922STATIC void 934STATIC void
923xfs_buf_do_callbacks( 935xfs_buf_do_callbacks(
924 xfs_buf_t *bp, 936 struct xfs_buf *bp)
925 xfs_log_item_t *lip)
926{ 937{
927 xfs_log_item_t *nlip; 938 struct xfs_log_item *lip;
928 939
929 while (lip != NULL) { 940 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
930 nlip = lip->li_bio_list; 941 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
931 ASSERT(lip->li_cb != NULL); 942 ASSERT(lip->li_cb != NULL);
932 /* 943 /*
933 * Clear the next pointer so we don't have any 944 * Clear the next pointer so we don't have any
@@ -937,7 +948,6 @@ xfs_buf_do_callbacks(
937 */ 948 */
938 lip->li_bio_list = NULL; 949 lip->li_bio_list = NULL;
939 lip->li_cb(bp, lip); 950 lip->li_cb(bp, lip);
940 lip = nlip;
941 } 951 }
942} 952}
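The list walk above is intentionally restartable: it never caches a next pointer, so a callback is free to consume further entries itself. A stand-alone sketch of the same pop-the-head pattern, using illustrative types rather than the XFS ones:

#include <stddef.h>

struct cb_item {
	struct cb_item	*next;
	void		(*cb)(struct cb_item *item);
};

static void
run_callbacks(struct cb_item **headp)
{
	struct cb_item	*item;

	while ((item = *headp) != NULL) {
		*headp = item->next;	/* pop the head first... */
		item->next = NULL;	/* ...so the callback may requeue it */
		item->cb(item);		/* may also consume more of *headp */
	}
}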
943 953
@@ -950,128 +960,76 @@ xfs_buf_do_callbacks(
950 */ 960 */
951void 961void
952xfs_buf_iodone_callbacks( 962xfs_buf_iodone_callbacks(
953 xfs_buf_t *bp) 963 struct xfs_buf *bp)
954{ 964{
955 xfs_log_item_t *lip; 965 struct xfs_log_item *lip = bp->b_fspriv;
956 static ulong lasttime; 966 struct xfs_mount *mp = lip->li_mountp;
957 static xfs_buftarg_t *lasttarg; 967 static ulong lasttime;
958 xfs_mount_t *mp; 968 static xfs_buftarg_t *lasttarg;
959 969
960 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 970 if (likely(!XFS_BUF_GETERROR(bp)))
961 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 971 goto do_callbacks;
962 972
963 if (XFS_BUF_GETERROR(bp) != 0) { 973 /*
964 /* 974 * If we've already decided to shutdown the filesystem because of
965 * If we've already decided to shutdown the filesystem 975 * I/O errors, there's no point in giving this a retry.
966 * because of IO errors, there's no point in giving this 976 */
967 * a retry. 977 if (XFS_FORCED_SHUTDOWN(mp)) {
968 */ 978 XFS_BUF_SUPER_STALE(bp);
969 mp = lip->li_mountp; 979 trace_xfs_buf_item_iodone(bp, _RET_IP_);
970 if (XFS_FORCED_SHUTDOWN(mp)) { 980 goto do_callbacks;
971 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 981 }
972 XFS_BUF_SUPER_STALE(bp);
973 trace_xfs_buf_item_iodone(bp, _RET_IP_);
974 xfs_buf_do_callbacks(bp, lip);
975 XFS_BUF_SET_FSPRIVATE(bp, NULL);
976 XFS_BUF_CLR_IODONE_FUNC(bp);
977 xfs_biodone(bp);
978 return;
979 }
980 982
981 if ((XFS_BUF_TARGET(bp) != lasttarg) || 983 if (XFS_BUF_TARGET(bp) != lasttarg ||
982 (time_after(jiffies, (lasttime + 5*HZ)))) { 984 time_after(jiffies, (lasttime + 5*HZ))) {
983 lasttime = jiffies; 985 lasttime = jiffies;
984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
985 " block 0x%llx in %s", 987 " block 0x%llx in %s",
986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
988 } 990 }
989 lasttarg = XFS_BUF_TARGET(bp); 991 lasttarg = XFS_BUF_TARGET(bp);
990 992
991 if (XFS_BUF_ISASYNC(bp)) { 993 /*
 992	 /* 994	 * If the write was asynchronous then no one will be looking for the
993 * If the write was asynchronous then noone will be 995 * error. Clear the error state and write the buffer out again.
994 * looking for the error. Clear the error state 996 *
995 * and write the buffer out again delayed write. 997 * During sync or umount we'll write all pending buffers again
996 * 998 * synchronous, which will catch these errors if they keep hanging
997 * XXXsup This is OK, so long as we catch these 999 * around.
998 * before we start the umount; we don't want these 1000 */
999 * DELWRI metadata bufs to be hanging around. 1001 if (XFS_BUF_ISASYNC(bp)) {
1000 */ 1002 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1001 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1003
1002 1004 if (!XFS_BUF_ISSTALE(bp)) {
1003 if (!(XFS_BUF_ISSTALE(bp))) { 1005 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DELAYWRITE(bp);
1005 XFS_BUF_DONE(bp);
1006 XFS_BUF_SET_START(bp);
1007 }
1008 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1009 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1010 xfs_buf_relse(bp);
1011 } else {
1012 /*
1013 * If the write of the buffer was not asynchronous,
1014 * then we want to make sure to return the error
1015 * to the caller of bwrite(). Because of this we
1016 * cannot clear the B_ERROR state at this point.
1017 * Instead we install a callback function that
1018 * will be called when the buffer is released, and
1019 * that routine will clear the error state and
1020 * set the buffer to be written out again after
1021 * some delay.
1022 */
1023 /* We actually overwrite the existing b-relse
1024 function at times, but we're gonna be shutting down
1025 anyway. */
1026 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1027 XFS_BUF_DONE(bp); 1006 XFS_BUF_DONE(bp);
1028 XFS_BUF_FINISH_IOWAIT(bp); 1007 XFS_BUF_SET_START(bp);
1029 } 1008 }
1009 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1010 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1011 xfs_buf_relse(bp);
1030 return; 1012 return;
1031 } 1013 }
1032 1014
1033 xfs_buf_do_callbacks(bp, lip); 1015 /*
1034 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1016 * If the write of the buffer was synchronous, we want to make
1035 XFS_BUF_CLR_IODONE_FUNC(bp); 1017 * sure to return the error to the caller of xfs_bwrite().
1036 xfs_biodone(bp); 1018 */
1037}
1038
1039/*
1040 * This is a callback routine attached to a buffer which gets an error
1041 * when being written out synchronously.
1042 */
1043STATIC void
1044xfs_buf_error_relse(
1045 xfs_buf_t *bp)
1046{
1047 xfs_log_item_t *lip;
1048 xfs_mount_t *mp;
1049
1050 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1051 mp = (xfs_mount_t *)lip->li_mountp;
1052 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1053
1054 XFS_BUF_STALE(bp); 1019 XFS_BUF_STALE(bp);
1055 XFS_BUF_DONE(bp); 1020 XFS_BUF_DONE(bp);
1056 XFS_BUF_UNDELAYWRITE(bp); 1021 XFS_BUF_UNDELAYWRITE(bp);
1057 XFS_BUF_ERROR(bp,0);
1058 1022
1059 trace_xfs_buf_error_relse(bp, _RET_IP_); 1023 trace_xfs_buf_error_relse(bp, _RET_IP_);
1024 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1060 1025
1061 if (! XFS_FORCED_SHUTDOWN(mp)) 1026do_callbacks:
1062 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1027 xfs_buf_do_callbacks(bp);
1063 /*
1064 * We have to unpin the pinned buffers so do the
1065 * callbacks.
1066 */
1067 xfs_buf_do_callbacks(bp, lip);
1068 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1028 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1069 XFS_BUF_CLR_IODONE_FUNC(bp); 1029 XFS_BUF_CLR_IODONE_FUNC(bp);
1070 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1030 xfs_buf_ioend(bp, 0);
1071 xfs_buf_relse(bp);
1072} 1031}
1073 1032
1074
1075/* 1033/*
1076 * This is the iodone() function for buffers which have been 1034 * This is the iodone() function for buffers which have been
1077 * logged. It is called when they are eventually flushed out. 1035 * logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..1c00bedb3175 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2042,7 +2042,7 @@ xfs_da_do_buf(
2042 mappedbno, nmapped, 0, &bp); 2042 mappedbno, nmapped, 0, &bp);
2043 break; 2043 break;
2044 case 3: 2044 case 3:
2045 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); 2045 xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
2046 error = 0; 2046 error = 0;
2047 bp = NULL; 2047 bp = NULL;
2048 break; 2048 break;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a22..e60490bc00a6 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
377 ip->i_d.di_format = tip->i_d.di_format; 377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp; 378 tip->i_d.di_format = tmp;
379 379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
 385	 * tracking beyond EOF so that we don't underrun the i_delayed_blks
 386	 * counter on that inode when the fork is truncated away as the
 387	 * temporary inode is unlinked.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
380 ilf_fields = XFS_ILOG_CORE; 393 ilf_fields = XFS_ILOG_CORE;
381 394
382 switch(ip->i_d.di_format) { 395 switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
49 __be32 di_uid; /* owner's user id */ 49 __be32 di_uid; /* owner's user id */
50 __be32 di_gid; /* owner's group id */ 50 __be32 di_gid; /* owner's group id */
51 __be32 di_nlink; /* number of links to file */ 51 __be32 di_nlink; /* number of links to file */
52 __be16 di_projid; /* owner's project id */ 52 __be16 di_projid_lo; /* lower part of owner's project id */
 53	__u8 di_pad[8]; /* unused, zeroed space */ 53	__be16 di_projid_hi; /* higher part of owner's project id */
54 __u8 di_pad[6]; /* unused, zeroed space */
54 __be16 di_flushiter; /* incremented on flush */ 55 __be16 di_flushiter; /* incremented on flush */
55 xfs_timestamp_t di_atime; /* time last accessed */ 56 xfs_timestamp_t di_atime; /* time last accessed */
56 xfs_timestamp_t di_mtime; /* time last modified */ 57 xfs_timestamp_t di_mtime; /* time last modified */
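With the project id now split across two 16 bit on-disk fields, the full 32 bit value has to be reassembled in core. xfs_set_projid() is used later in this diff; the get/set pair sketched here follows that model and is written out for illustration only:

static inline prid_t
xfs_get_projid(struct xfs_inode *ip)
{
	/* hi:lo concatenation of the two on-disk halves */
	return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
}

static inline void
xfs_set_projid(struct xfs_inode *ip, prid_t projid)
{
	ip->i_d.di_projid_hi = (__uint16_t)(projid >> 16);
	ip->i_d.di_projid_lo = (__uint16_t)(projid & 0xffff);
}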
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
961 if (i > ra_current && 961 if (i > ra_current &&
962 map[ra_index].br_blockcount >= 962 map[ra_index].br_blockcount >=
963 mp->m_dirblkfsbs) { 963 mp->m_dirblkfsbs) {
964 xfs_baread(mp->m_ddev_targp, 964 xfs_buf_readahead(mp->m_ddev_targp,
965 XFS_FSB_TO_DADDR(mp, 965 XFS_FSB_TO_DADDR(mp,
966 map[ra_index].br_startblock + 966 map[ra_index].br_startblock +
967 ra_offset), 967 ra_offset),
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed9990267661..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
58int xfs_etest[XFS_NUM_INJECT_ERROR]; 58int xfs_etest[XFS_NUM_INJECT_ERROR];
59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
61int xfs_error_test_active;
61 62
62int 63int
63xfs_error_test(int error_tag, int *fsidp, char *expression, 64xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
108 len = strlen(mp->m_fsname); 109 len = strlen(mp->m_fsname);
109 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); 110 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
110 strcpy(xfs_etest_fsname[i], mp->m_fsname); 111 strcpy(xfs_etest_fsname[i], mp->m_fsname);
112 xfs_error_test_active++;
111 return 0; 113 return 0;
112 } 114 }
113 } 115 }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
137 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
138 kmem_free(xfs_etest_fsname[i]); 140 kmem_free(xfs_etest_fsname[i]);
139 xfs_etest_fsname[i] = NULL; 141 xfs_etest_fsname[i] = NULL;
142 xfs_error_test_active--;
140 } 143 }
141 } 144 }
142 145
@@ -149,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
149} 152}
150#endif /* DEBUG */ 153#endif /* DEBUG */
151 154
152
153void
154xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
155{
156 va_list ap;
157
158 va_start(ap, fmt);
159 xfs_fs_vcmn_err(level, mp, fmt, ap);
160 va_end(ap);
161}
162
163void
164xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
165{
166 va_list ap;
167
168#ifdef DEBUG
169 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
170#endif
171
172 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
173 && (level & CE_ALERT)) {
174 level &= ~CE_ALERT;
175 level |= CE_PANIC;
176 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
177 }
178 va_start(ap, fmt);
179 xfs_fs_vcmn_err(level, mp, fmt, ap);
180 va_end(ap);
181}
182
183void 155void
184xfs_error_report( 156xfs_error_report(
185 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb82..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,16 +127,17 @@ extern void xfs_corruption_error(const char *tag, int level,
127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
128 128
129#ifdef DEBUG 129#ifdef DEBUG
130extern int xfs_error_test_active;
130extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 131extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
131 132
132#define XFS_NUM_INJECT_ERROR 10 133#define XFS_NUM_INJECT_ERROR 10
133#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
134 ((expr) || \ 135 ((expr) || (xfs_error_test_active && \
135 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
136 (rf))) 137 (rf))))
137 138
138extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
139extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
140#else 141#else
141#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
142#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -161,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
161 162
162struct xfs_mount; 163struct xfs_mount;
163 164
164extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
165 char *fmt, va_list ap)
166 __attribute__ ((format (printf, 3, 0)));
167extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
168 char *fmt, ...)
169 __attribute__ ((format (printf, 4, 5)));
170extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
171 __attribute__ ((format (printf, 3, 4)));
172
173extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
174 166
175#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
176 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
177 169
178#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
179 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
180 175
181#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
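Two small idioms are at work in this hunk. XFS_TEST_ERROR now short-circuits on the global xfs_error_test_active counter, so when no error tags are armed the check costs one load and a branch instead of a function call; and xfs_fs_mount_cmn_err is rewritten as do { } while (0), the standard way to make a multi-statement macro behave as a single statement after if/else. A condensed stand-alone sketch of the counter gating, with illustrative names only:

static int inject_active;	/* bumped/dropped as tags are armed/cleared */

static int
inject_hit(int tag)
{
	/* slow path: look the tag up in the armed table, randomise, ... */
	return 0;
}

/* common case: inject_active == 0, so inject_hit() is never called */
#define FAIL_ON(expr, tag) \
	((expr) || (inject_active && inject_hit(tag)))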
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
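The handoff above rests on test_and_clear_bit() being atomic: of two callers racing on the same bit, exactly one observes it set. Matching the polarity of __xfs_efi_release(), the caller that finds the bit already clear is the last one in and performs the teardown. A minimal sketch with illustrative types; obj_free() is a hypothetical helper:

#include <linux/bitops.h>

#define OBJ_COMMITTED	0	/* bit number, as with XFS_EFI_COMMITTED */

struct obj {
	unsigned long	flags;
};

static void
release_once(struct obj *o)
{
	if (!test_and_clear_bit(OBJ_COMMITTED, &o->flags))
		obj_free(o);	/* hypothetical final teardown */
	/* else: committed state consumed; the racing path does the free */
}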
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
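The rewrite replaces an AIL-lock-protected counter with a lockless countdown: atomic_sub_and_test() returns true only for the subtraction that takes the count to zero, so exactly one EFD path triggers the final release. A stand-alone sketch with illustrative types; counted_free() is hypothetical:

#include <linux/atomic.h>

struct counted {
	atomic_t	remaining;	/* extents still to be accounted */
};

static void
put_extents(struct counted *c, unsigned int nr)
{
	/* true only for the subtraction that reaches zero */
	if (atomic_sub_and_test(nr, &c->remaining))
		counted_free(c);	/* hypothetical final teardown */
}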
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
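XFS_IOLOCK_PARENT encodes a lockdep nesting subclass, which is what lets the child-then-parent iolock order described in the comment pass lockdep even though both locks belong to one class. The same annotation on a plain rw_semaphore, as a generic sketch with illustrative names:

#include <linux/rwsem.h>

enum { SUBCLASS_CHILD = 0, SUBCLASS_PARENT = 1 };

static void
lock_child_then_parent(struct rw_semaphore *child,
		       struct rw_semaphore *parent)
{
	down_write(child);				/* subclass 0 */
	down_write_nested(parent, SUBCLASS_PARENT);	/* distinct subclass */
	/* ... work that needs both held, e.g. AG migration ... */
	up_write(parent);
	up_write(child);
}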
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
293 __s32 bs_extsize; /* extent size */ 293 __s32 bs_extsize; /* extent size */
294 __s32 bs_extents; /* number of extents */ 294 __s32 bs_extents; /* number of extents */
295 __u32 bs_gen; /* generation count */ 295 __u32 bs_gen; /* generation count */
296 __u16 bs_projid; /* project id */ 296 __u16 bs_projid_lo; /* lower part of project id */
297#define bs_projid bs_projid_lo /* (previously just bs_projid) */
297 __u16 bs_forkoff; /* inode fork offset in bytes */ 298 __u16 bs_forkoff; /* inode fork offset in bytes */
298 unsigned char bs_pad[12]; /* pad space, unused */ 299 __u16 bs_projid_hi; /* higher part of project id */
300 unsigned char bs_pad[10]; /* pad space, unused */
299 __u32 bs_dmevmask; /* DMIG event mask */ 301 __u32 bs_dmevmask; /* DMIG event mask */
300 __u16 bs_dmstate; /* DMIG state info */ 302 __u16 bs_dmstate; /* DMIG state info */
301 __u16 bs_aextents; /* attribute number of extents */ 303 __u16 bs_aextents; /* attribute number of extents */
@@ -448,6 +450,7 @@ typedef struct xfs_handle {
448/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ 450/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
449/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 451/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
450#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 452#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
453#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
451 454
452/* 455/*
453 * ioctl commands that replace IRIX syssgi()'s 456 * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
145 return error; 145 return error;
146 dpct = pct - mp->m_sb.sb_imax_pct; 146 dpct = pct - mp->m_sb.sb_imax_pct;
147 error = xfs_read_buf(mp, mp->m_ddev_targp, 147 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0, &bp); 149 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
150 if (error) 150 if (!bp)
151 return error; 151 return EIO;
152 ASSERT(bp);
153 xfs_buf_relse(bp); 152 xfs_buf_relse(bp);
154 153
155 new = nb; /* use new as a temporary here */ 154 new = nb; /* use new as a temporary here */
@@ -375,6 +374,7 @@ xfs_growfs_data_private(
375 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
376 } else 375 } else
377 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
378 378
379 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
380 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
@@ -597,7 +597,8 @@ out:
597 * the extra reserve blocks from the reserve..... 597 * the extra reserve blocks from the reserve.....
598 */ 598 */
599 int error; 599 int error;
600 error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); 600 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
601 fdblks_delta, 0);
601 if (error == ENOSPC) 602 if (error == ENOSPC)
602 goto retry; 603 goto retry;
603 } 604 }
@@ -611,12 +612,13 @@ out:
611 * 612 *
612 * We cannot use an inode here for this - that will push dirty state back up 613 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 614 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 615 * making progress. Hence we log a field in the superblock instead and use a
616 * synchronous transaction to ensure the superblock is immediately unpinned
617 * and can be written back.
615 */ 618 */
616int 619int
617xfs_fs_log_dummy( 620xfs_fs_log_dummy(
618 xfs_mount_t *mp, 621 xfs_mount_t *mp)
619 int flags)
620{ 622{
621 xfs_trans_t *tp; 623 xfs_trans_t *tp;
622 int error; 624 int error;
@@ -631,8 +633,7 @@ xfs_fs_log_dummy(
631 633
632 /* log the UUID because it is an unchanging field */ 634 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 635 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 636 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 637 return xfs_trans_commit(tp, 0);
637} 638}
638 639
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..0626a32c3447 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
212 * to log a whole cluster of inodes instead of all the 212 * to log a whole cluster of inodes instead of all the
213 * individual transactions causing a lot of log traffic. 213 * individual transactions causing a lot of log traffic.
214 */ 214 */
215 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
216 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
217 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
218 uint isize = sizeof(struct xfs_dinode); 218 uint isize = sizeof(struct xfs_dinode);
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
183 cur->bc_rec.i.ir_startino; 183 cur->bc_rec.i.ir_startino;
184} 184}
185 185
186STATIC int
187xfs_inobt_kill_root(
188 struct xfs_btree_cur *cur,
189 struct xfs_buf *bp,
190 int level,
191 union xfs_btree_ptr *newroot)
192{
193 int error;
194
195 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
196 XFS_BTREE_STATS_INC(cur, killroot);
197
198 /*
199 * Update the root pointer, decreasing the level by 1 and then
200 * free the old root.
201 */
202 xfs_inobt_set_root(cur, newroot, -1);
203 error = xfs_inobt_free_block(cur, bp);
204 if (error) {
205 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
206 return error;
207 }
208
209 XFS_BTREE_STATS_INC(cur, free);
210
211 cur->bc_bufs[level] = NULL;
212 cur->bc_nlevels--;
213
214 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
215 return 0;
216}
217
218#ifdef DEBUG 186#ifdef DEBUG
219STATIC int 187STATIC int
220xfs_inobt_keys_inorder( 188xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
309 277
310 .dup_cursor = xfs_inobt_dup_cursor, 278 .dup_cursor = xfs_inobt_dup_cursor,
311 .set_root = xfs_inobt_set_root, 279 .set_root = xfs_inobt_set_root,
312 .kill_root = xfs_inobt_kill_root,
313 .alloc_block = xfs_inobt_alloc_block, 280 .alloc_block = xfs_inobt_alloc_block,
314 .free_block = xfs_inobt_free_block, 281 .free_block = xfs_inobt_free_block,
315 .get_minrecs = xfs_inobt_get_minrecs, 282 .get_minrecs = xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
 48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
 50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
105STATIC void
106xfs_inode_free_callback(
107 struct rcu_head *head)
108{
109 struct inode *inode = container_of(head, struct inode, i_rcu);
110 struct xfs_inode *ip = XFS_I(inode);
111
112 INIT_LIST_HEAD(&inode->i_dentry);
113 kmem_zone_free(xfs_inode_zone, ip);
114}
115
94void 116void
95xfs_inode_free( 117xfs_inode_free(
96 struct xfs_inode *ip) 118 struct xfs_inode *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
136 158
137 kmem_zone_free(xfs_inode_zone, ip); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 171}
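The invalidate-then-call_rcu() protocol above only works because every lookup path revalidates under the same spinlock, as the xfs_iget_cache_hit() and xfs_ifree_cluster() hunks below do. A condensed sketch of both halves with illustrative types; cached_free_cb() is a hypothetical RCU callback:

#include <linux/spinlock.h>
#include <linux/rcupdate.h>

struct cached {
	spinlock_t	lock;
	unsigned long	key;	/* 0 means "being freed", like ip->i_ino */
	struct rcu_head	rcu;
};

/* free side: invalidate the key before the grace period starts */
static void
cached_free(struct cached *c)
{
	spin_lock(&c->lock);
	c->key = 0;		/* lookups must reject us from now on */
	spin_unlock(&c->lock);
	call_rcu(&c->rcu, cached_free_cb);	/* hypothetical callback */
}

/* lookup side: a stale pointer is legal until the grace period ends */
static bool
cached_valid(struct cached *c, unsigned long key)
{
	bool ok;

	spin_lock(&c->lock);
	ok = (c->key == key);	/* freed or reallocated otherwise */
	spin_unlock(&c->lock);
	return ok;
}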
139 172
140/* 173/*
@@ -144,14 +177,29 @@ static int
144xfs_iget_cache_hit( 177xfs_iget_cache_hit(
145 struct xfs_perag *pag, 178 struct xfs_perag *pag,
146 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
147 int flags, 181 int flags,
148 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
149{ 183{
150 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
151 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
152 int error; 186 int error;
153 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
154 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
155 203
156 /* 204 /*
157 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
194 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
195 243
196 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
197 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
198 246
199 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
200 if (error) { 248 if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
202 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
203 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
204 */ 252 */
205 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
206 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
207 255
208 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
212 goto out_error; 260 goto out_error;
213 } 261 }
214 262
215 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
216 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
217 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
218 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
219 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
221 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
222 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
223 } else { 277 } else {
224 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
225 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
230 284
231 /* We've got a live one. */ 285 /* We've got a live one. */
232 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
233 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
234 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
235 } 289 }
236 290
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
244 298
245out_error: 299out_error:
246 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
247 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
248 return error; 302 return error;
249} 303}
250 304
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
297 BUG(); 351 BUG();
298 } 352 }
299 353
300 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
301 355
302 /* insert the new inode */ 356 /* insert the new inode */
303 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
312 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
313 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
314 368
315 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
316 radix_tree_preload_end(); 370 radix_tree_preload_end();
317 371
318 *ipp = ip; 372 *ipp = ip;
319 return 0; 373 return 0;
320 374
321out_preload_end: 375out_preload_end:
322 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 377 radix_tree_preload_end();
324 if (lock_flags) 378 if (lock_flags)
325 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -365,8 +419,8 @@ xfs_iget(
365 xfs_perag_t *pag; 419 xfs_perag_t *pag;
366 xfs_agino_t agino; 420 xfs_agino_t agino;
367 421
368 /* the radix tree exists only in inode capable AGs */ 422 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 424 return EINVAL;
371 425
372 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
375 429
376again: 430again:
377 error = 0; 431 error = 0;
378 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
379 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
380 434
381 if (ip) { 435 if (ip) {
382 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
383 if (error) 437 if (error)
384 goto out_error_or_again; 438 goto out_error_or_again;
385 } else { 439 } else {
386 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
387 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
388 442
389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..be7cf625421f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -660,7 +660,8 @@ xfs_dinode_from_disk(
660 to->di_uid = be32_to_cpu(from->di_uid); 660 to->di_uid = be32_to_cpu(from->di_uid);
661 to->di_gid = be32_to_cpu(from->di_gid); 661 to->di_gid = be32_to_cpu(from->di_gid);
662 to->di_nlink = be32_to_cpu(from->di_nlink); 662 to->di_nlink = be32_to_cpu(from->di_nlink);
663 to->di_projid = be16_to_cpu(from->di_projid); 663 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
664 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
664 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 665 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
665 to->di_flushiter = be16_to_cpu(from->di_flushiter); 666 to->di_flushiter = be16_to_cpu(from->di_flushiter);
666 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 667 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +696,8 @@ xfs_dinode_to_disk(
695 to->di_uid = cpu_to_be32(from->di_uid); 696 to->di_uid = cpu_to_be32(from->di_uid);
696 to->di_gid = cpu_to_be32(from->di_gid); 697 to->di_gid = cpu_to_be32(from->di_gid);
697 to->di_nlink = cpu_to_be32(from->di_nlink); 698 to->di_nlink = cpu_to_be32(from->di_nlink);
698 to->di_projid = cpu_to_be16(from->di_projid); 699 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
700 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
699 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 701 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
700 to->di_flushiter = cpu_to_be16(from->di_flushiter); 702 to->di_flushiter = cpu_to_be16(from->di_flushiter);
701 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 703 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -874,7 +876,7 @@ xfs_iread(
874 if (ip->i_d.di_version == 1) { 876 if (ip->i_d.di_version == 1) {
875 ip->i_d.di_nlink = ip->i_d.di_onlink; 877 ip->i_d.di_nlink = ip->i_d.di_onlink;
876 ip->i_d.di_onlink = 0; 878 ip->i_d.di_onlink = 0;
877 ip->i_d.di_projid = 0; 879 xfs_set_projid(ip, 0);
878 } 880 }
879 881
880 ip->i_delayed_blks = 0; 882 ip->i_delayed_blks = 0;
@@ -885,7 +887,7 @@ xfs_iread(
885 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
886 * meta-data in-core longer. 888 * meta-data in-core longer.
887 */ 889 */
888 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
889 891
890 /* 892 /*
891 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -982,8 +984,7 @@ xfs_ialloc(
982 mode_t mode, 984 mode_t mode,
983 xfs_nlink_t nlink, 985 xfs_nlink_t nlink,
984 xfs_dev_t rdev, 986 xfs_dev_t rdev,
985 cred_t *cr, 987 prid_t prid,
986 xfs_prid_t prid,
987 int okalloc, 988 int okalloc,
988 xfs_buf_t **ialloc_context, 989 xfs_buf_t **ialloc_context,
989 boolean_t *call_again, 990 boolean_t *call_again,
@@ -1027,7 +1028,7 @@ xfs_ialloc(
1027 ASSERT(ip->i_d.di_nlink == nlink); 1028 ASSERT(ip->i_d.di_nlink == nlink);
1028 ip->i_d.di_uid = current_fsuid(); 1029 ip->i_d.di_uid = current_fsuid();
1029 ip->i_d.di_gid = current_fsgid(); 1030 ip->i_d.di_gid = current_fsgid();
1030 ip->i_d.di_projid = prid; 1031 xfs_set_projid(ip, prid);
1031 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1032 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1032 1033
1033 /* 1034 /*
@@ -1999,17 +2000,33 @@ xfs_ifree_cluster(
1999 */ 2000 */
2000 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2001retry: 2002retry:
2002 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2003 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2004 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2005 2006
2006 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2007 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2008 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2009 continue; 2010 continue;
2010 } 2011 }
2011 2012
2012 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2013 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2014 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2015 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2018,11 +2035,11 @@ retry:
2018 */ 2035 */
2019 if (ip != free_ip && 2036 if (ip != free_ip &&
2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2021 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2022 delay(1); 2039 delay(1);
2023 goto retry; 2040 goto retry;
2024 } 2041 }
2025 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2026 2043
2027 xfs_iflock(ip); 2044 xfs_iflock(ip);
2028 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
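Both RCU conversions in this patch follow the same shape: an unlocked radix tree lookup under rcu_read_lock(), then revalidation of the candidate inode under its i_flags_lock before anything trusts it. Condensed to the bare idiom (an illustrative sketch of the code above; target_ino stands in for inum + i):

        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root,
                               XFS_INO_TO_AGINO(mp, target_ino));
        if (!ip) {
                rcu_read_unlock();      /* not in cache: nothing to do */
                continue;
        }

        /*
         * The inode may have been freed, or freed and reallocated, between
         * the lookup and here; only i_flags_lock makes the identity check
         * and the stale check stable.
         */
        spin_lock(&ip->i_flags_lock);
        if (ip->i_ino != target_ino || __xfs_iflags_test(ip, XFS_ISTALE)) {
                spin_unlock(&ip->i_flags_lock);
                rcu_read_unlock();      /* wrong, recycled or dying inode */
                continue;
        }
        spin_unlock(&ip->i_flags_lock);
        /* ... take the real inode locks, then drop rcu_read_lock() ... */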
@@ -2628,7 +2645,7 @@ xfs_iflush_cluster(
2628 2645
2629 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2630 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2631 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2632 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2633 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2634 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2639,9 +2656,21 @@ xfs_iflush_cluster(
2639 iq = ilist[i]; 2656 iq = ilist[i];
2640 if (iq == ip) 2657 if (iq == ip)
2641 continue; 2658 continue;
2642 /* if the inode lies outside this cluster, we're done. */ 2659
2643 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2644 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
 2666 spin_lock(&iq->i_flags_lock);
 2667 if (!iq->i_ino ||
 2668 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
 2669 spin_unlock(&iq->i_flags_lock);
 2670 continue;
 2671 }
 2672 spin_unlock(&iq->i_flags_lock);
2673
2645 /* 2674 /*
2646 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2647 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2691,7 +2720,7 @@ xfs_iflush_cluster(
2691 } 2720 }
2692 2721
2693out_free: 2722out_free:
2694 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2695 kmem_free(ilist); 2724 kmem_free(ilist);
2696out_put: 2725out_put:
2697 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2703,7 +2732,7 @@ cluster_corrupt_out:
2703 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2704 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2705 */ 2734 */
2706 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2707 /* 2736 /*
2708 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2709 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
@@ -2725,7 +2754,7 @@ cluster_corrupt_out:
2725 XFS_BUF_UNDONE(bp); 2754 XFS_BUF_UNDONE(bp);
2726 XFS_BUF_STALE(bp); 2755 XFS_BUF_STALE(bp);
2727 XFS_BUF_ERROR(bp,EIO); 2756 XFS_BUF_ERROR(bp,EIO);
2728 xfs_biodone(bp); 2757 xfs_buf_ioend(bp, 0);
2729 } else { 2758 } else {
2730 XFS_BUF_STALE(bp); 2759 XFS_BUF_STALE(bp);
2731 xfs_buf_relse(bp); 2760 xfs_buf_relse(bp);
@@ -3008,7 +3037,7 @@ xfs_iflush_int(
3008 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3037 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3009 memset(&(dip->di_pad[0]), 0, 3038 memset(&(dip->di_pad[0]), 0,
3010 sizeof(dip->di_pad)); 3039 sizeof(dip->di_pad));
3011 ASSERT(ip->i_d.di_projid == 0); 3040 ASSERT(xfs_get_projid(ip) == 0);
3012 } 3041 }
3013 } 3042 }
3014 3043
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..5c95fa8ec11d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
134 __uint32_t di_uid; /* owner's user id */ 134 __uint32_t di_uid; /* owner's user id */
135 __uint32_t di_gid; /* owner's group id */ 135 __uint32_t di_gid; /* owner's group id */
136 __uint32_t di_nlink; /* number of links to file */ 136 __uint32_t di_nlink; /* number of links to file */
137 __uint16_t di_projid; /* owner's project id */ 137 __uint16_t di_projid_lo; /* lower part of owner's project id */
138 __uint8_t di_pad[8]; /* unused, zeroed space */ 138 __uint16_t di_projid_hi; /* higher part of owner's project id */
139 __uint8_t di_pad[6]; /* unused, zeroed space */
139 __uint16_t di_flushiter; /* incremented on flush */ 140 __uint16_t di_flushiter; /* incremented on flush */
140 xfs_ictimestamp_t di_atime; /* time last accessed */ 141 xfs_ictimestamp_t di_atime; /* time last accessed */
141 xfs_ictimestamp_t di_mtime; /* time last modified */ 142 xfs_ictimestamp_t di_mtime; /* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
212#ifdef __KERNEL__ 213#ifdef __KERNEL__
213 214
214struct bhv_desc; 215struct bhv_desc;
215struct cred;
216struct xfs_buf; 216struct xfs_buf;
217struct xfs_bmap_free; 217struct xfs_bmap_free;
218struct xfs_bmbt_irec; 218struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
335} 335}
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only
 339 * and using two 16bit values to hold the new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems).
341 */
342static inline prid_t
343xfs_get_projid(struct xfs_inode *ip)
344{
345 return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
346}
347
348static inline void
349xfs_set_projid(struct xfs_inode *ip,
350 prid_t projid)
351{
352 ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
353 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
354}
355
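The two helpers are exact inverses, so a 32-bit project id survives the hi/lo split unchanged; keeping the low half in the old di_projid slot (with di_pad shrinking from 8 to 6 bytes to make room for the high half) is what preserves the on-disk layout. A minimal standalone round-trip check, with plain C99 types standing in for prid_t and __uint16_t:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t projid = 0x00012345;              /* example 32-bit project id */
        uint16_t hi = (uint16_t)(projid >> 16);    /* di_projid_hi: 0x0001 */
        uint16_t lo = (uint16_t)(projid & 0xffff); /* di_projid_lo: 0x2345 */

        /* xfs_get_projid() equivalent: recombine the two halves */
        assert(((uint32_t)hi << 16 | lo) == projid);
        return 0;
}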
356/*
338 * Manage the i_flush queue embedded in the inode. This completion 357 * Manage the i_flush queue embedded in the inode. This completion
339 * queue synchronizes processes attempting to flush the in-core 358 * queue synchronizes processes attempting to flush the in-core
340 * inode back to disk. 359 * inode back to disk.
@@ -357,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
357/* 376/*
358 * In-core inode flags. 377 * In-core inode flags.
359 */ 378 */
360#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
361#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
362#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
363#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
364#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
365#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
366 386
367/* 387/*
368 * Flags for inode locking. 388 * Flags for inode locking.
@@ -419,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
419#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
422/* 444/*
423 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
424 */ 446 */
@@ -456,8 +478,8 @@ void xfs_inode_free(struct xfs_inode *ip);
456 * xfs_inode.c prototypes. 478 * xfs_inode.c prototypes.
457 */ 479 */
458int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 480int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
459 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 481 xfs_nlink_t, xfs_dev_t, prid_t, int,
460 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 482 struct xfs_buf **, boolean_t *, xfs_inode_t **);
461 483
462uint xfs_ip2xflags(struct xfs_inode *); 484uint xfs_ip2xflags(struct xfs_inode *);
463uint xfs_dic2xflags(struct xfs_dinode *); 485uint xfs_dic2xflags(struct xfs_dinode *);
@@ -471,7 +493,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
471void xfs_iext_realloc(xfs_inode_t *, int, int); 493void xfs_iext_realloc(xfs_inode_t *, int, int);
472void xfs_iunpin_wait(xfs_inode_t *); 494void xfs_iunpin_wait(xfs_inode_t *);
473int xfs_iflush(xfs_inode_t *, uint); 495int xfs_iflush(xfs_inode_t *, uint);
474void xfs_ichgtime(xfs_inode_t *, int);
475void xfs_lock_inodes(xfs_inode_t **, int, uint); 496void xfs_lock_inodes(xfs_inode_t **, int, uint);
476void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 497void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
477 498
@@ -482,7 +503,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *);
482#define IHOLD(ip) \ 503#define IHOLD(ip) \
483do { \ 504do { \
484 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 505 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
485 atomic_inc(&(VFS_I(ip)->i_count)); \ 506 ihold(VFS_I(ip)); \
486 trace_xfs_ihold(ip, _THIS_IP_); \ 507 trace_xfs_ihold(ip, _THIS_IP_); \
487} while (0) 508} while (0)
488 509
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..fd4f398bd6f1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -223,15 +223,6 @@ xfs_inode_item_format(
223 nvecs = 1; 223 nvecs = 1;
224 224
225 /* 225 /*
226 * Make sure the linux inode is dirty. We do this before
227 * clearing i_update_core as the VFS will call back into
228 * XFS here and set i_update_core, so we need to dirty the
229 * inode first so that the ordering of i_update_core and
230 * unlogged modifications still works as described below.
231 */
232 xfs_mark_inode_dirty_sync(ip);
233
234 /*
235 * Clear i_update_core if the timestamps (or any other 226 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 227 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 228 * and we're about to log them with the rest of the core.
@@ -666,18 +657,37 @@ xfs_inode_item_unlock(
666} 657}
667 658
668/* 659/*
669 * This is called to find out where the oldest active copy of the 660 * This is called to find out where the oldest active copy of the inode log
670 * inode log item in the on disk log resides now that the last log 661 * item in the on disk log resides now that the last log write of it completed
671 * write of it completed at the given lsn. Since we always re-log 662 * at the given lsn. Since we always re-log all dirty data in an inode, the
672 * all dirty data in an inode, the latest copy in the on disk log 663 * latest copy in the on disk log is the only one that matters. Therefore,
673 * is the only one that matters. Therefore, simply return the 664 * simply return the given lsn.
674 * given lsn. 665 *
666 * If the inode has been marked stale because the cluster is being freed, we
667 * don't want to (re-)insert this inode into the AIL. There is a race condition
668 * where the cluster buffer may be unpinned before the inode is inserted into
669 * the AIL during transaction committed processing. If the buffer is unpinned
670 * before the inode item has been committed and inserted, then it is possible
 671 * for the buffer to be written and its IO to complete before the inode is inserted
672 * into the AIL. In that case, we'd be inserting a clean, stale inode into the
673 * AIL which will never get removed. It will, however, get reclaimed which
 674 * triggers an assert in xfs_inode_free() complaining about freeing an inode
675 * still in the AIL.
676 *
677 * To avoid this, return a lower LSN than the one passed in so that the
678 * transaction committed code will not move the inode forward in the AIL but
679 * will still unpin it properly.
675 */ 680 */
676STATIC xfs_lsn_t 681STATIC xfs_lsn_t
677xfs_inode_item_committed( 682xfs_inode_item_committed(
678 struct xfs_log_item *lip, 683 struct xfs_log_item *lip,
679 xfs_lsn_t lsn) 684 xfs_lsn_t lsn)
680{ 685{
686 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
687 struct xfs_inode *ip = iip->ili_inode;
688
689 if (xfs_iflags_test(ip, XFS_ISTALE))
690 return lsn - 1;
681 return lsn; 691 return lsn;
682} 692}
683 693
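The lsn - 1 return works because transaction-committed processing only moves an item forward in the AIL when the returned LSN is newer than the item's current position. A simplified model of that caller (a hypothetical shape inferred from the comment above, not the actual xfs_trans code):

        /* after the commit record reaches the log ... */
        item_lsn = IOP_COMMITTED(lip, commit_lsn);  /* lsn - 1 if stale */

        spin_lock(&ailp->xa_lock);
        if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
                xfs_trans_ail_update(ailp, lip, item_lsn); /* move forward */
        else
                spin_unlock(&ailp->xa_lock);    /* stale: leave the AIL alone */
        IOP_UNPIN(lip);                         /* the unpin still happens */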
@@ -832,15 +842,64 @@ xfs_inode_item_destroy(
832 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
833 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
834 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
847 * list for other inodes that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
835 */ 850 */
836void 851void
837xfs_iflush_done( 852xfs_iflush_done(
838 struct xfs_buf *bp, 853 struct xfs_buf *bp,
839 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
840{ 855{
841 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
842 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
843 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
 870 if (blip->li_cb != xfs_iflush_done) {
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
844 903
845 /* 904 /*
846 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -851,28 +910,37 @@ xfs_iflush_done(
851 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
852 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
853 */ 912 */
854 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
855 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
856 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
857 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
858 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
859 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
860 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
861 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
862 } 927 }
863 928
864 iip->ili_logged = 0;
865 929
866 /* 930 /*
 867 * Clear the ili_last_fields bits now that we know that the 931 * clean up and unlock the flush lock now that we are done. We can clear the
868 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
869 */ 934 */
870 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
871 938
872 /* 939 iip = INODE_ITEM(blip);
873 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
874 */ 941 iip->ili_last_fields = 0;
875 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
876} 944}
877 945
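The walk above unlinks every inode-flush completion from the buffer's singly linked li_bio_list and collects it on the current log item, so a single AIL traversal can retire all of them. A standalone model of that unlink-and-collect pass (generic node type, not the xfs_log_item layout):

#include <assert.h>
#include <stddef.h>

struct item {
        struct item *next;
        int matches;            /* stands in for li_cb == xfs_iflush_done */
};

/* Unlink every matching item from *headp and collect it on a private list. */
static struct item *steal_matching(struct item **headp)
{
        struct item *it = *headp, *prev = NULL, *stolen = NULL;

        while (it) {
                struct item *next = it->next;

                if (!it->matches) {
                        prev = it;              /* leave it on the buffer list */
                } else {
                        if (prev)               /* unlink from the buffer list */
                                prev->next = next;
                        else
                                *headp = next;
                        it->next = stolen;      /* push onto the private list */
                        stolen = it;
                }
                it = next;
        }
        return stolen;
}

int main(void)
{
        struct item c = { NULL, 1 }, b = { &c, 0 }, a = { &b, 1 };
        struct item *head = &a;
        struct item *got = steal_matching(&head);

        assert(head == &b && b.next == NULL);   /* only b stays behind */
        assert(got == &c && got->next == &a);   /* c and a collected */
        return 0;
}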
878/* 946/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..8a0f044750c3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
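To make the scaling concrete, here is a standalone model of the sizing logic: preallocation starts at the largest power of two at or below the file size and is throttled as free space falls through the low-space thresholds. The free-space percentage and the omitted MAXEXTLEN cap are simplifications, not the mount-structure fields:

#include <stdint.h>
#include <stdio.h>

/* largest power of two <= v, for v >= 1 (models rounddown_pow_of_two()) */
static uint64_t rounddown_pow2(uint64_t v)
{
        uint64_t r = 1;

        while (r * 2 <= v)
                r *= 2;
        return r;
}

int main(void)
{
        uint64_t isize_blocks = 200000; /* i_size in fs blocks (example) */
        double free_pct = 3.0;          /* free space left in the filesystem */
        uint64_t alloc_blocks = rounddown_pow2(isize_blocks + 1); /* 131072 */
        int shift = 0;

        /* first threshold quarters the preallocation, each one after halves it */
        if (free_pct < 5) shift = 2;
        if (free_pct < 4) shift++;
        if (free_pct < 3) shift++;
        if (free_pct < 2) shift++;
        if (free_pct < 1) shift++;

        alloc_blocks >>= shift;         /* 131072 >> 3 = 16384 blocks */
        printf("preallocate %llu blocks\n", (unsigned long long)alloc_blocks);
        return 0;
}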
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
 505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
 447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -523,8 +467,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -680,7 +619,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..dc1882adaf54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
92 * further change. 92 * further change.
93 */ 93 */
94 buf->bs_nlink = dic->di_nlink; 94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid; 95 buf->bs_projid_lo = dic->di_projid_lo;
96 buf->bs_projid_hi = dic->di_projid_hi;
96 buf->bs_ino = ino; 97 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode; 98 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid; 99 buf->bs_uid = dic->di_uid;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138 132
139 log->l_grant_reserve_bytes -= bytes; 133 do {
140 if ((log)->l_grant_reserve_bytes < 0) { 134 int tmp;
141 log->l_grant_reserve_bytes += log->l_logsize; 135 int cycle, space;
142 log->l_grant_reserve_cycle--;
143 }
144 136
145} 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
146 138
147static void 139 tmp = log->l_logsize - space;
148xlog_grant_add_space_write(struct log *log, int bytes) 140 if (tmp > bytes)
149{ 141 space += bytes;
150 int tmp = log->l_logsize - log->l_grant_write_bytes; 142 else {
151 if (tmp > bytes) 143 space = bytes - tmp;
152 log->l_grant_write_bytes += bytes; 144 cycle++;
153 else { 145 }
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158
159static void
160xlog_grant_add_space_reserve(struct log *log, int bytes)
161{
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes;
163 if (tmp > bytes)
164 log->l_grant_reserve_bytes += bytes;
165 else {
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
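Both loops above depend on the grant head packing its cycle and byte offset into a single 64-bit value so that atomic64_cmpxchg() can update the pair in one shot, retrying on contention instead of taking the old grant lock. A standalone model using C11 atomics; the high-32/low-32 packing is an assumption read off the crack/assign helper names:

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define LOGSIZE 1048576                 /* example log size in bytes */

/* assumed packing: cycle in the high 32 bits, space in the low 32 */
static int64_t assign_head(int cycle, int space)
{
        return ((int64_t)cycle << 32) | (uint32_t)space;
}

static void crack_head(int64_t val, int *cycle, int *space)
{
        *cycle = (int)(val >> 32);
        *space = (int)(val & 0xffffffff);
}

/* lockless "subtract bytes, borrow a cycle on underflow" update loop */
static void grant_sub_space(_Atomic int64_t *head, int bytes)
{
        int64_t head_val = atomic_load(head);
        int64_t expected;

        do {
                int cycle, space;

                crack_head(head_val, &cycle, &space);
                space -= bytes;
                if (space < 0) {        /* wrapped past the start of the log */
                        space += LOGSIZE;
                        cycle--;
                }
                expected = head_val;
                /* on failure, head_val is reloaded and we recompute */
                atomic_compare_exchange_strong(head, &head_val,
                                               assign_head(cycle, space));
        } while (head_val != expected);
}

int main(void)
{
        _Atomic int64_t head = assign_head(5, 100);
        int cycle, space;

        grant_sub_space(&head, 200);    /* 100 - 200 wraps */
        crack_head(atomic_load(&head), &cycle, &space);
        assert(cycle == 4 && space == LOGSIZE - 100);
        return 0;
}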
178static void 153static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
406 mp->m_fsname); 381 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 383 }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
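sv_wait() atomically queued the sleeper and dropped the spinlock; its replacement must keep the same queue-then-unlock ordering, or a wakeup arriving between the unlock and the sleep would be lost. A plausible shape for the xlog_wait() helper used here (an assumption based on the call sites; the helper itself is not part of this hunk):

static inline void
xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue_exclusive(wq, &wait);    /* queue before unlocking */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        spin_unlock(lock);                      /* no lost-wakeup window now */
        schedule();
        remove_wait_queue(wq, &wait);
}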
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717 688
718 spin_lock(&log->l_grant_lock); 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
719 690 if (tail_lsn != 1)
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
721 * tail_lsn.
722 */
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
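The three live cases above, worked through in a standalone model with a 1 MB log (the fourth case, head behind tail, is treated as corruption and falls back to the full log size):

#include <stdio.h>

/* bytes of log space free between the tail and the given head */
static int space_left(int logsize, int tail_cycle, int tail_bytes,
                      int head_cycle, int head_bytes)
{
        if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
                return logsize - (head_bytes - tail_bytes); /* same lap */
        if (tail_cycle + 1 < head_cycle)
                return 0;                       /* head lapped the tail */
        return tail_bytes - head_bytes;         /* head exactly one lap ahead */
}

int main(void)
{
        int L = 1 << 20;                        /* 1 MB log */

        /* same cycle: head 60 KB past the tail, so L - 60 KB is free */
        printf("%d\n", space_left(L, 5, 4096, 5, 65536));       /* 987136 */
        /* head one cycle ahead: only the gap back to the tail is free */
        printf("%d\n", space_left(L, 5, 65536, 6, 4096));       /* 61440 */
        return 0;
}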
898 865
899 866
900/* 867/*
@@ -917,19 +884,6 @@ xlog_iodone(xfs_buf_t *bp)
917 l = iclog->ic_log; 884 l = iclog->ic_log;
918 885
919 /* 886 /*
920 * If the _XFS_BARRIER_FAILED flag was set by a lower
921 * layer, it means the underlying device no longer supports
922 * barrier I/O. Warn loudly and turn off barriers.
923 */
924 if (bp->b_flags & _XFS_BARRIER_FAILED) {
925 bp->b_flags &= ~_XFS_BARRIER_FAILED;
926 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
927 xfs_fs_cmn_err(CE_WARN, l->l_mp,
928 "xlog_iodone: Barriers are no longer supported"
929 " by device. Disabling barriers\n");
930 }
931
932 /*
933 * Race to shutdown the filesystem if we see an error. 887 * Race to shutdown the filesystem if we see an error.
934 */ 888 */
935 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 889 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1060,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1060 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1061 1015
1062 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1063 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1064 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1065 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1066 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1067 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1068 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1069 1027
1070 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1071 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1107,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1107 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1108 1066
1109 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1110 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1111 sv_init(&log->l_flush_wait, 0, "flush_wait");
1112 1069
1113 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1114 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1131,7 +1088,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1131 iclog->ic_prev = prev_iclog; 1088 iclog->ic_prev = prev_iclog;
1132 prev_iclog = iclog; 1089 prev_iclog = iclog;
1133 1090
1134 bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); 1091 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1092 log->l_iclog_size, 0);
1135 if (!bp) 1093 if (!bp)
1136 goto out_free_iclog; 1094 goto out_free_iclog;
1137 if (!XFS_BUF_CPSEMA(bp)) 1095 if (!XFS_BUF_CPSEMA(bp))
@@ -1163,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1163 1121
1164 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1165 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1166 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1167 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1168 1126
1169 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1170 } 1128 }
@@ -1179,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1179out_free_iclog: 1137out_free_iclog:
1180 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1181 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1182 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1183 sv_destroy(&iclog->ic_force_wait);
1184 sv_destroy(&iclog->ic_write_wait);
1185 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1186 }
1187 kmem_free(iclog); 1142 kmem_free(iclog);
1188 } 1143 }
1189 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1190 spinlock_destroy(&log->l_grant_lock);
1191 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1192out_free_log: 1146out_free_log:
1193 kmem_free(log); 1147 kmem_free(log);
@@ -1235,61 +1189,60 @@ xlog_commit_record(
1235 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1236 */ 1190 */
1237STATIC void 1191STATIC void
1238xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1239 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1240{ 1195{
1241 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1242 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1243 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1244 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1245 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1246 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1247 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1248 int free_threshold; 1203
1249 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1250 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1251 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1252 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1253 free_bytes = xlog_space_left(log, 1208
1254 log->l_grant_reserve_cycle, 1209 /*
1255 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1256 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1257 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1258 1213 */
1259 /* 1214 free_threshold = BTOBB(need_bytes);
1260 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1261 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1262 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1263 */ 1218 return;
1264 free_threshold = BTOBB(need_bytes); 1219
1265 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1266 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1267 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1268 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1269 threshold_cycle = CYCLE_LSN(tail_lsn);
1270 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1271 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1272 threshold_cycle += 1; 1225 threshold_cycle += 1;
1273 } 1226 }
1274 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1275 1237
1276 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1277 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1278 */ 1242 */
1279 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1280 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1281 } 1245}
1282 spin_unlock(&log->l_grant_lock);
1283
1284 /*
1285 * Get the transaction layer to kick the dirty buffers out to
1286 * disk asynchronously. No point in trying to do this if
1287 * the filesystem is shutting down.
1288 */
1289 if (threshold_lsn &&
1290 !XLOG_FORCED_SHUTDOWN(log))
1291 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1292} /* xlog_grant_push_ail */
1293 1246
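The wrap-around arithmetic in xlog_grant_push_ail() is easy to misread in diff form. Below is a minimal standalone sketch of the same computation; the function, values, and printf output are illustrative only and are not part of the patch.

#include <stdio.h>

/*
 * Standalone sketch of xlog_grant_push_ail()'s threshold arithmetic.
 * The log is a circular buffer of log_size blocks; pushing the tail
 * block forward by free_threshold may wrap past the physical end, in
 * which case the block index wraps and the cycle count advances.
 */
static void push_target(int log_size, int tail_cycle, int tail_block,
			int free_threshold)
{
	int threshold_cycle = tail_cycle;
	int threshold_block = tail_block + free_threshold;

	if (threshold_block >= log_size) {
		threshold_block -= log_size;	/* wrapped past the end */
		threshold_cycle += 1;		/* so we are one cycle on */
	}
	printf("push AIL to cycle %d, block %d\n",
	       threshold_cycle, threshold_block);
}

int main(void)
{
	push_target(1000, 5, 900, 250);	/* wraps: cycle 6, block 150 */
	return 0;
}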
1294/* 1247/*
1295 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
@@ -1309,7 +1262,7 @@ xlog_bdstrat(
1309 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1262 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1310 XFS_BUF_ERROR(bp, EIO); 1263 XFS_BUF_ERROR(bp, EIO);
1311 XFS_BUF_STALE(bp); 1264 XFS_BUF_STALE(bp);
1312 xfs_biodone(bp); 1265 xfs_buf_ioend(bp, 0);
1313 /* 1266 /*
1314 * It would seem logical to return EIO here, but we rely on 1267 * It would seem logical to return EIO here, but we rely on
1315 * the log state machine to propagate I/O errors instead of 1268 * the log state machine to propagate I/O errors instead of
@@ -1384,9 +1337,8 @@ xlog_sync(xlog_t *log,
1384 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1385 1338
1386 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1387 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1388 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1389 spin_unlock(&log->l_grant_lock);
1390 1342
1391 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1392 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
@@ -1501,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1501 1453
1502 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1503 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1504 sv_destroy(&iclog->ic_force_wait);
1505 sv_destroy(&iclog->ic_write_wait);
1506 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1507 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1508 kmem_free(iclog); 1458 kmem_free(iclog);
1509 iclog = next_iclog; 1459 iclog = next_iclog;
1510 } 1460 }
1511 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1512 spinlock_destroy(&log->l_grant_lock);
1513 1462
1514 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1515 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2244,7 +2193,7 @@ xlog_state_do_callback(
2244 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2245 if (lowest_lsn && 2194 if (lowest_lsn &&
2246 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2247 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2248 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2249 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2250 * another thread */ 2199 * another thread */
@@ -2252,23 +2201,21 @@ xlog_state_do_callback(
2252 2201
2253 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2254 2203
2255 spin_unlock(&log->l_icloglock);
2256 2204
2257 /* l_last_sync_lsn field protected by 2205 /*
2258 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2259 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2260 */ 2209 */
2261 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2262 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2263 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2264 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2265 be64_to_cpu(iclog->ic_header.h_lsn);
2266 spin_unlock(&log->l_grant_lock);
2267 2214
2268 } else { 2215 } else
2269 spin_unlock(&log->l_icloglock);
2270 ioerrors++; 2216 ioerrors++;
2271 } 2217
2218 spin_unlock(&log->l_icloglock);
2272 2219
2273 /* 2220 /*
2274 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2309,7 +2256,7 @@ xlog_state_do_callback(
2309 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2310 2257
2311 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2312 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2313 2260
2314 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2315 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
@@ -2356,7 +2303,7 @@ xlog_state_do_callback(
2356 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2357 2304
2358 if (wake) 2305 if (wake)
2359 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2360} 2307}
2361 2308
2362 2309
@@ -2407,7 +2354,7 @@ xlog_state_done_syncing(
2407 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2408 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2409 */ 2356 */
2410 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2411 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2412 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2413} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2456,7 +2403,7 @@ restart:
2456 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2457 2404
2458 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2459 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2460 goto restart; 2407 goto restart;
2461 } 2408 }
2462 2409
@@ -2539,6 +2486,18 @@ restart:
2539 * 2486 *
2540 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2541 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2542 */ 2501 */
2543STATIC int 2502STATIC int
2544xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2546,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2546{ 2505{
2547 int free_bytes; 2506 int free_bytes;
2548 int need_bytes; 2507 int need_bytes;
2549#ifdef DEBUG
2550 xfs_lsn_t tail_lsn;
2551#endif
2552
2553 2508
2554#ifdef DEBUG 2509#ifdef DEBUG
2555 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2556 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2557#endif 2512#endif
2558 2513
2559 /* Is there space or do we need to sleep? */
2560 spin_lock(&log->l_grant_lock);
2561
2562 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2563 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2564 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2565 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2566 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2567 2529
2568 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2569 2531
@@ -2575,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2575 goto error_return; 2537 goto error_return;
2576 2538
2577 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2578 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2579 /* 2542 /*
2580 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2581 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2582 */ 2545 */
2583 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2584 spin_lock(&log->l_grant_lock);
2585 } 2547 }
2586 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2587 need_bytes = tic->t_unit_res*tic->t_ocnt;
2588 else
2589 need_bytes = tic->t_unit_res;
2590 2548
2591redo: 2549redo:
2592 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2593 goto error_return; 2551 goto error_return_unlocked;
2594 2552
2595 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2596 log->l_grant_reserve_bytes);
2597 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2598 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2599 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2600 2558
2601 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2602 2560
2603 spin_unlock(&log->l_grant_lock);
2604 xlog_grant_push_ail(log->l_mp, need_bytes);
2605 spin_lock(&log->l_grant_lock);
2606
2607 XFS_STATS_INC(xs_sleep_logspace);
2608 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2609
2610 spin_lock(&log->l_grant_lock);
2611 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2612 goto error_return; 2562 goto error_return;
2613 2563
2614 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2615 2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2568
2569 trace_xfs_log_grant_wake2(log, tic);
2616 goto redo; 2570 goto redo;
2617 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2618 xlog_del_ticketq(&log->l_reserve_headq, tic);
2619 2572
2620 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2621 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2622#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2623 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2624 /*
2626 * Check to make sure the grant write head didn't just overlap the
2626 * tail. If the cycles are the same, we can't be overlapping.
2627 * Otherwise, make sure that the cycles differ by exactly one and
2628 * check the byte count.
2629 */
2630 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2631 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2632 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2633 } 2577 }
2634#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2635 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2636 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2637 spin_unlock(&log->l_grant_lock);
2638 return 0; 2584 return 0;
2639 2585
2640 error_return: 2586error_return_unlocked:
2641 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2642 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2643 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2644 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2645 2592
2646 /* 2593 /*
@@ -2650,7 +2597,6 @@ redo:
2650 */ 2597 */
2651 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2652 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2653 spin_unlock(&log->l_grant_lock);
2654 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2655} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
2656 2602
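The fast-path structure described in the comment block before xlog_grant_log_space() is the core of this hunk. The fragment below condenses it, assuming the same helpers the patch introduces (xlog_space_left(), xlog_wait(), xlog_grant_add_space()); it drops the tracing, shutdown checks, and FIFO ordering, so treat it as a sketch of the locking pattern, not the committed function.

static int reserve_space_sketch(struct log *log, struct xlog_ticket *tic,
				int need_bytes)
{
	for (;;) {
		/* lock-free fast path: enough space, no queue traffic */
		if (xlog_space_left(log, &log->l_grant_reserve_head) >=
		    need_bytes)
			break;

		spin_lock(&log->l_grant_reserve_lock);
		/* recheck under the lock so a racing wakeup isn't missed */
		if (xlog_space_left(log, &log->l_grant_reserve_head) >=
		    need_bytes) {
			spin_unlock(&log->l_grant_reserve_lock);
			break;
		}
		if (list_empty(&tic->t_queue))
			list_add_tail(&tic->t_queue, &log->l_reserveq);

		/* xlog_wait() drops the lock once we are safely queued */
		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
	}

	/* only the ticket owner dequeues itself, so this test is safe */
	if (!list_empty(&tic->t_queue)) {
		spin_lock(&log->l_grant_reserve_lock);
		list_del_init(&tic->t_queue);
		spin_unlock(&log->l_grant_reserve_lock);
	}

	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
	return 0;
}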
@@ -2658,17 +2604,14 @@ redo:
2658/* 2604/*
2659 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2660 * 2606 *
2661 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2662 */ 2609 */
2663STATIC int 2610STATIC int
2664xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2665 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2666{ 2613{
2667 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2668 xlog_ticket_t *ntic;
2669#ifdef DEBUG
2670 xfs_lsn_t tail_lsn;
2671#endif
2672 2615
2673 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2674 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2681,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2681 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2682#endif 2625#endif
2683 2626
2684 spin_lock(&log->l_grant_lock);
2685
2686 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2687
2688 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2689 goto error_return; 2629 goto error_return_unlocked;
2690 2630
2691 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2692 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2695,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2695 * this transaction. 2635 * this transaction.
2696 */ 2636 */
2697 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2698 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2699 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2700 log->l_grant_write_bytes); 2640
2701 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2702 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2703 2645
2704 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2705 break; 2647 break;
2706 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2707 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2708 ntic = ntic->t_next; 2650 }
2709 } while (ntic != log->l_write_headq);
2710
2711 if (ntic != log->l_write_headq) {
2712 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2713 xlog_ins_ticketq(&log->l_write_headq, tic);
2714 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2715 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2716 2657
2717 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2718 xlog_grant_push_ail(log->l_mp, need_bytes);
2719 spin_lock(&log->l_grant_lock);
2720 2659
2721 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2722 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2723 &log->l_grant_lock, s);
2724
2725 /* If we're shutting down, this tic is already
2726 * off the queue */
2727 spin_lock(&log->l_grant_lock);
2728 if (XLOG_FORCED_SHUTDOWN(log))
2729 goto error_return;
2730
2731 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2732 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2733 } 2665 }
2734 2666
2735redo: 2667redo:
2736 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2737 goto error_return; 2669 goto error_return_unlocked;
2738 2670
2739 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2740 log->l_grant_write_bytes);
2741 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2742 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2743 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2744 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2745 xlog_grant_push_ail(log->l_mp, need_bytes);
2746 spin_lock(&log->l_grant_lock);
2747
2748 XFS_STATS_INC(xs_sleep_logspace);
2749 trace_xfs_log_regrant_write_sleep2(log, tic);
2750
2751 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2752 2676
2753 /* If we're shutting down, this tic is already off the queue */
2754 spin_lock(&log->l_grant_lock);
2755 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2756 goto error_return; 2678 goto error_return;
2757 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2758 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2759 goto redo; 2687 goto redo;
2760 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2761 xlog_del_ticketq(&log->l_write_headq, tic);
2762 2689
2763 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2764 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2765#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2766 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2767 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2768 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2769 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2770 } 2694 }
2771#endif
2772 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2773 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2774 2699 xlog_verify_grant_tail(log);
2775 xlog_verify_grant_head(log, 1);
2776 spin_unlock(&log->l_grant_lock);
2777 return 0; 2700 return 0;
2778 2701
2779 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2780 error_return: 2705 error_return:
2781 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2782 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2783
2784 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2785 2709
2786 /* 2710 /*
@@ -2790,7 +2714,6 @@ redo:
2790 */ 2714 */
2791 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2792 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2793 spin_unlock(&log->l_grant_lock);
2794 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2795} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
2796 2719
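The wake-in-order loop near the top of xlog_regrant_write_log_space() is worth isolating: waiters are woken head-first, and only while the remaining space still covers each waiter's unit reservation, which keeps grants FIFO. A condensed sketch using the patch's own names:

static void wake_fitting_waiters(struct log *log)
{
	struct xlog_ticket	*ntic;
	int			free_bytes;

	spin_lock(&log->l_grant_write_lock);
	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
	list_for_each_entry(ntic, &log->l_writeq, t_queue) {
		if (free_bytes < ntic->t_unit_res)
			break;		/* head waiter no longer fits: stop */
		free_bytes -= ntic->t_unit_res;
		wake_up(&ntic->t_wait);
	}
	spin_unlock(&log->l_grant_write_lock);
}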
@@ -2811,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2811 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2812 ticket->t_cnt--; 2735 ticket->t_cnt--;
2813 2736
2814 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2815 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2816 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2817 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2818 2743
2819 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2820 2745
2821 xlog_verify_grant_head(log, 1);
2822
2823 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2824 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2825 spin_unlock(&log->l_grant_lock);
2826 return; 2748 return;
2827 }
2828 2749
2829 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2830 2752
2831 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2832 2754
2833 xlog_verify_grant_head(log, 0);
2834 spin_unlock(&log->l_grant_lock);
2835 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2836 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2837} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2855,28 +2775,29 @@ STATIC void
2855xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2856 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2857{ 2777{
2778 int bytes;
2779
2858 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2859 ticket->t_cnt--; 2781 ticket->t_cnt--;
2860 2782
2861 spin_lock(&log->l_grant_lock);
2862 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2863
2864 xlog_grant_sub_space(log, ticket->t_curr_res);
2865
2866 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2867 2785
2868 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2869 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2870 */ 2789 */
2790 bytes = ticket->t_curr_res;
2871 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2872 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2873 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2874 } 2794 }
2875 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2876 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2877 2800
2878 xlog_verify_grant_head(log, 1);
2879 spin_unlock(&log->l_grant_lock);
2880 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2881} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2882 2803
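xlog_grant_add_space() and xlog_grant_sub_space() now take an atomic64_t head, but their new bodies fall outside the hunks shown here. Given the crack/assign helpers added to xfs_log_priv.h below, a plausible lock-free shape is a compare-and-exchange retry loop over the packed cycle/space value. The following is an assumption-labelled sketch, not the committed code.

static void grant_head_add_sketch(struct log *log, atomic64_t *head,
				  int bytes)
{
	int64_t	head_val = atomic64_read(head);
	int64_t	old, new;

	do {
		int	cycle, space;

		xlog_crack_grant_head_val(head_val, &cycle, &space);

		space += bytes;
		if (space > BBTOB(log->l_logBBsize)) {
			space -= BBTOB(log->l_logBBsize);
			cycle++;	/* wrapped the physical log */
		}

		old = head_val;
		new = xlog_assign_grant_head_val(cycle, space);

		/* retry if another CPU changed the head underneath us */
		head_val = atomic64_cmpxchg(head, old, new);
	} while (head_val != old);
}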
@@ -2913,11 +2834,11 @@ xlog_state_release_iclog(
2913 2834
2914 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2915 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2916 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2917 sync++; 2838 sync++;
2918 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2919 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2920 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2921 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2922 } 2843 }
2923 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3100,7 +3021,7 @@ maybe_sleep:
3100 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3101 } 3022 }
3102 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3103 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3104 /* 3025 /*
3105 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3106 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3218,8 +3139,8 @@ try_again:
3218 3139
3219 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3220 3141
3221 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3222 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3223 if (log_flushed) 3144 if (log_flushed)
3224 *log_flushed = 1; 3145 *log_flushed = 1;
3225 already_slept = 1; 3146 already_slept = 1;
@@ -3247,7 +3168,7 @@ try_again:
3247 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3248 } 3169 }
3249 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3250 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3251 /* 3172 /*
3252 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3253 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3322,10 +3243,8 @@ xfs_log_ticket_put(
3322 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3323{ 3244{
3324 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3325 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3326 sv_destroy(&ticket->t_wait);
3327 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3328 }
3329} 3248}
3330 3249
3331xlog_ticket_t * 3250xlog_ticket_t *
@@ -3447,6 +3366,7 @@ xlog_ticket_alloc(
3447 } 3366 }
3448 3367
3449 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3450 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3451 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3452 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3457,7 +3377,7 @@ xlog_ticket_alloc(
3457 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3458 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3459 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3460 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3461 3381
3462 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3463 3383
@@ -3496,18 +3416,25 @@ xlog_verify_dest_ptr(
3496} 3416}
3497 3417
3498STATIC void 3418STATIC void
3499xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3500{ 3421{
3501 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3502 if (equals) 3423 int cycle, space;
3503 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3504 else 3425 /*
3505 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3506 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3507 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3508 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3509 } 3430 */
3510} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3511 3438
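A worked instance of the invariant xlog_verify_grant_tail() checks, with illustrative numbers: in a 1000-block log whose tail sits at cycle 4, block 900, a write head that has wrapped to cycle 5 may hold at most BBTOB(900) bytes before it would overwrite the tail.

static void verify_grant_tail_example(void)
{
	int	tail_cycle = 4, tail_blocks = 900;	/* illustrative */
	int	cycle = 5, space = BBTOB(650);		/* head has wrapped */

	if (tail_cycle != cycle) {
		ASSERT(cycle - 1 == tail_cycle);	/* differ by one */
		ASSERT(space <= BBTOB(tail_blocks));	/* 650 <= 900: ok */
	}
}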
3512/* check if it will fit */ 3439/* check if it will fit */
3513STATIC void 3440STATIC void
@@ -3728,12 +3655,10 @@ xfs_log_force_umount(
3728 xlog_cil_force(log); 3655 xlog_cil_force(log);
3729 3656
3730 /* 3657 /*
3731 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as being in a shutdown state and wake
3732 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3733 * everybody up to tell the bad news.
3734 */ 3660 */
3735 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3736 spin_lock(&log->l_grant_lock);
3737 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3738 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3739 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3754,27 +3679,21 @@ xfs_log_force_umount(
3754 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3755 3680
3756 /* 3681 /*
3757 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3758 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3759 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3760 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3761 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3762 * is set, and this action is protected by the GRANTLOCK.
3763 */ 3687 */
3764 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3765 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3766 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3767 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3768 } while (tic != log->l_reserve_headq); 3692
3769 } 3693 spin_lock(&log->l_grant_write_lock);
3770 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3771 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3772 do { 3696 spin_unlock(&log->l_grant_write_lock);
3773 sv_signal(&tic->t_wait);
3774 tic = tic->t_next;
3775 } while (tic != log->l_write_headq);
3776 }
3777 spin_unlock(&log->l_grant_lock);
3778 3697
3779 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3780 ASSERT(!logerror); 3699 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e206fc1fa36..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
146} 146}
147 147
148/* 148/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we added to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
243
244/*
245 * Format log items into flat buffers 149 * Format log items into flat buffers
246 * 150 *
247 * For delayed logging, we need to hold a formatted buffer containing all the 151 * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
286 len += lv->lv_iovecp[index].i_len; 190 len += lv->lv_iovecp[index].i_len;
287 191
288 lv->lv_buf_len = len; 192 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); 193 lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf; 194 ptr = lv->lv_buf;
291 195
292 for (index = 0; index < lv->lv_niovecs; index++) { 196 for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
300 } 204 }
301} 205}
302 206
207/*
208 * Prepare the log item for insertion into the CIL. Calculate the difference in
209 * log space and vectors it will consume, and if it is a new item pin it as
210 * well.
211 */
212STATIC void
213xfs_cil_prepare_item(
214 struct log *log,
215 struct xfs_log_vec *lv,
216 int *len,
217 int *diff_iovecs)
218{
219 struct xfs_log_vec *old = lv->lv_item->li_lv;
220
221 if (old) {
222 /* existing lv on log item, space used is a delta */
223 ASSERT(!list_empty(&lv->lv_item->li_cil));
224 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
225
226 *len += lv->lv_buf_len - old->lv_buf_len;
227 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
228 kmem_free(old->lv_buf);
229 kmem_free(old);
230 } else {
231 /* new lv, must pin the log item */
232 ASSERT(!lv->lv_item->li_lv);
233 ASSERT(list_empty(&lv->lv_item->li_cil));
234
235 *len += lv->lv_buf_len;
236 *diff_iovecs += lv->lv_niovecs;
237 IOP_PIN(lv->lv_item);
238
239 }
240
241 /* attach new log vector to log item */
242 lv->lv_item->li_lv = lv;
243
244 /*
245 * If this is the first time the item is being committed to the
246 * CIL, store the sequence number on the log item so we can
247 * tell in future commits whether this is the first checkpoint
248 * the item is being committed into.
249 */
250 if (!lv->lv_item->li_seq)
251 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
252}
253
254/*
255 * Insert the log items into the CIL and calculate the difference in space
256 * consumed by the items. Add the space to the checkpoint ticket and calculate
257 * if the change requires additional log metadata. If it does, take that space
258 * as well. Remove the amount of space we added to the checkpoint ticket from
259 * the current transaction ticket so that the accounting works out correctly.
260 */
303static void 261static void
304xlog_cil_insert_items( 262xlog_cil_insert_items(
305 struct log *log, 263 struct log *log,
306 struct xfs_log_vec *log_vector, 264 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket, 265 struct xlog_ticket *ticket)
308 xfs_lsn_t *start_lsn)
309{ 266{
310 struct xfs_log_vec *lv; 267 struct xfs_cil *cil = log->l_cilp;
311 268 struct xfs_cil_ctx *ctx = cil->xc_ctx;
312 if (start_lsn) 269 struct xfs_log_vec *lv;
313 *start_lsn = log->l_cilp->xc_ctx->sequence; 270 int len = 0;
271 int diff_iovecs = 0;
272 int iclog_space;
314 273
315 ASSERT(log_vector); 274 ASSERT(log_vector);
275
276 /*
277 * Do all the accounting aggregation and switching of log vectors
278 * around in a loop separate from the insertion of items into the CIL.
279 * Then we can do a separate loop to update the CIL within a single
280 * lock/unlock pair. This reduces the number of round trips on the CIL
281 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
282 * hold time for the transaction commit.
283 *
284 * If this is the first time the item is being placed into the CIL in
285 * this context, pin it so it can't be written to disk until the CIL is
286 * flushed to the iclog and the iclog written to disk.
287 *
288 * We can do this safely because the context can't checkpoint until we
289 * are done so it doesn't matter exactly how we update the CIL.
290 */
291 for (lv = log_vector; lv; lv = lv->lv_next)
292 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
293
294 /* account for space used by new iovec headers */
295 len += diff_iovecs * sizeof(xlog_op_header_t);
296
297 spin_lock(&cil->xc_cil_lock);
298
299 /* move the items to the tail of the CIL */
316 for (lv = log_vector; lv; lv = lv->lv_next) 300 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv); 301 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
302
303 ctx->nvecs += diff_iovecs;
304
305 /*
306 * Now transfer enough transaction reservation to the context ticket
307 * for the checkpoint. The context ticket is special - the unit
308 * reservation has to grow as well as the current reservation as we
309 * steal from tickets so we can correctly determine the space used
310 * during the transaction commit.
311 */
312 if (ctx->ticket->t_curr_res == 0) {
313 /* first commit in checkpoint, steal the header reservation */
314 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
315 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
316 ticket->t_curr_res -= ctx->ticket->t_unit_res;
317 }
318
319 /* do we need space for more log record headers? */
320 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
321 if (len > 0 && (ctx->space_used / iclog_space !=
322 (ctx->space_used + len) / iclog_space)) {
323 int hdrs;
324
325 hdrs = (len + iclog_space - 1) / iclog_space;
326 /* need to take into account split region headers, too */
327 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
328 ctx->ticket->t_unit_res += hdrs;
329 ctx->ticket->t_curr_res += hdrs;
330 ticket->t_curr_res -= hdrs;
331 ASSERT(ticket->t_curr_res >= len);
332 }
333 ticket->t_curr_res -= len;
334 ctx->space_used += len;
335
336 spin_unlock(&cil->xc_cil_lock);
318} 337}
319 338
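The two-pass restructuring above is the whole point of this hunk: aggregate per-item accounting without the lock, then apply the totals and splice the items in one critical section, cutting CIL lock round trips from O(nr_logvectors) to O(1). A generic miniature of the pattern follows; the types here are invented for illustration.

struct demo_item {
	struct list_head	list;
	int			len;
};

static void insert_items_sketch(struct list_head *items, spinlock_t *lock,
				struct list_head *cil, int *space_used)
{
	struct demo_item	*it;
	int			len = 0;

	/* pass 1: aggregate with no lock held */
	list_for_each_entry(it, items, list)
		len += it->len;

	/* pass 2: one lock/unlock pair applies everything */
	spin_lock(lock);
	list_splice_tail_init(items, cil);
	*space_used += len;
	spin_unlock(lock);
}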
320static void 339static void
@@ -342,15 +361,10 @@ xlog_cil_committed(
342 int abort) 361 int abort)
343{ 362{
344 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
345 struct xfs_log_vec *lv;
346 int abortflag = abort ? XFS_LI_ABORTED : 0;
347 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
348 365
349 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
350 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
351 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
352 abortflag);
353 }
354 368
355 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
356 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -529,7 +543,7 @@ xlog_cil_push(
529 543
530 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
531 if (error) 545 if (error)
532 goto out_abort; 546 goto out_abort_free_ticket;
533 547
534 /* 548 /*
535 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -549,14 +563,15 @@ restart:
549 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
550 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
551 */ 565 */
552 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
553 goto restart; 567 goto restart;
554 } 568 }
555 } 569 }
556 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
557 571
572 /* xfs_log_done always frees the ticket on error. */
558 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
559 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
560 goto out_abort; 575 goto out_abort;
561 576
562 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -573,7 +588,7 @@ restart:
573 */ 588 */
574 spin_lock(&cil->xc_cil_lock); 589 spin_lock(&cil->xc_cil_lock);
575 ctx->commit_lsn = commit_lsn; 590 ctx->commit_lsn = commit_lsn;
576 sv_broadcast(&cil->xc_commit_wait); 591 wake_up_all(&cil->xc_commit_wait);
577 spin_unlock(&cil->xc_cil_lock); 592 spin_unlock(&cil->xc_cil_lock);
578 593
579 /* release the hounds! */ 594 /* release the hounds! */
@@ -586,6 +601,8 @@ out_free_ticket:
586 kmem_free(new_ctx); 601 kmem_free(new_ctx);
587 return 0; 602 return 0;
588 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
589out_abort: 606out_abort:
590 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
591 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -608,7 +625,7 @@ out_abort:
608 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
609 * allowed again. 626 * allowed again.
610 */ 627 */
611int 628void
612xfs_log_commit_cil( 629xfs_log_commit_cil(
613 struct xfs_mount *mp, 630 struct xfs_mount *mp,
614 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -623,11 +640,6 @@ xfs_log_commit_cil(
623 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
625 642
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /* 643 /*
632 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -638,7 +650,10 @@ xfs_log_commit_cil(
638 650
639 /* lock out background commit */ 651 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock); 652 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); 653 if (commit_lsn)
654 *commit_lsn = log->l_cilp->xc_ctx->sequence;
655
656 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
642 657
643 /* check we didn't blow the reservation */ 658 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0) 659 if (tp->t_ticket->t_curr_res < 0)
@@ -684,7 +699,6 @@ xfs_log_commit_cil(
684 */ 699 */
685 if (push) 700 if (push)
686 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
687 return 0;
688} 702}
689 703
690/* 704/*
@@ -735,7 +749,7 @@ restart:
735 * It is still being pushed! Wait for the push to 749 * It is still being pushed! Wait for the push to
736 * complete, then start again from the beginning. 750 * complete, then start again from the beginning.
737 */ 751 */
738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 752 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
739 goto restart; 753 goto restart;
740 } 754 }
741 if (ctx->sequence != sequence) 755 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..d5f8be8f4bf6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
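Example use of the grant-head helpers just defined: crack the head into one consistent (cycle, space) sample, adjust, and repack. Note the read-modify-write below is not atomic as a whole; concurrent updaters need the cmpxchg-style loop sketched earlier or external serialisation. Illustrative fragment only.

static void grant_head_usage_example(atomic64_t *head)
{
	int	cycle, space;

	xlog_crack_grant_head(head, &cycle, &space);	/* one atomic sample */
	space += 512;					/* adjust components */
	xlog_assign_grant_head(head, cycle, space);	/* repack atomically */
}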
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
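The contract for xlog_wait(): call it with the waker's spinlock held, so the task is queued and marked TASK_UNINTERRUPTIBLE before the lock drops; a wakeup issued under that lock can therefore never be lost. It returns with the lock released. A hypothetical caller is sketched below (log_needs_flush() is an invented predicate).

static void wait_for_flush_example(struct log *log)
{
	spin_lock(&log->l_icloglock);
	while (log_needs_flush(log)) {		/* invented predicate */
		xlog_wait(&log->l_flush_wait, &log->l_icloglock);
		spin_lock(&log->l_icloglock);	/* xlog_wait() dropped it */
	}
	spin_unlock(&log->l_icloglock);
}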
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
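
The table's intended semantics, implemented later in this patch by xlog_recover_buffer_pass1() and xlog_check_buffer_cancelled(), can be summarised in a standalone userspace sketch (simplified to one bucket with plain pointers instead of list_head; not XFS code):

/* Userspace sketch (not XFS code) of the cancel-table semantics: pass 1
 * adds a record per cancel item, bumping the refcount on duplicates;
 * pass 2 drops one reference per matching cancel item and frees the
 * record when the last reference goes away. */
#include <assert.h>
#include <stdlib.h>

struct cancel_rec {
	long blkno;
	unsigned len;
	int refcount;
	struct cancel_rec *next;
};

static struct cancel_rec *bucket;	/* one bucket stands in for the table */

static void pass1_add(long blkno, unsigned len)
{
	struct cancel_rec *rec;

	for (rec = bucket; rec; rec = rec->next) {
		if (rec->blkno == blkno && rec->len == len) {
			rec->refcount++;
			return;
		}
	}
	rec = malloc(sizeof(*rec));
	rec->blkno = blkno;
	rec->len = len;
	rec->refcount = 1;
	rec->next = bucket;
	bucket = rec;
}

static int pass2_cancelled(long blkno, unsigned len, int is_cancel_item)
{
	struct cancel_rec **prev = &bucket, *rec;

	for (rec = bucket; rec; prev = &rec->next, rec = rec->next) {
		if (rec->blkno != blkno || rec->len != len)
			continue;
		if (is_cancel_item && --rec->refcount == 0) {
			*prev = rec->next;
			free(rec);
		}
		return 1;	/* matched: do not replay this buffer */
	}
	return 0;		/* no match: replay the buffer */
}

int main(void)
{
	pass1_add(100, 8);
	pass1_add(100, 8);			/* duplicate bumps refcount */
	assert(pass2_cancelled(100, 8, 1));	/* drops refcount to 1 */
	assert(pass2_cancelled(100, 8, 1));	/* last ref, record freed */
	assert(!pass2_cancelled(100, 8, 0));	/* table is empty again */
	return 0;
}
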
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -107,7 +118,8 @@ xlog_get_bp(
107 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize); 119 nbblks = round_up(nbblks, log->l_sectBBsize);
109 120
110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 121 return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
122 BBTOB(nbblks), 0);
111} 123}
112 124
113STATIC void 125STATIC void
@@ -167,7 +179,7 @@ xlog_bread_noalign(
167 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 179 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
168 180
169 xfsbdstrat(log->l_mp, bp); 181 xfsbdstrat(log->l_mp, bp);
170 error = xfs_iowait(bp); 182 error = xfs_buf_iowait(bp);
171 if (error) 183 if (error)
172 xfs_ioerror_alert("xlog_bread", log->l_mp, 184 xfs_ioerror_alert("xlog_bread", log->l_mp,
173 bp, XFS_BUF_ADDR(bp)); 185 bp, XFS_BUF_ADDR(bp));
@@ -321,12 +333,13 @@ xlog_recover_iodone(
321 * this during recovery. One strike! 333 * this during recovery. One strike!
322 */ 334 */
323 xfs_ioerror_alert("xlog_recover_iodone", 335 xfs_ioerror_alert("xlog_recover_iodone",
324 bp->b_mount, bp, XFS_BUF_ADDR(bp)); 336 bp->b_target->bt_mount, bp,
325 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 337 XFS_BUF_ADDR(bp));
338 xfs_force_shutdown(bp->b_target->bt_mount,
339 SHUTDOWN_META_IO_ERROR);
326 } 340 }
327 bp->b_mount = NULL;
328 XFS_BUF_CLR_IODONE_FUNC(bp); 341 XFS_BUF_CLR_IODONE_FUNC(bp);
329 xfs_biodone(bp); 342 xfs_buf_ioend(bp, 0);
330} 343}
331 344
332/* 345/*
@@ -923,12 +936,12 @@ xlog_find_tail(
923 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
924 if (found == 2) 937 if (found == 2)
925 log->l_curr_cycle++; 938 log->l_curr_cycle++;
926 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
927 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
928 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
929 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
930 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
931 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
932 945
933 /* 946 /*
934 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -958,7 +971,7 @@ xlog_find_tail(
958 } 971 }
959 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
960 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
961 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
962 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
963 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
964 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -973,12 +986,10 @@ xlog_find_tail(
973 * log records will point recovery to after the 986 * log records will point recovery to after the
974 * current unmount record. 987 * current unmount record.
975 */ 988 */
976 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
977 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
978 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
979 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
980 xlog_assign_lsn(log->l_curr_cycle,
981 after_umount_blk);
982 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
983 994
984 /* 995 /*
@@ -1603,82 +1614,45 @@ xlog_recover_reorder_trans(
1603 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1604 * record during the second pass. 1615 * record during the second pass.
1605 */ 1616 */
1606STATIC void 1617STATIC int
1607xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1608 xlog_t *log, 1619 struct log *log,
1609 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1610{ 1621{
1611 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1612 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1613 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1614 xfs_buf_cancel_t **bucket;
1615 xfs_daddr_t blkno = 0;
1616 uint len = 0;
1617 ushort flags = 0;
1618
1619 switch (buf_f->blf_type) {
1620 case XFS_LI_BUF:
1621 blkno = buf_f->blf_blkno;
1622 len = buf_f->blf_len;
1623 flags = buf_f->blf_flags;
1624 break;
1625 }
1626 1625
1627 /* 1626 /*
1628 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1629 */ 1628 */
1630 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1631 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1632 return; 1631 return 0;
1633 }
1634
1635 /*
1636 * Insert an xfs_buf_cancel record into the hash table of
1637 * them. If there is already an identical record, bump
1638 * its reference count.
1639 */
1640 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1641 XLOG_BC_TABLE_SIZE];
1642 /*
1643 * If the hash bucket is empty then just insert a new record into
1644 * the bucket.
1645 */
1646 if (*bucket == NULL) {
1647 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1648 KM_SLEEP);
1649 bcp->bc_blkno = blkno;
1650 bcp->bc_len = len;
1651 bcp->bc_refcount = 1;
1652 bcp->bc_next = NULL;
1653 *bucket = bcp;
1654 return;
1655 } 1632 }
1656 1633
1657 /* 1634 /*
1658 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table of them.
1659 * record. If we find one them just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1660 * then add us at the end of the list.
1661 */ 1637 */
1662 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1663 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1664 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1665 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1666 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1667 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1668 return; 1644 return 0;
1669 } 1645 }
1670 prevp = nextp; 1646 }
1671 nextp = nextp->bc_next; 1647
1672 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1673 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1674 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1675 KM_SLEEP);
1676 bcp->bc_blkno = blkno;
1677 bcp->bc_len = len;
1678 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1679 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1680 prevp->bc_next = bcp; 1653
1681 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1682} 1656}
1683 1657
1684/* 1658/*
@@ -1696,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1696 */ 1670 */
1697STATIC int 1671STATIC int
1698xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1699 xlog_t *log, 1673 struct log *log,
1700 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1701 uint len, 1675 uint len,
1702 ushort flags) 1676 ushort flags)
1703{ 1677{
1704 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1705 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1706 xfs_buf_cancel_t **bucket;
1707 1680
1708 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1709 /* 1682 /*
@@ -1714,128 +1687,70 @@ xlog_check_buffer_cancelled(
1714 return 0; 1687 return 0;
1715 } 1688 }
1716 1689
1717 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1718 XLOG_BC_TABLE_SIZE];
1719 bcp = *bucket;
1720 if (bcp == NULL) {
1721 /*
1722 * There is no corresponding entry in the table built
1723 * in pass one, so this buffer has not been cancelled.
1724 */
1725 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0;
1727 }
1728
1729 /* 1690 /*
1730 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1731 * matches our buffer.
1732 */ 1692 */
1733 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1734 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1735 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1736 /* 1696 goto found;
1737 * We've go a match, so return 1 so that the
1738 * recovery of this buffer is cancelled.
1739 * If this buffer is actually a buffer cancel
1740 * log item, then decrement the refcount on the
1741 * one in the table and remove it if this is the
1742 * last reference.
1743 */
1744 if (flags & XFS_BLF_CANCEL) {
1745 bcp->bc_refcount--;
1746 if (bcp->bc_refcount == 0) {
1747 if (prevp == NULL) {
1748 *bucket = bcp->bc_next;
1749 } else {
1750 prevp->bc_next = bcp->bc_next;
1751 }
1752 kmem_free(bcp);
1753 }
1754 }
1755 return 1;
1756 }
1757 prevp = bcp;
1758 bcp = bcp->bc_next;
1759 } 1697 }
1698
1760 /* 1699 /*
1761 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1762 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1763 */ 1702 */
1764 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1765 return 0; 1704 return 0;
1766}
1767 1705
1768STATIC int 1706found:
1769xlog_recover_do_buffer_pass2( 1707 /*
1770 xlog_t *log, 1708 * We've got a match, so return 1 so that the recovery of this buffer
1771 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1772{ 1710 * item, then decrement the refcount on the one in the table and
1773 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1774 ushort flags = 0; 1712 */
1775 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1776 1714 if (--bcp->bc_refcount == 0) {
1777 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1778 case XFS_LI_BUF: 1716 kmem_free(bcp);
1779 blkno = buf_f->blf_blkno; 1717 }
1780 flags = buf_f->blf_flags;
1781 len = buf_f->blf_len;
1782 break;
1783 } 1718 }
1784 1719 return 1;
1785 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1786} 1720}
1787 1721
1788/* 1722/*
1789 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1790 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1791 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1792 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1793 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1794 * in xlog_recover_do_inode_trans().
1795 * 1728 *
1796 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1797 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1798 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1799 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1800 */ 1733 */
1801STATIC int 1734STATIC int
1802xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1803 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1804 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1805 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1806 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1807{ 1740{
1808 int i; 1741 int i;
1809 int item_index; 1742 int item_index = 0;
1810 int bit; 1743 int bit = 0;
1811 int nbits; 1744 int nbits = 0;
1812 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1813 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1814 int next_unlinked_offset; 1747 int next_unlinked_offset;
1815 int inodes_per_buf; 1748 int inodes_per_buf;
1816 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1817 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1818 unsigned int *data_map = NULL;
1819 unsigned int map_size = 0;
1820 1751
1821 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1822 1753
1823 switch (buf_f->blf_type) {
1824 case XFS_LI_BUF:
1825 data_map = buf_f->blf_data_map;
1826 map_size = buf_f->blf_map_size;
1827 break;
1828 }
1829 /*
1830 * Set the variables corresponding to the current region to
1831 * 0 so that we'll initialize them on the first pass through
1832 * the loop.
1833 */
1834 reg_buf_offset = 0;
1835 reg_buf_bytes = 0;
1836 bit = 0;
1837 nbits = 0;
1838 item_index = 0;
1839 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1840 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1841 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1850,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1850 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1851 */ 1766 */
1852 bit += nbits; 1767 bit += nbits;
1853 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1854 1770
1855 /* 1771 /*
1856 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1857 * buffer, then we're done. 1773 * buffer, then we're done.
1858 */ 1774 */
1859 if (bit == -1) { 1775 if (bit == -1)
1860 return 0; 1776 return 0;
1861 }
1862 1777
1863 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1864 bit); 1779 buf_f->blf_map_size, bit);
1865 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1866 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1867 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1873,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1873 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1874 * di_next_unlinked field. 1789 * di_next_unlinked field.
1875 */ 1790 */
1876 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1877 continue; 1792 continue;
1878 }
1879 1793
1880 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1881 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1911,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1911 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1912 * where to place the logged data. 1826 * where to place the logged data.
1913 */ 1827 */
1914/*ARGSUSED*/
1915STATIC void 1828STATIC void
1916xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1917 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1918 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1919 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1920 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1921{ 1834{
1922 int i; 1835 int i;
1923 int bit; 1836 int bit;
1924 int nbits; 1837 int nbits;
1925 unsigned int *data_map = NULL;
1926 unsigned int map_size = 0;
1927 int error; 1838 int error;
1928 1839
1929 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1930 1841
1931 switch (buf_f->blf_type) {
1932 case XFS_LI_BUF:
1933 data_map = buf_f->blf_data_map;
1934 map_size = buf_f->blf_map_size;
1935 break;
1936 }
1937 bit = 0; 1842 bit = 0;
1938 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1939 while (1) { 1844 while (1) {
1940 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1941 if (bit == -1) 1847 if (bit == -1)
1942 break; 1848 break;
1943 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1944 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1945 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1946 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
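
Both the inode-buffer and regular-buffer recovery paths walk the dirty-region bitmap the same way: find the next set bit, extend it to a contiguous run, and scale the run by the chunk size to get a byte region. A small userspace sketch of that walk (assuming 128-byte chunks, the usual value behind XFS_BLF_SHIFT; these are stand-ins for the real xfs_next_bit()/xfs_contig_bits() helpers):

/* Userspace sketch (not the XFS helpers) of the bitmap walk above. */
#include <stdio.h>

#define NBITS 32
#define CHUNK_SHIFT 7	/* assumed analogue of XFS_BLF_SHIFT: 128-byte chunks */

static int next_bit(unsigned int map, int start)
{
	for (int i = start; i < NBITS; i++)
		if (map & (1u << i))
			return i;
	return -1;
}

static int contig_bits(unsigned int map, int start)
{
	int n = 0;

	while (start + n < NBITS && (map & (1u << (start + n))))
		n++;
	return n;
}

int main(void)
{
	unsigned int map = 0x0000f0c0;	/* set bits: 6-7 and 12-15 */
	int bit = 0, nbits = 0;

	while ((bit = next_bit(map, bit + nbits)) != -1) {
		nbits = contig_bits(map, bit);
		printf("region at byte %d, %d bytes\n",
		       bit << CHUNK_SHIFT, nbits << CHUNK_SHIFT);
	}
	return 0;
}
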
@@ -2174,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2174 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2175 */ 2082 */
2176STATIC int 2083STATIC int
2177xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2178 xlog_t *log, 2085 xlog_t *log,
2179 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2180 int pass)
2181{ 2087{
2182 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2183 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2184 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2185 int error; 2091 int error;
2186 int cancel;
2187 xfs_daddr_t blkno;
2188 int len;
2189 ushort flags;
2190 uint buf_flags; 2092 uint buf_flags;
2191 2093
2192 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2193 /* 2095 * In this pass we only want to recover all the buffers which have
2194 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2195 * with the XFS_BLF_CANCEL bit set. 2097 */
2196 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2197 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2198 return 0; 2101 return 0;
2199 } else {
2200 /*
2201 * In this pass we want to recover all the buffers
2202 * which have not been cancelled and are not
2203 * cancellation buffers themselves. The routine
2204 * we call here will tell us whether or not to
2205 * continue with the replay of this buffer.
2206 */
2207 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2208 if (cancel) {
2209 trace_xfs_log_recover_buf_cancel(log, buf_f);
2210 return 0;
2211 }
2212 } 2102 }
2103
2213 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2214 switch (buf_f->blf_type) {
2215 case XFS_LI_BUF:
2216 blkno = buf_f->blf_blkno;
2217 len = buf_f->blf_len;
2218 flags = buf_f->blf_flags;
2219 break;
2220 default:
2221 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2222 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2223 buf_f->blf_type, log->l_mp->m_logname ?
2224 log->l_mp->m_logname : "internal");
2225 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2226 XFS_ERRLEVEL_LOW, log->l_mp);
2227 return XFS_ERROR(EFSCORRUPTED);
2228 }
2229 2105
2230 mp = log->l_mp;
2231 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2232 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2233 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2234 2109
2235 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2236 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2237 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2238 bp, blkno); 2114 bp, buf_f->blf_blkno);
2239 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2240 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2241 return error; 2117 return error;
2242 } 2118 }
2243 2119
2244 error = 0; 2120 error = 0;
2245 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2246 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2247 } else if (flags & 2123 } else if (buf_f->blf_flags &
2248 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2249 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2250 } else { 2126 } else {
@@ -2275,8 +2151,7 @@ xlog_recover_do_buffer_trans(
2275 XFS_BUF_STALE(bp); 2151 XFS_BUF_STALE(bp);
2276 error = xfs_bwrite(mp, bp); 2152 error = xfs_bwrite(mp, bp);
2277 } else { 2153 } else {
2278 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2154 ASSERT(bp->b_target->bt_mount == mp);
2279 bp->b_mount = mp;
2280 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2155 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2281 xfs_bdwrite(mp, bp); 2156 xfs_bdwrite(mp, bp);
2282 } 2157 }
@@ -2285,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2285} 2160}
2286 2161
2287STATIC int 2162STATIC int
2288xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2289 xlog_t *log, 2164 xlog_t *log,
2290 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2291 int pass)
2292{ 2166{
2293 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2294 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2295 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2296 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2297 xfs_ino_t ino;
2298 int len; 2171 int len;
2299 xfs_caddr_t src; 2172 xfs_caddr_t src;
2300 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2304,10 +2177,6 @@ xlog_recover_do_inode_trans(
2304 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2305 int need_free = 0; 2178 int need_free = 0;
2306 2179
2307 if (pass == XLOG_RECOVER_PASS1) {
2308 return 0;
2309 }
2310
2311 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2312 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2313 } else { 2182 } else {
@@ -2317,8 +2186,6 @@ xlog_recover_do_inode_trans(
2317 if (error) 2186 if (error)
2318 goto error; 2187 goto error;
2319 } 2188 }
2320 ino = in_f->ilf_ino;
2321 mp = log->l_mp;
2322 2189
2323 /* 2190 /*
2324 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2353,8 +2220,8 @@ xlog_recover_do_inode_trans(
2353 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2354 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2355 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2356 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2357 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2358 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2359 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2360 goto error; 2227 goto error;
@@ -2364,8 +2231,8 @@ xlog_recover_do_inode_trans(
2364 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2365 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2366 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2367 item, ino); 2234 item, in_f->ilf_ino);
2368 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2369 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2370 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2371 goto error; 2238 goto error;
@@ -2393,12 +2260,12 @@ xlog_recover_do_inode_trans(
2393 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2394 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2395 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2396 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2397 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2398 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2399 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2400 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2401 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2402 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2403 goto error; 2270 goto error;
2404 } 2271 }
@@ -2406,40 +2273,40 @@ xlog_recover_do_inode_trans(
2406 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2407 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2408 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2409 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2410 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2411 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2412 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2413 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2414 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2415 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2416 goto error; 2283 goto error;
2417 } 2284 }
2418 } 2285 }
2419 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2420 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2421 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2422 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2423 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2424 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2425 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2426 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2427 dicp->di_nblocks); 2294 dicp->di_nblocks);
2428 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2429 goto error; 2296 goto error;
2430 } 2297 }
2431 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2432 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2433 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2434 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2435 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2436 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2437 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2438 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2439 goto error; 2306 goto error;
2440 } 2307 }
2441 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2442 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2443 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2444 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2445 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2531,7 +2398,7 @@ xlog_recover_do_inode_trans(
2531 break; 2398 break;
2532 2399
2533 default: 2400 default:
2534 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2535 ASSERT(0); 2402 ASSERT(0);
2536 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2537 error = EIO; 2404 error = EIO;
@@ -2540,8 +2407,7 @@ xlog_recover_do_inode_trans(
2540 } 2407 }
2541 2408
2542write_inode_buffer: 2409write_inode_buffer:
2543 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2410 ASSERT(bp->b_target->bt_mount == mp);
2544 bp->b_mount = mp;
2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2411 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2546 xfs_bdwrite(mp, bp); 2412 xfs_bdwrite(mp, bp);
2547error: 2413error:
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2678,8 +2532,7 @@ xlog_recover_do_dquot_trans(
2678 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2532 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2679 2533
2680 ASSERT(dq_f->qlf_size == 2); 2534 ASSERT(dq_f->qlf_size == 2);
2681 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2535 ASSERT(bp->b_target->bt_mount == mp);
2682 bp->b_mount = mp;
2683 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2536 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2684 xfs_bdwrite(mp, bp); 2537 xfs_bdwrite(mp, bp);
2685 2538
@@ -2694,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2694 * LSN. 2547 * LSN.
2695 */ 2548 */
2696STATIC int 2549STATIC int
2697xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2698 xlog_t *log, 2551 xlog_t *log,
2699 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2700 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2701 int pass)
2702{ 2554{
2703 int error; 2555 int error;
2704 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2705 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2706 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2707 2559
2708 if (pass == XLOG_RECOVER_PASS1) {
2709 return 0;
2710 }
2711
2712 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2713 2561
2714 mp = log->l_mp;
2715 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2716 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2717 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2718 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2719 return error; 2566 return error;
2720 } 2567 }
2721 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2722 efip->efi_flags |= XFS_EFI_COMMITTED;
2723 2569
2724 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2725 /* 2571 /*
2726 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2727 */ 2573 */
2728 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2729 return 0; 2575 return 0;
2730} 2576}
2731 2577
@@ -2738,11 +2584,10 @@ xlog_recover_do_efi_trans(
2738 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2739 * AIL and free it. 2585 * AIL and free it.
2740 */ 2586 */
2741STATIC void 2587STATIC int
2742xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2743 xlog_t *log, 2589 xlog_t *log,
2744 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2745 int pass)
2746{ 2591{
2747 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2748 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2751,10 +2596,6 @@ xlog_recover_do_efd_trans(
2751 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2752 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2753 2598
2754 if (pass == XLOG_RECOVER_PASS1) {
2755 return;
2756 }
2757
2758 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2759 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2760 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2786,62 +2627,6 @@ xlog_recover_do_efd_trans(
2786 } 2627 }
2787 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2788 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2789}
2790
2791/*
2792 * Perform the transaction
2793 *
2794 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2795 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2796 */
2797STATIC int
2798xlog_recover_do_trans(
2799 xlog_t *log,
2800 xlog_recover_t *trans,
2801 int pass)
2802{
2803 int error = 0;
2804 xlog_recover_item_t *item;
2805
2806 error = xlog_recover_reorder_trans(log, trans, pass);
2807 if (error)
2808 return error;
2809
2810 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2811 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2812 switch (ITEM_TYPE(item)) {
2813 case XFS_LI_BUF:
2814 error = xlog_recover_do_buffer_trans(log, item, pass);
2815 break;
2816 case XFS_LI_INODE:
2817 error = xlog_recover_do_inode_trans(log, item, pass);
2818 break;
2819 case XFS_LI_EFI:
2820 error = xlog_recover_do_efi_trans(log, item,
2821 trans->r_lsn, pass);
2822 break;
2823 case XFS_LI_EFD:
2824 xlog_recover_do_efd_trans(log, item, pass);
2825 error = 0;
2826 break;
2827 case XFS_LI_DQUOT:
2828 error = xlog_recover_do_dquot_trans(log, item, pass);
2829 break;
2830 case XFS_LI_QUOTAOFF:
2831 error = xlog_recover_do_quotaoff_trans(log, item,
2832 pass);
2833 break;
2834 default:
2835 xlog_warn(
2836 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2837 ASSERT(0);
2838 error = XFS_ERROR(EIO);
2839 break;
2840 }
2841
2842 if (error)
2843 return error;
2844 }
2845 2630
2846 return 0; 2631 return 0;
2847} 2632}
@@ -2853,7 +2638,7 @@ xlog_recover_do_trans(
2853 */ 2638 */
2854STATIC void 2639STATIC void
2855xlog_recover_free_trans( 2640xlog_recover_free_trans(
2856 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2857{ 2642{
2858 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2859 int i; 2644 int i;
@@ -2872,17 +2657,95 @@ xlog_recover_free_trans(
2872} 2657}
2873 2658
2874STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2875xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2876 xlog_t *log, 2726 struct log *log,
2877 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2878 int pass) 2728 int pass)
2879{ 2729{
2880 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2881 2732
2882 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2883 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2884 return error; 2737 return error;
2885 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2886 return 0; 2749 return 0;
2887} 2750}
2888 2751
@@ -3012,7 +2875,7 @@ xlog_recover_process_efi(
3012 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3013 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3014 2877
3015 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3016 2879
3017 /* 2880 /*
3018 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3051,7 +2914,7 @@ xlog_recover_process_efi(
3051 extp->ext_len); 2914 extp->ext_len);
3052 } 2915 }
3053 2916
3054 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3055 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3056 return error; 2919 return error;
3057 2920
@@ -3108,7 +2971,7 @@ xlog_recover_process_efis(
3108 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3109 */ 2972 */
3110 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3111 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3112 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3113 continue; 2976 continue;
3114 } 2977 }
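
The switch from "efi_flags |= ..." to set_bit()/test_bit() matters because a plain read-modify-write can silently lose a concurrent update to another bit in the same word. A minimal userspace illustration with C11 atomics (not kernel code; atomic_fetch_or plays the role of the kernel's set_bit()):

/* Userspace sketch of the flag change above (not kernel code): a plain
 * "flags |= bit" is a non-atomic read-modify-write; atomic_fetch_or,
 * like set_bit(), cannot lose a racing update to the same word. */
#include <stdatomic.h>
#include <stdbool.h>

#define EFI_RECOVERED 0	/* bit number, standing in for XFS_EFI_RECOVERED */

static _Atomic unsigned long efi_flags;

static void set_recovered(void)
{
	atomic_fetch_or(&efi_flags, 1UL << EFI_RECOVERED);
}

static bool test_recovered(void)
{
	return atomic_load(&efi_flags) & (1UL << EFI_RECOVERED);
}

int main(void)
{
	set_recovered();
	return test_recovered() ? 0 : 1;
}
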
@@ -3725,7 +3588,7 @@ xlog_do_log_recovery(
3725 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3726 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3727{ 3590{
3728 int error; 3591 int error, i;
3729 3592
3730 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3731 3594
@@ -3733,10 +3596,12 @@ xlog_do_log_recovery(
3733 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3734 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3735 */ 3598 */
3736 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3737 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3738 sizeof(xfs_buf_cancel_t*),
3739 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3740 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3741 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3742 if (error != 0) { 3607 if (error != 0) {
@@ -3755,7 +3620,7 @@ xlog_do_log_recovery(
3755 int i; 3620 int i;
3756 3621
3757 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3758 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3759 } 3624 }
3760#endif /* DEBUG */ 3625#endif /* DEBUG */
3761 3626
@@ -3817,7 +3682,7 @@ xlog_do_recover(
3817 XFS_BUF_READ(bp); 3682 XFS_BUF_READ(bp);
3818 XFS_BUF_UNASYNC(bp); 3683 XFS_BUF_UNASYNC(bp);
3819 xfsbdstrat(log->l_mp, bp); 3684 xfsbdstrat(log->l_mp, bp);
3820 error = xfs_iowait(bp); 3685 error = xfs_buf_iowait(bp);
3821 if (error) { 3686 if (error) {
3822 xfs_ioerror_alert("xlog_do_recover", 3687 xfs_ioerror_alert("xlog_do_recover",
3823 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3688 log->l_mp, bp, XFS_BUF_ADDR(bp));
@@ -3935,7 +3800,7 @@ xlog_recover_finish(
3935 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3936 } else { 3801 } else {
3937 cmn_err(CE_DEBUG, 3802 cmn_err(CE_DEBUG,
3938 "!Ending clean XFS mount for filesystem: %s\n", 3803 "Ending clean XFS mount for filesystem: %s\n",
3939 log->l_mp->m_fsname); 3804 log->l_mp->m_fsname);
3940 } 3805 }
3941 return 0; 3806 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..d447aef84bc3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 52 int);
53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, 53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
54 int); 54 int);
55STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
56 int64_t, int);
57STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 55STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
58
59#else 56#else
60 57
61#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) 58#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
62#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
63#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
64
65#endif 60#endif
66 61
67static const struct { 62static const struct {
@@ -199,6 +194,8 @@ xfs_uuid_unmount(
199 194
200/* 195/*
201 * Reference counting access wrappers to the perag structures. 196 * Reference counting access wrappers to the perag structures.
 197 * Because we never free per-ag structures, the only thing we
 198 * have to protect against is modification of the tree structure itself.
202 */ 199 */
203struct xfs_perag * 200struct xfs_perag *
204xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) 201xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
206 struct xfs_perag *pag; 203 struct xfs_perag *pag;
207 int ref = 0; 204 int ref = 0;
208 205
209 spin_lock(&mp->m_perag_lock); 206 rcu_read_lock();
210 pag = radix_tree_lookup(&mp->m_perag_tree, agno); 207 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
211 if (pag) { 208 if (pag) {
212 ASSERT(atomic_read(&pag->pag_ref) >= 0); 209 ASSERT(atomic_read(&pag->pag_ref) >= 0);
213 /* catch leaks in the positive direction during testing */
214 ASSERT(atomic_read(&pag->pag_ref) < 1000);
215 ref = atomic_inc_return(&pag->pag_ref); 210 ref = atomic_inc_return(&pag->pag_ref);
216 } 211 }
217 spin_unlock(&mp->m_perag_lock); 212 rcu_read_unlock();
218 trace_xfs_perag_get(mp, agno, ref, _RET_IP_); 213 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
219 return pag; 214 return pag;
220} 215}
221 216
217/*
218 * search from @first to find the next perag with the given tag set.
219 */
220struct xfs_perag *
221xfs_perag_get_tag(
222 struct xfs_mount *mp,
223 xfs_agnumber_t first,
224 int tag)
225{
226 struct xfs_perag *pag;
227 int found;
228 int ref;
229
230 rcu_read_lock();
231 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
232 (void **)&pag, first, 1, tag);
233 if (found <= 0) {
234 rcu_read_unlock();
235 return NULL;
236 }
237 ref = atomic_inc_return(&pag->pag_ref);
238 rcu_read_unlock();
239 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
240 return pag;
241}
242
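
Both lookup functions follow the same shape: find the object lock-free on the read side and pin it with a reference before returning, leaving m_perag_lock to serialise only modifications of the tree. A userspace sketch of that shape (C11 atomics standing in for the RCU-protected radix tree; hypothetical names, not XFS code):

/* Userspace sketch (not XFS code): lock-free lookup plus refcount pin.
 * Safe only if frees are deferred until no reader can still see the
 * pointer, which is what call_rcu() guarantees in the code below. */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int ref;
	int id;
};

#define NR_SLOTS 4
static _Atomic(struct obj *) table[NR_SLOTS];	/* the "tree" */

static struct obj *obj_get(unsigned slot)
{
	struct obj *obj = atomic_load(&table[slot]);

	if (obj)
		atomic_fetch_add(&obj->ref, 1);	/* pin before use */
	return obj;
}

static void obj_put(struct obj *obj)
{
	atomic_fetch_sub(&obj->ref, 1);
}

int main(void)
{
	static struct obj o = { .id = 42 };
	struct obj *p;

	atomic_store(&table[0], &o);
	p = obj_get(0);
	printf("got obj %d, ref %d\n", p->id, atomic_load(&p->ref));
	obj_put(p);
	return 0;
}
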
222void 243void
223xfs_perag_put(struct xfs_perag *pag) 244xfs_perag_put(struct xfs_perag *pag)
224{ 245{
@@ -229,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
229 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); 250 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
230} 251}
231 252
253STATIC void
254__xfs_free_perag(
255 struct rcu_head *head)
256{
257 struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
258
259 ASSERT(atomic_read(&pag->pag_ref) == 0);
260 kmem_free(pag);
261}
262
232/* 263/*
233 * Free up the resources associated with a mount structure. Assume that 264 * Free up the per-ag resources associated with the mount structure.
234 * the structure was initially zeroed, so we can tell which fields got
235 * initialized.
236 */ 265 */
237STATIC void 266STATIC void
238xfs_free_perag( 267xfs_free_perag(
@@ -244,10 +273,10 @@ xfs_free_perag(
244 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 273 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
245 spin_lock(&mp->m_perag_lock); 274 spin_lock(&mp->m_perag_lock);
246 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
276 spin_unlock(&mp->m_perag_lock);
247 ASSERT(pag); 277 ASSERT(pag);
248 ASSERT(atomic_read(&pag->pag_ref) == 0); 278 ASSERT(atomic_read(&pag->pag_ref) == 0);
249 spin_unlock(&mp->m_perag_lock); 279 call_rcu(&pag->rcu_head, __xfs_free_perag);
250 kmem_free(pag);
251 } 280 }
252} 281}
253 282
@@ -443,8 +472,11 @@ xfs_initialize_perag(
443 goto out_unwind; 472 goto out_unwind;
444 pag->pag_agno = index; 473 pag->pag_agno = index;
445 pag->pag_mount = mp; 474 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock);
479 pag->pag_buf_tree = RB_ROOT;
448 480
449 if (radix_tree_preload(GFP_NOFS)) 481 if (radix_tree_preload(GFP_NOFS))
450 goto out_unwind; 482 goto out_unwind;
@@ -639,7 +671,6 @@ int
639xfs_readsb(xfs_mount_t *mp, int flags) 671xfs_readsb(xfs_mount_t *mp, int flags)
640{ 672{
641 unsigned int sector_size; 673 unsigned int sector_size;
642 unsigned int extra_flags;
643 xfs_buf_t *bp; 674 xfs_buf_t *bp;
644 int error; 675 int error;
645 676
@@ -652,28 +683,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
652 * access to the superblock. 683 * access to the superblock.
653 */ 684 */
654 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 685 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
655 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
656 686
657 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 687reread:
658 extra_flags); 688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
659 if (!bp || XFS_BUF_ISERROR(bp)) { 689 XFS_SB_DADDR, sector_size, 0);
660 xfs_fs_mount_cmn_err(flags, "SB read failed"); 690 if (!bp) {
661 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; 691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
662 goto fail; 692 return EIO;
663 } 693 }
664 ASSERT(XFS_BUF_ISBUSY(bp));
665 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
666 694
667 /* 695 /*
668 * Initialize the mount structure from the superblock. 696 * Initialize the mount structure from the superblock.
669 * But first do some basic consistency checking. 697 * But first do some basic consistency checking.
670 */ 698 */
671 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
672
673 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
674 if (error) { 701 if (error) {
675 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 702 xfs_fs_mount_cmn_err(flags, "SB validate failed");
676 goto fail; 703 goto release_buf;
677 } 704 }
678 705
679 /* 706 /*
@@ -684,7 +711,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
684 "device supports only %u byte sectors (not %u)", 711 "device supports only %u byte sectors (not %u)",
685 sector_size, mp->m_sb.sb_sectsize); 712 sector_size, mp->m_sb.sb_sectsize);
686 error = ENOSYS; 713 error = ENOSYS;
687 goto fail; 714 goto release_buf;
688 } 715 }
689 716
690 /* 717 /*
@@ -692,33 +719,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
692 * re-read the superblock so the buffer is correctly sized. 719 * re-read the superblock so the buffer is correctly sized.
693 */ 720 */
694 if (sector_size < mp->m_sb.sb_sectsize) { 721 if (sector_size < mp->m_sb.sb_sectsize) {
695 XFS_BUF_UNMANAGE(bp);
696 xfs_buf_relse(bp); 722 xfs_buf_relse(bp);
697 sector_size = mp->m_sb.sb_sectsize; 723 sector_size = mp->m_sb.sb_sectsize;
698 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, 724 goto reread;
699 BTOBB(sector_size), extra_flags);
700 if (!bp || XFS_BUF_ISERROR(bp)) {
701 xfs_fs_mount_cmn_err(flags, "SB re-read failed");
702 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
703 goto fail;
704 }
705 ASSERT(XFS_BUF_ISBUSY(bp));
706 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
707 } 725 }
708 726
709 /* Initialize per-cpu counters */ 727 /* Initialize per-cpu counters */
710 xfs_icsb_reinit_counters(mp); 728 xfs_icsb_reinit_counters(mp);
711 729
712 mp->m_sb_bp = bp; 730 mp->m_sb_bp = bp;
713 xfs_buf_relse(bp); 731 xfs_buf_unlock(bp);
714 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
715 return 0; 732 return 0;
716 733
717 fail: 734release_buf:
718 if (bp) { 735 xfs_buf_relse(bp);
719 XFS_BUF_UNMANAGE(bp);
720 xfs_buf_relse(bp);
721 }
722 return error; 736 return error;
723} 737}
724 738
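
The reread loop above replaces the old open-coded second read: read the superblock with a provisional sector size, and if the superblock reports a larger one, release the buffer and go around again at the correct size. A standalone userspace sketch of the pattern (hypothetical helpers and sizes, not XFS code):

/* Userspace sketch of the "reread" pattern above (not XFS code). */
#include <stdio.h>
#include <stdlib.h>

struct buf { unsigned size; };

static unsigned device_sb_sectsize = 4096;	/* pretend on-disk value */

static struct buf *read_sb(unsigned sector_size)
{
	struct buf *bp = malloc(sizeof(*bp));

	bp->size = sector_size;
	return bp;
}

static struct buf *read_superblock(unsigned dev_sector_size)
{
	unsigned sector_size = dev_sector_size;
	struct buf *bp;

reread:
	bp = read_sb(sector_size);
	if (bp->size < device_sb_sectsize) {
		/* buffer too small for the real sector size: try again */
		sector_size = device_sb_sectsize;
		free(bp);
		goto reread;
	}
	return bp;
}

int main(void)
{
	struct buf *bp = read_superblock(512);

	printf("superblock buffer sized %u bytes\n", bp->size);
	free(bp);
	return 0;
}
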
@@ -961,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
961} 975}
962 976
963/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
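
Each threshold is simply (i + 1) percent of the data block count, with the division done first exactly as do_div() does it above. A quick userspace check of the arithmetic (XFS_LOWSP_MAX assumed to be 5; not XFS code):

/* Userspace sketch of the threshold precalculation (not XFS code). */
#include <stdint.h>
#include <stdio.h>

#define LOWSP_MAX 5	/* assumed analogue of XFS_LOWSP_MAX */

int main(void)
{
	uint64_t dblocks = 26214400ULL;	/* e.g. 100 GiB of 4k blocks */
	uint64_t low_space[LOWSP_MAX];

	for (int i = 0; i < LOWSP_MAX; i++) {
		uint64_t space = dblocks / 100;	/* do_div(space, 100) */

		low_space[i] = space * (i + 1);
		printf("threshold %d%%: %llu blocks\n", i + 1,
		       (unsigned long long)low_space[i]);
	}
	return 0;
}
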
994
995/*
964 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
965 */ 997 */
966STATIC void 998STATIC void
@@ -991,42 +1023,35 @@ xfs_check_sizes(xfs_mount_t *mp)
991{ 1023{
992 xfs_buf_t *bp; 1024 xfs_buf_t *bp;
993 xfs_daddr_t d; 1025 xfs_daddr_t d;
994 int error;
995 1026
996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
998 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1029 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
999 return XFS_ERROR(EFBIG); 1030 return XFS_ERROR(EFBIG);
1000 } 1031 }
1001 error = xfs_read_buf(mp, mp->m_ddev_targp, 1032 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1002 d - XFS_FSS_TO_BB(mp, 1), 1033 d - XFS_FSS_TO_BB(mp, 1),
1003 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1034 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1004 if (!error) { 1035 if (!bp) {
1005 xfs_buf_relse(bp); 1036 cmn_err(CE_WARN, "XFS: last sector read failed");
1006 } else { 1037 return EIO;
1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1008 if (error == ENOSPC)
1009 error = XFS_ERROR(EFBIG);
1010 return error;
1011 } 1038 }
1039 xfs_buf_relse(bp);
1012 1040
1013 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1041 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1042 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1043 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1016 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1044 cmn_err(CE_WARN, "XFS: log size mismatch detected");
1017 return XFS_ERROR(EFBIG); 1045 return XFS_ERROR(EFBIG);
1018 } 1046 }
1019 error = xfs_read_buf(mp, mp->m_logdev_targp, 1047 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1020 d - XFS_FSB_TO_BB(mp, 1), 1048 d - XFS_FSB_TO_BB(mp, 1),
1021 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1049 XFS_FSB_TO_B(mp, 1), 0);
1022 if (!error) { 1050 if (!bp) {
1023 xfs_buf_relse(bp); 1051 cmn_err(CE_WARN, "XFS: log device read failed");
1024 } else { 1052 return EIO;
1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1026 if (error == ENOSPC)
1027 error = XFS_ERROR(EFBIG);
1028 return error;
1029 } 1053 }
1054 xfs_buf_relse(bp);
1030 } 1055 }
1031 return 0; 1056 return 0;
1032} 1057}
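
The uncached reads above boil down to proving the device is as large as the superblock claims by touching its last sector. A userspace analogue against an ordinary file (illustrative only, not XFS code):

/* Userspace sketch of the size check above (not XFS code): verify a
 * device or file really has the claimed size by reading its very last
 * 512-byte sector. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int check_size(const char *path, off_t claimed_size)
{
	char sector[512];
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, sector, sizeof(sector),
		  claimed_size - (off_t)sizeof(sector)) != sizeof(sector)) {
		fprintf(stderr, "last sector read failed\n");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(int argc, char **argv)
{
	if (argc == 3)
		return check_size(argv[1], atoll(argv[2])) ? 1 : 0;
	return 0;
}
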
@@ -1189,6 +1214,9 @@ xfs_mountfs(
1189 */ 1214 */
1190 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1191 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1192 /* 1220 /*
1193 * Set the inode cluster size. 1221 * Set the inode cluster size.
1194 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
@@ -1601,7 +1629,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1601 XFS_BUF_UNASYNC(sbp); 1629 XFS_BUF_UNASYNC(sbp);
1602 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1630 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1603 xfsbdstrat(mp, sbp); 1631 xfsbdstrat(mp, sbp);
1604 error = xfs_iowait(sbp); 1632 error = xfs_buf_iowait(sbp);
1605 if (error) 1633 if (error)
1606 xfs_ioerror_alert("xfs_unmountfs_writesb", 1634 xfs_ioerror_alert("xfs_unmountfs_writesb",
1607 mp, sbp, XFS_BUF_ADDR(sbp)); 1635 mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1832,135 +1860,72 @@ xfs_mod_incore_sb_unlocked(
1832 */ 1860 */
1833int 1861int
1834xfs_mod_incore_sb( 1862xfs_mod_incore_sb(
1835 xfs_mount_t *mp, 1863 struct xfs_mount *mp,
1836 xfs_sb_field_t field, 1864 xfs_sb_field_t field,
1837 int64_t delta, 1865 int64_t delta,
1838 int rsvd) 1866 int rsvd)
1839{ 1867{
1840 int status; 1868 int status;
1841 1869
1842 /* check for per-cpu counters */
1843 switch (field) {
1844#ifdef HAVE_PERCPU_SB 1870#ifdef HAVE_PERCPU_SB
1845 case XFS_SBS_ICOUNT: 1871 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1846 case XFS_SBS_IFREE:
1847 case XFS_SBS_FDBLOCKS:
1848 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1849 status = xfs_icsb_modify_counters(mp, field,
1850 delta, rsvd);
1851 break;
1852 }
1853 /* FALLTHROUGH */
1854#endif 1872#endif
1855 default: 1873 spin_lock(&mp->m_sb_lock);
1856 spin_lock(&mp->m_sb_lock); 1874 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1857 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1875 spin_unlock(&mp->m_sb_lock);
1858 spin_unlock(&mp->m_sb_lock);
1859 break;
1860 }
1861 1876
1862 return status; 1877 return status;
1863} 1878}
1864 1879
1865/* 1880/*
1866 * xfs_mod_incore_sb_batch() is used to change more than one field 1881 * Change more than one field in the in-core superblock structure at a time.
1867 * in the in-core superblock structure at a time. This modification 1882 *
1868 * is protected by a lock internal to this module. The fields and 1883 * The fields and changes to those fields are specified in the array of
1869 * changes to those fields are specified in the array of xfs_mod_sb 1884 * xfs_mod_sb structures passed in. Either all of the specified deltas
1870 * structures passed in. 1885 * will be applied or none of them will. If any modified field dips below 0,
1886 * then all modifications will be backed out and EINVAL will be returned.
1871 * 1887 *
1872 * Either all of the specified deltas will be applied or none of 1888 * Note that this function may not be used for the superblock values that
1873 * them will. If any modified field dips below 0, then all modifications 1889 * are tracked with the in-memory per-cpu counters - a direct call to
1874 * will be backed out and EINVAL will be returned. 1890 * xfs_icsb_modify_counters is required for these.
1875 */ 1891 */
1876int 1892int
1877xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) 1893xfs_mod_incore_sb_batch(
1894 struct xfs_mount *mp,
1895 xfs_mod_sb_t *msb,
1896 uint nmsb,
1897 int rsvd)
1878{ 1898{
1879 int status=0; 1899 xfs_mod_sb_t *msbp = &msb[0];
1880 xfs_mod_sb_t *msbp; 1900 int error = 0;
1881 1901
1882 /* 1902 /*
1883 * Loop through the array of mod structures and apply each 1903 * Loop through the array of mod structures and apply each individually.
1884 * individually. If any fail, then back out all those 1904 * If any fail, then back out all those which have already been applied.
1885 * which have already been applied. Do all of this within 1905 * Do all of this within the scope of the m_sb_lock so that all of the
1886 * the scope of the m_sb_lock so that all of the changes will 1906 * changes will be atomic.
1887 * be atomic.
1888 */ 1907 */
1889 spin_lock(&mp->m_sb_lock); 1908 spin_lock(&mp->m_sb_lock);
1890 msbp = &msb[0];
1891 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1909 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
1892 /* 1910 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1893 * Apply the delta at index n. If it fails, break 1911 msbp->msb_field > XFS_SBS_FDBLOCKS);
1894 * from the loop so we'll fall into the undo loop
1895 * below.
1896 */
1897 switch (msbp->msb_field) {
1898#ifdef HAVE_PERCPU_SB
1899 case XFS_SBS_ICOUNT:
1900 case XFS_SBS_IFREE:
1901 case XFS_SBS_FDBLOCKS:
1902 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1903 spin_unlock(&mp->m_sb_lock);
1904 status = xfs_icsb_modify_counters(mp,
1905 msbp->msb_field,
1906 msbp->msb_delta, rsvd);
1907 spin_lock(&mp->m_sb_lock);
1908 break;
1909 }
1910 /* FALLTHROUGH */
1911#endif
1912 default:
1913 status = xfs_mod_incore_sb_unlocked(mp,
1914 msbp->msb_field,
1915 msbp->msb_delta, rsvd);
1916 break;
1917 }
1918 1912
1919 if (status != 0) { 1913 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1920 break; 1914 msbp->msb_delta, rsvd);
1921 } 1915 if (error)
1916 goto unwind;
1922 } 1917 }
1918 spin_unlock(&mp->m_sb_lock);
1919 return 0;
1923 1920
1924 /* 1921unwind:
1925 * If we didn't complete the loop above, then back out 1922 while (--msbp >= msb) {
1926 * any changes made to the superblock. If you add code 1923 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1927 * between the loop above and here, make sure that you 1924 -msbp->msb_delta, rsvd);
1928 * preserve the value of status. Loop back until 1925 ASSERT(error == 0);
1929 * we step below the beginning of the array. Make sure
1930 * we don't touch anything back there.
1931 */
1932 if (status != 0) {
1933 msbp--;
1934 while (msbp >= msb) {
1935 switch (msbp->msb_field) {
1936#ifdef HAVE_PERCPU_SB
1937 case XFS_SBS_ICOUNT:
1938 case XFS_SBS_IFREE:
1939 case XFS_SBS_FDBLOCKS:
1940 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1941 spin_unlock(&mp->m_sb_lock);
1942 status = xfs_icsb_modify_counters(mp,
1943 msbp->msb_field,
1944 -(msbp->msb_delta),
1945 rsvd);
1946 spin_lock(&mp->m_sb_lock);
1947 break;
1948 }
1949 /* FALLTHROUGH */
1950#endif
1951 default:
1952 status = xfs_mod_incore_sb_unlocked(mp,
1953 msbp->msb_field,
1954 -(msbp->msb_delta),
1955 rsvd);
1956 break;
1957 }
1958 ASSERT(status == 0);
1959 msbp--;
1960 }
1961 } 1926 }
1962 spin_unlock(&mp->m_sb_lock); 1927 spin_unlock(&mp->m_sb_lock);
1963 return status; 1928 return error;
1964} 1929}
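The rewritten batch function reduces to a reusable apply-or-unwind idiom: walk the array forward applying deltas, and on the first failure walk the same pointer backwards applying the negated deltas. A generic sketch of the idiom (all types and names hypothetical):

#include <errno.h>

struct mod {
	long *counter;
	long delta;
};

/* Apply a delta, refusing to let the counter go negative. */
static int apply(struct mod *m, long delta)
{
	if (*m->counter + delta < 0)
		return -EINVAL;
	*m->counter += delta;
	return 0;
}

/* Apply all deltas or none: on failure, back out in reverse order. */
static int apply_batch(struct mod *mods, unsigned n)
{
	struct mod *m;
	int error = 0;

	for (m = mods; m < mods + n; m++) {
		error = apply(m, m->delta);
		if (error)
			goto unwind;
	}
	return 0;

unwind:
	while (--m >= mods)
		apply(m, -m->delta);
	return error;
}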
1965 1930
1966/* 1931/*
@@ -1998,18 +1963,13 @@ xfs_getsb(
1998 */ 1963 */
1999void 1964void
2000xfs_freesb( 1965xfs_freesb(
2001 xfs_mount_t *mp) 1966 struct xfs_mount *mp)
2002{ 1967{
2003 xfs_buf_t *bp; 1968 struct xfs_buf *bp = mp->m_sb_bp;
2004 1969
2005 /* 1970 xfs_buf_lock(bp);
2006 * Use xfs_getsb() so that the buffer will be locked
2007 * when we call xfs_buf_relse().
2008 */
2009 bp = xfs_getsb(mp, 0);
2010 XFS_BUF_UNMANAGE(bp);
2011 xfs_buf_relse(bp);
2012 mp->m_sb_bp = NULL; 1971 mp->m_sb_bp = NULL;
1972 xfs_buf_relse(bp);
2013} 1973}
2014 1974
2015/* 1975/*
@@ -2496,7 +2456,7 @@ xfs_icsb_balance_counter(
2496 spin_unlock(&mp->m_sb_lock); 2456 spin_unlock(&mp->m_sb_lock);
2497} 2457}
2498 2458
2499STATIC int 2459int
2500xfs_icsb_modify_counters( 2460xfs_icsb_modify_counters(
2501 xfs_mount_t *mp, 2461 xfs_mount_t *mp,
2502 xfs_sb_field_t field, 2462 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
53 53
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct cred;
57struct log; 56struct log;
58struct xfs_mount_args; 57struct xfs_mount_args;
59struct xfs_inode; 58struct xfs_inode;
@@ -91,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *);
91extern void xfs_icsb_destroy_counters(struct xfs_mount *); 90extern void xfs_icsb_destroy_counters(struct xfs_mount *);
92extern void xfs_icsb_sync_counters(struct xfs_mount *, int); 91extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
93extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); 92extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
93extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
94 int64_t, int);
94 95
95#else 96#else
96#define xfs_icsb_init_counters(mp) (0) 97#define xfs_icsb_init_counters(mp) (0)
@@ -98,8 +99,20 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
98#define xfs_icsb_reinit_counters(mp) do { } while (0) 99#define xfs_icsb_reinit_counters(mp) do { } while (0)
99#define xfs_icsb_sync_counters(mp, flags) do { } while (0) 100#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
100#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 101#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
102#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
103 xfs_mod_incore_sb(mp, field, delta, rsvd)
101#endif 104#endif
102 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
103typedef struct xfs_mount { 116typedef struct xfs_mount {
104 struct super_block *m_super; 117 struct super_block *m_super;
105 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -199,6 +212,8 @@ typedef struct xfs_mount {
199 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
200 on the next remount,rw */ 213 on the next remount,rw */
201 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
202} xfs_mount_t; 217} xfs_mount_t;
203 218
204/* 219/*
@@ -232,8 +247,6 @@ typedef struct xfs_mount {
232#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ 247#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
233#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred 248#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
234 * I/O size in stat() */ 249 * I/O size in stat() */
235#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
236 counters */
237#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams 250#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
238 allocator */ 251 allocator */
239#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 252#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
@@ -327,6 +340,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327 * perag get/put wrappers for ref counting 340 * perag get/put wrappers for ref counting
328 */ 341 */
329struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); 342struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
343struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
344 int tag);
330void xfs_perag_put(struct xfs_perag *pag); 345void xfs_perag_put(struct xfs_perag *pag);
331 346
332/* 347/*
@@ -376,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
376 391
377extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
378 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
379#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
380 397
381extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..edfa178bafb6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
 356 struct xfs_dquot *gdqp, long nblks, long ninos, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
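Converting the CONFIG_XFS_QUOTA=n stubs from "(0)" macros to static inlines matters because a macro discards its arguments unevaluated and unchecked, while an inline keeps type checking and marks the arguments as used. A toy demonstration of the difference (hypothetical names):

/* Macro stub: arguments vanish before the compiler sees them. */
#define quota_reserve_macro(tp, nblks)	(0)

/* Inline stub: arguments are still parsed, type-checked and "used". */
static inline int quota_reserve_inline(void *tp, long nblks)
{
	return 0;
}

int example(void *tp)
{
	long nblks = 16;

	/* The misspelled identifier below still compiles - the macro never
	 * evaluates its arguments. The inline call would reject it. */
	return quota_reserve_macro(tp, nblks_typo) +
	       quota_reserve_inline(tp, nblks);
}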
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_REFCACHE_H__
19#define __XFS_REFCACHE_H__
20
21#ifdef HAVE_REFCACHE
22/*
23 * Maximum size (in inodes) for the NFS reference cache
24 */
25#define XFS_REFCACHE_SIZE_MAX 512
26
27struct xfs_inode;
28struct xfs_mount;
29
30extern void xfs_refcache_insert(struct xfs_inode *);
31extern void xfs_refcache_purge_ip(struct xfs_inode *);
32extern void xfs_refcache_purge_mp(struct xfs_mount *);
33extern void xfs_refcache_purge_some(struct xfs_mount *);
34extern void xfs_refcache_resize(int);
35extern void xfs_refcache_destroy(void);
36
37extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
38
39#else
40
41#define xfs_refcache_insert(ip) do { } while (0)
42#define xfs_refcache_purge_ip(ip) do { } while (0)
43#define xfs_refcache_purge_mp(mp) do { } while (0)
44#define xfs_refcache_purge_some(mp) do { } while (0)
45#define xfs_refcache_resize(size) do { } while (0)
46#define xfs_refcache_destroy() do { } while (0)
47
48#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
49
50#endif
51
52#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
183 * tree quota mechanism would be circumvented. 183 * tree quota mechanism would be circumvented.
184 */ 184 */
185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
186 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { 186 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
187 error = XFS_ERROR(EXDEV); 187 error = XFS_ERROR(EXDEV);
188 goto error_return; 188 goto error_return;
189 } 189 }
@@ -211,7 +211,9 @@ xfs_rename(
211 goto error_return; 211 goto error_return;
212 if (error) 212 if (error)
213 goto abort_return; 213 goto abort_return;
214 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 214
215 xfs_trans_ichgtime(tp, target_dp,
216 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
215 217
216 if (new_parent && src_is_directory) { 218 if (new_parent && src_is_directory) {
217 error = xfs_bumplink(tp, target_dp); 219 error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
249 &first_block, &free_list, spaceres); 251 &first_block, &free_list, spaceres);
250 if (error) 252 if (error)
251 goto abort_return; 253 goto abort_return;
252 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 254
255 xfs_trans_ichgtime(tp, target_dp,
256 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
253 257
254 /* 258 /*
255 * Decrement the link count on the target since the target 259 * Decrement the link count on the target since the target
@@ -292,7 +296,8 @@ xfs_rename(
292 * inode isn't really being changed, but old unix file systems did 296 * inode isn't really being changed, but old unix file systems did
293 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
294 */ 298 */
295 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
296 301
297 /* 302 /*
298 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when
@@ -315,7 +320,7 @@ xfs_rename(
315 if (error) 320 if (error)
316 goto abort_return; 321 goto abort_return;
317 322
318 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 323 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
319 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 324 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
320 if (new_parent) 325 if (new_parent)
321 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 326 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..12a191385310 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_buf.h"
42 43
43 44
44/* 45/*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
1883 /* 1884 /*
1884 * Read in the last block of the device, make sure it exists. 1885 * Read in the last block of the device, make sure it exists.
1885 */ 1886 */
1886 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1887 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
1887 XFS_FSB_TO_BB(mp, nrblocks - 1), 1888 XFS_FSB_TO_BB(mp, nrblocks - 1),
1888 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1889 XFS_FSB_TO_B(mp, 1), 0);
1889 if (error) 1890 if (!bp)
1890 return error; 1891 return EIO;
1891 ASSERT(bp);
1892 xfs_buf_relse(bp); 1892 xfs_buf_relse(bp);
1893
1893 /* 1894 /*
1894 * Calculate new parameters. These are the final values to be reached. 1895 * Calculate new parameters. These are the final values to be reached.
1895 */ 1896 */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
2215{ 2216{
2216 xfs_buf_t *bp; /* buffer for last block of subvolume */ 2217 xfs_buf_t *bp; /* buffer for last block of subvolume */
2217 xfs_daddr_t d; /* address of last block of subvolume */ 2218 xfs_daddr_t d; /* address of last block of subvolume */
2218 int error; /* error return value */
2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2220 2220
2221 sbp = &mp->m_sb; 2221 sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2243 return XFS_ERROR(EFBIG);
2244 } 2244 }
2245 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
2246 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_BB(mp, 1), 0, &bp); 2247 XFS_FSB_TO_B(mp, 1), 0);
2248 if (error) { 2248 if (!bp) {
2249 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN, "XFS: realtime device size check failed");
2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 return EIO;
2251 if (error == ENOSPC)
2252 return XFS_ERROR(EFBIG);
2253 return error;
2254 } 2251 }
2255 xfs_buf_relse(bp); 2252 xfs_buf_relse(bp);
2256 return 0; 2253 return 0;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
83 84
84#define XFS_SB_VERSION2_OKREALFBITS \ 85#define XFS_SB_VERSION2_OKREALFBITS \
85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
86 XFS_SB_VERSION2_ATTR2BIT) 87 XFS_SB_VERSION2_ATTR2BIT | \
88 XFS_SB_VERSION2_PROJID32BIT)
87#define XFS_SB_VERSION2_OKSASHFBITS \ 89#define XFS_SB_VERSION2_OKSASHFBITS \
88 (0) 90 (0)
89#define XFS_SB_VERSION2_OKREALBITS \ 91#define XFS_SB_VERSION2_OKREALBITS \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
495 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 497 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
496} 498}
497 499
500static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
501{
502 return xfs_sb_version_hasmorebits(sbp) &&
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504}
505
498/* 506/*
499 * end of superblock version macros 507 * end of superblock version macros
500 */ 508 */
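The new PROJID32BIT bit follows the usual features2 pattern: define a flag, add it to the OKREALFBITS mask so mount-time validation accepts it, and expose a has-predicate that also requires the MOREBITS version bit. A reduced sketch of that pattern (all names hypothetical, not the real XFS layout):

#include <stdbool.h>
#include <stdint.h>

#define VERSION_MOREBITS	(1u << 15)	/* features2 field is valid */
#define FEAT2_LAZYSBCOUNT	0x00000002u
#define FEAT2_PROJID32BIT	0x00000080u
#define FEAT2_KNOWN		(FEAT2_LAZYSBCOUNT | FEAT2_PROJID32BIT)

struct sb {
	uint16_t versionnum;
	uint32_t features2;
};

/* Reject superblocks advertising feature bits this code doesn't know. */
bool sb_features_ok(const struct sb *sb)
{
	if (!(sb->versionnum & VERSION_MOREBITS))
		return true;			/* no features2 to validate */
	return (sb->features2 & ~FEAT2_KNOWN) == 0;
}

bool sb_has_projid32bit(const struct sb *sb)
{
	return (sb->versionnum & VERSION_MOREBITS) &&
	       (sb->features2 & FEAT2_PROJID32BIT);
}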
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -696,7 +696,7 @@ xfs_trans_reserve(
696 * fail if the count would go below zero. 696 * fail if the count would go below zero.
697 */ 697 */
698 if (blocks > 0) { 698 if (blocks > 0) {
699 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 699 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
700 -((int64_t)blocks), rsvd); 700 -((int64_t)blocks), rsvd);
701 if (error != 0) { 701 if (error != 0) {
702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +767,7 @@ undo_log:
767 767
768undo_blocks: 768undo_blocks:
769 if (blocks > 0) { 769 if (blocks > 0) {
770 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 770 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
771 (int64_t)blocks, rsvd); 771 (int64_t)blocks, rsvd);
772 tp->t_blk_res = 0; 772 tp->t_blk_res = 0;
773 } 773 }
@@ -1009,7 +1009,7 @@ void
1009xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
1010 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
1011{ 1011{
1012 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 1012 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
1013 xfs_mod_sb_t *msbp; 1013 xfs_mod_sb_t *msbp;
1014 xfs_mount_t *mp = tp->t_mountp; 1014 xfs_mount_t *mp = tp->t_mountp;
1015 /* REFERENCED */ 1015 /* REFERENCED */
@@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
1017 int rsvd; 1017 int rsvd;
1018 int64_t blkdelta = 0; 1018 int64_t blkdelta = 0;
1019 int64_t rtxdelta = 0; 1019 int64_t rtxdelta = 0;
1020 int64_t idelta = 0;
1021 int64_t ifreedelta = 0;
1020 1022
1021 msbp = msb; 1023 msbp = msb;
1022 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 1024 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
1023 1025
1024 /* calculate free blocks delta */ 1026 /* calculate deltas */
1025 if (tp->t_blk_res > 0) 1027 if (tp->t_blk_res > 0)
1026 blkdelta = tp->t_blk_res; 1028 blkdelta = tp->t_blk_res;
1027
1028 if ((tp->t_fdblocks_delta != 0) && 1029 if ((tp->t_fdblocks_delta != 0) &&
1029 (xfs_sb_version_haslazysbcount(&mp->m_sb) || 1030 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1030 (tp->t_flags & XFS_TRANS_SB_DIRTY))) 1031 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
1031 blkdelta += tp->t_fdblocks_delta; 1032 blkdelta += tp->t_fdblocks_delta;
1032 1033
1033 if (blkdelta != 0) {
1034 msbp->msb_field = XFS_SBS_FDBLOCKS;
1035 msbp->msb_delta = blkdelta;
1036 msbp++;
1037 }
1038
1039 /* calculate free realtime extents delta */
1040 if (tp->t_rtx_res > 0) 1034 if (tp->t_rtx_res > 0)
1041 rtxdelta = tp->t_rtx_res; 1035 rtxdelta = tp->t_rtx_res;
1042
1043 if ((tp->t_frextents_delta != 0) && 1036 if ((tp->t_frextents_delta != 0) &&
1044 (tp->t_flags & XFS_TRANS_SB_DIRTY)) 1037 (tp->t_flags & XFS_TRANS_SB_DIRTY))
1045 rtxdelta += tp->t_frextents_delta; 1038 rtxdelta += tp->t_frextents_delta;
1046 1039
1040 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1041 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1042 idelta = tp->t_icount_delta;
1043 ifreedelta = tp->t_ifree_delta;
1044 }
1045
1046 /* apply the per-cpu counters */
1047 if (blkdelta) {
1048 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
1049 blkdelta, rsvd);
1050 if (error)
1051 goto out;
1052 }
1053
1054 if (idelta) {
1055 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
1056 idelta, rsvd);
1057 if (error)
1058 goto out_undo_fdblocks;
1059 }
1060
1061 if (ifreedelta) {
1062 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
1063 ifreedelta, rsvd);
1064 if (error)
1065 goto out_undo_icount;
1066 }
1067
1068 /* apply remaining deltas */
1047 if (rtxdelta != 0) { 1069 if (rtxdelta != 0) {
1048 msbp->msb_field = XFS_SBS_FREXTENTS; 1070 msbp->msb_field = XFS_SBS_FREXTENTS;
1049 msbp->msb_delta = rtxdelta; 1071 msbp->msb_delta = rtxdelta;
1050 msbp++; 1072 msbp++;
1051 } 1073 }
1052 1074
1053 /* apply remaining deltas */
1054
1055 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1056 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1057 if (tp->t_icount_delta != 0) {
1058 msbp->msb_field = XFS_SBS_ICOUNT;
1059 msbp->msb_delta = tp->t_icount_delta;
1060 msbp++;
1061 }
1062 if (tp->t_ifree_delta != 0) {
1063 msbp->msb_field = XFS_SBS_IFREE;
1064 msbp->msb_delta = tp->t_ifree_delta;
1065 msbp++;
1066 }
1067 }
1068
1069 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 1075 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
1070 if (tp->t_dblocks_delta != 0) { 1076 if (tp->t_dblocks_delta != 0) {
1071 msbp->msb_field = XFS_SBS_DBLOCKS; 1077 msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb(
1115 if (msbp > msb) { 1121 if (msbp > msb) {
1116 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, 1122 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
1117 (uint)(msbp - msb), rsvd); 1123 (uint)(msbp - msb), rsvd);
1118 ASSERT(error == 0); 1124 if (error)
1125 goto out_undo_ifreecount;
1119 } 1126 }
1127
1128 return;
1129
1130out_undo_ifreecount:
1131 if (ifreedelta)
1132 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
1133out_undo_icount:
1134 if (idelta)
1135 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
1136out_undo_fdblocks:
1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out:
1140 ASSERT(error == 0);
1141 return;
1120} 1142}
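The reworked function applies the per-cpu counter deltas one at a time and rolls back in reverse through a chain of goto labels, the standard kernel error-unwind pattern. A generic sketch (hypothetical counters):

#include <errno.h>

static long a_count, b_count, c_count;

static int mod(long *ctr, long delta)
{
	if (*ctr + delta < 0)
		return -ENOSPC;
	*ctr += delta;
	return 0;
}

/* Apply three deltas; on failure undo, in reverse, only what succeeded. */
static int apply_three(long da, long db, long dc)
{
	int error;

	error = mod(&a_count, da);
	if (error)
		goto out;
	error = mod(&b_count, db);
	if (error)
		goto out_undo_a;
	error = mod(&c_count, dc);
	if (error)
		goto out_undo_b;
	return 0;

out_undo_b:
	mod(&b_count, -db);
out_undo_a:
	mod(&a_count, -da);
out:
	return error;
}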
1121 1143
1122/* 1144/*
@@ -1328,7 +1350,7 @@ xfs_trans_fill_vecs(
1328 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1329 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1330 */ 1352 */
1331void 1353static void
1332xfs_trans_item_committed( 1354xfs_trans_item_committed(
1333 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1334 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1389,15 +1411,12 @@ xfs_trans_item_committed(
1389 */ 1411 */
1390STATIC void 1412STATIC void
1391xfs_trans_committed( 1413xfs_trans_committed(
1392 struct xfs_trans *tp, 1414 void *arg,
1393 int abortflag) 1415 int abortflag)
1394{ 1416{
1417 struct xfs_trans *tp = arg;
1395 struct xfs_log_item_desc *lidp, *next; 1418 struct xfs_log_item_desc *lidp, *next;
1396 1419
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { 1420 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); 1421 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp); 1422 xfs_trans_free_item_desc(lidp);
@@ -1406,21 +1425,120 @@ xfs_trans_committed(
1406 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1407} 1426}
1408 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1445/*
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
 1452 * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1457 */
1458void
1459xfs_trans_committed_bulk(
1460 struct xfs_ail *ailp,
1461 struct xfs_log_vec *log_vector,
1462 xfs_lsn_t commit_lsn,
1463 int aborted)
1464{
1465#define LOG_ITEM_BATCH_SIZE 32
1466 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1467 struct xfs_log_vec *lv;
1468 int i = 0;
1469
1470 /* unpin all the log items */
 1471 for (lv = log_vector; lv; lv = lv->lv_next) {
1472 struct xfs_log_item *lip = lv->lv_item;
1473 xfs_lsn_t item_lsn;
1474
1475 if (aborted)
1476 lip->li_flags |= XFS_LI_ABORTED;
1477 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1478
1479 /* item_lsn of -1 means the item was freed */
1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1481 continue;
1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1493 if (item_lsn != commit_lsn) {
1494
1495 /*
 1496 * Not a bulk update candidate due to an unusual item_lsn.
1497 * Push into AIL immediately, rechecking the lsn once
1498 * we have the ail lock. Then unpin the item.
1499 */
1500 spin_lock(&ailp->xa_lock);
1501 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1502 xfs_trans_ail_update(ailp, lip, item_lsn);
1503 else
1504 spin_unlock(&ailp->xa_lock);
1505 IOP_UNPIN(lip, 0);
1506 continue;
1507 }
1508
1509 /* Item is a candidate for bulk AIL insert. */
1510 log_items[i++] = lv->lv_item;
1511 if (i >= LOG_ITEM_BATCH_SIZE) {
1512 xfs_log_item_batch_insert(ailp, log_items,
1513 LOG_ITEM_BATCH_SIZE, commit_lsn);
1514 i = 0;
1515 }
1516 }
1517
1518 /* make sure we insert the remainder! */
1519 if (i)
1520 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1521}
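The loop above shows a general batching shape: accumulate item pointers into a fixed-size array, flush a full batch under one lock round trip, and remember to flush the remainder at the end. A reduced sketch of that skeleton (hypothetical item type):

#define BATCH_SIZE 32	/* mirrors LOG_ITEM_BATCH_SIZE above */

struct item { int id; };

/* Stand-in for xfs_log_item_batch_insert: one lock round trip per call. */
static void process_batch(struct item **batch, int n)
{
	/* lock(); handle batch[0..n-1]; unlock(); */
	(void)batch;
	(void)n;
}

/* Accumulate pointers, flush full batches, then flush the remainder. */
static void process_all(struct item **items, int count)
{
	struct item *batch[BATCH_SIZE];
	int i, n = 0;

	for (i = 0; i < count; i++) {
		batch[n++] = items[i];
		if (n >= BATCH_SIZE) {
			process_batch(batch, BATCH_SIZE);
			n = 0;
		}
	}
	if (n)	/* make sure we flush the remainder! */
		process_batch(batch, n);
}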
1522
1409/* 1523/*
1410 * Called from the trans_commit code when we notice that 1524 * Called from the trans_commit code when we notice that the filesystem is in
1411 * the filesystem is in the middle of a forced shutdown. 1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1412 */ 1533 */
1413STATIC void 1534STATIC void
1414xfs_trans_uncommit( 1535xfs_trans_uncommit(
1415 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1416 uint flags) 1537 uint flags)
1417{ 1538{
1418 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1419 1540
1420 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1421 /*
1422 * Unpin all but those that aren't dirty.
1423 */
1424 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1425 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1426 } 1544 }
@@ -1525,7 +1643,7 @@ xfs_trans_commit_iclog(
1525 * running in simulation mode (the log is explicitly turned 1643 * running in simulation mode (the log is explicitly turned
1526 * off). 1644 * off).
1527 */ 1645 */
1528 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; 1646 tp->t_logcb.cb_func = xfs_trans_committed;
1529 tp->t_logcb.cb_arg = tp; 1647 tp->t_logcb.cb_arg = tp;
1530 1648
1531 /* 1649 /*
@@ -1637,7 +1755,6 @@ xfs_trans_commit_cil(
1637 int flags) 1755 int flags)
1638{ 1756{
1639 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1640 int error;
1641 1758
1642 /* 1759 /*
1643 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1648,9 +1765,7 @@ xfs_trans_commit_cil(
1648 if (!log_vector) 1765 if (!log_vector)
1649 return ENOMEM; 1766 return ENOMEM;
1650 1767
1651 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1652 if (error)
1653 return error;
1654 1769
1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1656 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..c2042b736b81 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
399 * transaction. */ 399 * transaction. */
400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */ 400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ 401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
402 xfs_trans_callback_t t_callback; /* transaction callback */
403 void *t_callarg; /* callback arg */
404 unsigned int t_flags; /* misc flags */ 402 unsigned int t_flags; /* misc flags */
405 int64_t t_icount_delta; /* superblock icount change */ 403 int64_t t_icount_delta; /* superblock icount change */
406 int64_t t_ifree_delta; /* superblock ifree change */ 404 int64_t t_ifree_delta; /* superblock ifree change */
@@ -473,6 +471,7 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
475 xfs_ino_t , uint, uint, struct xfs_inode **); 473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 477void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..c5bbbc45db91 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
 454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
 456 * it to its new position by removing it and re-adding it. 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function takes the AIL lock once to execute the update operations on
 459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * all the items in the array, instead of taking the AIL lock once per item
 460 * log manager. 463 * updated. As a result, once we have the AIL lock, we need to check each log
464 * item LSN to confirm it needs to be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
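The delete-then-splice trick described in the comment above - unlink every item onto a temporary list, then insert the whole list at a single position found by one backwards scan - can be sketched with a self-contained circular doubly linked list (names illustrative; the kernel uses the list.h helpers for this):

#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_head(struct list_head *n, struct list_head *at)
{
	n->next = at->next;
	n->prev = at;
	at->next->prev = n;
	at->next = n;
}

static void list_del_entry(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* Splice list "from" in right after "at", then reset "from". */
static void list_splice_after(struct list_head *from, struct list_head *at)
{
	struct list_head *first = from->next, *last = from->prev;

	if (first == from)
		return;			/* nothing to splice */
	first->prev = at;
	last->next = at->next;
	at->next->prev = last;
	at->next = first;
	list_init(from);
}

struct item {
	struct list_head ail;
	long lsn;
};

/* Reposition a batch: unlink each item onto a temporary list, then find
 * the splice point once by scanning backwards, as xfs_ail_splice() does. */
static void move_items(struct list_head *ail, struct item **items, int n,
		       long lsn)
{
	struct list_head tmp, *pos;
	int i;

	list_init(&tmp);
	for (i = 0; i < n; i++) {
		list_del_entry(&items[i]->ail);
		items[i]->lsn = lsn;
		list_add_head(&items[i]->ail, &tmp);
	}

	for (pos = ail->prev; pos != ail; pos = pos->prev) {
		struct item *it = (struct item *)((char *)pos -
					offsetof(struct item, ail));
		if (it->lsn <= lsn)
			break;
	}
	list_splice_after(&tmp, pos);
}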
506 524
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
 512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * be removed from the AIL. The caller is already holding the AIL lock, and has
 513 * log manager. 530 * done all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
 626 * Insert the given log item into the AIL. 649 * Splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
 662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..c47918c302a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
336 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
337 XFS_BUF_READ(bp); 337 XFS_BUF_READ(bp);
338 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
339 error = xfs_iowait(bp); 339 error = xfs_buf_iowait(bp);
340 if (error) { 340 if (error) {
341 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_ioerror_alert("xfs_trans_read_buf", mp,
342 bp, blkno); 342 bp, blkno);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
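The change above replaces a lock-protected "read index, use it, increment" sequence with atomic_inc_return() - 1. With C11 atomics the subtraction disappears, because atomic_fetch_add returns the pre-increment value. A hedged user-space sketch of the same slot-claiming idea (hypothetical names):

#include <stdatomic.h>
#include <stdio.h>

#define NEXTENTS 16

static atomic_int next_extent;
static int extents[NEXTENTS];

/* Claim the next free slot; safe against concurrent callers. */
static int claim_slot(void)
{
	/* fetch_add returns the old value, so it is already a 0-based
	 * index; the kernel's atomic_inc_return() returns the new value,
	 * hence the "- 1" in the patch above. */
	return atomic_fetch_add(&next_extent, 1);
}

int main(void)
{
	int idx = claim_slot();

	if (idx < NEXTENTS)
		extents[idx] = 42;
	printf("claimed slot %d\n", idx);
	return 0;
}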
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..ccb34532768b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -118,6 +118,36 @@ xfs_trans_ijoin_ref(
118} 118}
119 119
120/* 120/*
121 * Transactional inode timestamp update. Requires the inode to be locked and
122 * joined to the transaction supplied. Relies on the transaction subsystem to
123 * track dirty state and update/writeback the inode accordingly.
124 */
125void
126xfs_trans_ichgtime(
127 struct xfs_trans *tp,
128 struct xfs_inode *ip,
129 int flags)
130{
131 struct inode *inode = VFS_I(ip);
132 timespec_t tv;
133
134 ASSERT(tp);
135 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
136 ASSERT(ip->i_transp == tp);
137
138 tv = current_fs_time(inode->i_sb);
139
140 if ((flags & XFS_ICHGTIME_MOD) &&
141 !timespec_equal(&inode->i_mtime, &tv)) {
142 inode->i_mtime = tv;
143 }
144 if ((flags & XFS_ICHGTIME_CHG) &&
145 !timespec_equal(&inode->i_ctime, &tv)) {
146 inode->i_ctime = tv;
147 }
148}
149
150/*
121 * This is called to mark the fields indicated in fieldmask as needing 151 * This is called to mark the fields indicated in fieldmask as needing
122 * to be logged when the transaction is committed. The inode must 152 * to be logged when the transaction is committed. The inode must
123 * already be associated with the given transaction. 153 * already be associated with the given transaction.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..35162c238fa3 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..26d1867d8156 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */ 76typedef __uint32_t xlog_tid_t; /* transaction ID type */
79 77
80/* 78/*
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*credp,
 	prid_t		prid,		/* project id */
 	int		okalloc,	/* ok to allocate new space */
 	xfs_inode_t	**ipp,		/* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
 	 * transaction commit so that no other process can steal
 	 * the inode(s) that we've just allocated.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 			  &ialloc_context, &call_again, &ip);
 
 	/*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
 	 * other allocations in this allocation group,
 	 * this call should always succeed.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 			  okalloc, &ialloc_context, &call_again, &ip);
 
 	/*
@@ -235,7 +234,7 @@ xfs_droplink(
 {
 	int	error;
 
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT (ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink--;
@@ -299,7 +298,7 @@ xfs_bumplink(
 {
 	if (ip->i_d.di_nlink >= XFS_MAXLINK)
 		return XFS_ERROR(EMLINK);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT(ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink++;
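Throughout this series, xfs_ichgtime(ip, ...) becomes xfs_trans_ichgtime(tp, ip, ...): the timestamp update now takes the transaction handle, so the change is logged with, and commits or aborts with, the surrounding transaction. A toy sketch of the calling convention only; the structures are stand-ins, not the kernel's.

/*
 * Sketch: threading a transaction handle through a timestamp update
 * so the change is recorded against the transaction.
 */
#include <stdio.h>
#include <time.h>

struct toy_inode { time_t ctime; };
struct toy_trans { int logged_items; };

static void trans_ichgtime(struct toy_trans *tp, struct toy_inode *ip)
{
	ip->ctime = time(NULL);
	tp->logged_items++;	/* the update now lives or dies with tp */
}

int main(void)
{
	struct toy_trans tp = { 0 };
	struct toy_inode ip = { 0 };

	trans_ichgtime(&tp, &ip);
	printf("logged items: %d\n", tp.logged_items);
	return 0;
}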
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
 #define	__XFS_UTILS_H__
 
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-				xfs_dev_t, cred_t *, prid_t, int,
-				xfs_inode_t **, int *);
+				xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
 extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
 extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
 extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..d8e6f8cd6f0c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
 	 */
 	ASSERT(udqp == NULL);
 	ASSERT(gdqp == NULL);
-	code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+	code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 				  qflags, &udqp, &gdqp);
 	if (code)
 		return code;
@@ -184,8 +184,11 @@ xfs_setattr(
 	    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		lock_flags &= ~XFS_ILOCK_EXCL;
-		if (mask & ATTR_CTIME)
-			xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		if (mask & ATTR_CTIME) {
+			inode->i_mtime = inode->i_ctime =
+					current_fs_time(inode->i_sb);
+			xfs_mark_inode_dirty_sync(ip);
+		}
 		code = 0;
 		goto error_return;
 	}
@@ -961,29 +964,48 @@ xfs_release(
 		xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 	}
 
-	if (ip->i_d.di_nlink != 0) {
-		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-		       ip->i_delayed_blks > 0)) &&
-		     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
-		    (!(ip->i_d.di_flags &
-				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-
-			/*
-			 * If we can't get the iolock just skip truncating
-			 * the blocks past EOF because we could deadlock
-			 * with the mmap_sem otherwise. We'll get another
-			 * chance to drop them once the last reference to
-			 * the inode is dropped, so we'll never leak blocks
-			 * permanently.
-			 */
-			error = xfs_free_eofblocks(mp, ip,
-						XFS_FREE_EOF_TRYLOCK);
-			if (error)
-				return error;
-		}
-	}
+	if (ip->i_d.di_nlink == 0)
+		return 0;
 
+	if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+	       ip->i_delayed_blks > 0)) &&
+	     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
+	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+
+		/*
+		 * If we can't get the iolock just skip truncating the blocks
+		 * past EOF because we could deadlock with the mmap_sem
+		 * otherwise. We'll get another chance to drop them once the
+		 * last reference to the inode is dropped, so we'll never leak
+		 * blocks permanently.
+		 *
+		 * Further, if the inode is being opened, written and
+		 * closed frequently and we have delayed allocation blocks
+		 * outstanding (e.g. streaming writes from the NFS server),
+		 * truncating the blocks past EOF will cause fragmentation to
+		 * occur.
+		 *
+		 * In this case don't do the truncation, either, but we have to
+		 * be careful how we detect this case. Blocks beyond EOF show
+		 * up as i_delayed_blks even when the inode is clean, so we
+		 * need to truncate them away first before checking for a dirty
+		 * release. Hence on the first dirty close we will still remove
+		 * the speculative allocation, but after that we will leave it
+		 * in place.
+		 */
+		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+			return 0;
+
+		error = xfs_free_eofblocks(mp, ip,
+					   XFS_FREE_EOF_TRYLOCK);
+		if (error)
+			return error;
 
+		/* delalloc blocks after truncation means it really is dirty */
+		if (ip->i_delayed_blks)
+			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+	}
 	return 0;
 }
 
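The rewritten xfs_release() comment above spells out the policy: speculative EOF blocks are trimmed on the first dirty close, and if delayed-allocation blocks remain afterwards the inode is flagged XFS_IDIRTY_RELEASE so later closes leave the preallocation alone, avoiding fragmentation for open-write-close workloads such as NFS streaming writes. Below is a userspace sketch of that first-dirty-close state machine, with toy fields in place of the real inode.

/*
 * Sketch: trim EOF blocks once; if delalloc remains after trimming,
 * flag the inode so later releases keep the preallocation.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	long delayed_blks;	/* outstanding delalloc blocks */
	bool dirty_release;	/* models XFS_IDIRTY_RELEASE */
};

static void free_eofblocks(struct toy_inode *ip)
{
	/*
	 * Trim blocks past EOF.  The toy leaves delayed_blks nonzero to
	 * model a streaming writer that redirties the file right away.
	 */
	printf("truncating blocks past EOF\n");
}

static void release(struct toy_inode *ip)
{
	if (ip->dirty_release)
		return;			/* leave preallocation in place */
	free_eofblocks(ip);
	if (ip->delayed_blks)		/* still dirty after trimming */
		ip->dirty_release = true;
}

int main(void)
{
	struct toy_inode ip = { .delayed_blks = 8 };
	release(&ip);	/* first close: truncates, then sets the flag */
	release(&ip);	/* later closes: no truncation, no fragmentation */
	return 0;
}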
@@ -1253,8 +1275,7 @@ xfs_create(
 	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
@@ -1266,7 +1287,7 @@ xfs_create(
 	boolean_t		unlock_dp_on_error = B_FALSE;
 	uint			cancel_flags;
 	int			committed;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
@@ -1279,9 +1300,9 @@ xfs_create(
 		return XFS_ERROR(EIO);
 
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1360,7 +1381,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
@@ -1391,7 +1412,7 @@ xfs_create(
 		ASSERT(error != ENOSPC);
 		goto out_trans_abort;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	if (is_dir) {
@@ -1742,7 +1763,7 @@ xfs_remove(
 		ASSERT(error != ENOENT);
 		goto out_bmap_cancel;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	if (is_dir) {
 		/*
@@ -1880,7 +1901,7 @@ xfs_link(
 	 * the tree quota mechanism could be circumvented.
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -1895,7 +1916,7 @@ xfs_link(
 				   &first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
-	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
@@ -1933,8 +1954,7 @@ xfs_symlink(
 	struct xfs_name		*link_name,
 	const char		*target_path,
 	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
@@ -1955,7 +1975,7 @@ xfs_symlink(
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
 
@@ -1978,9 +1998,9 @@ xfs_symlink(
 
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -2046,8 +2066,8 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
-			       1, 0, credp, prid, resblks > 0, &ip, NULL);
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
 	if (error) {
 		if (error == ENOSPC)
 			goto error_return;
@@ -2129,7 +2149,7 @@ xfs_symlink(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	/*
@@ -2272,7 +2292,7 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -2431,9 +2451,9 @@ xfs_zero_remaining_bytes(
 	if (endoff > ip->i_size)
 		endoff = ip->i_size;
 
-	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
-				XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp);
+	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp,
+				  mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 
@@ -2459,7 +2479,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2472,7 +2492,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2711,6 +2731,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
+	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2753,12 +2774,17 @@ xfs_change_file_space(
 	 * size to be changed.
 	 */
 	setprealloc = clrprealloc = 0;
+	prealloc_type = XFS_BMAPI_PREALLOC;
 
 	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		prealloc_type |= XFS_BMAPI_CONVERT;
+		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						1, attr_flags);
+						prealloc_type, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -2827,7 +2853,7 @@ xfs_change_file_space(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
 	if (setprealloc)
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
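The XFS_IOC_ZERO_RANGE case added above ORs an extra conversion flag into prealloc_type and tosses the cached pages, then deliberately falls through into the shared RESVSP allocation path, which now receives prealloc_type instead of a hard-coded 1. A compact sketch of that flag-accumulating fallthrough; the command and flag values are made up for illustration.

/*
 * Sketch: ZERO_RANGE adds a conversion flag, then shares the RESVSP
 * allocation path via an intentional switch fallthrough.
 */
#include <stdio.h>

enum { CMD_RESVSP, CMD_ZERO_RANGE };
#define FLAG_PREALLOC	0x1
#define FLAG_CONVERT	0x2

static void alloc_file_space(int flags)
{
	printf("allocating with flags 0x%x\n", flags);
}

static void change_file_space(int cmd)
{
	int prealloc_type = FLAG_PREALLOC;

	switch (cmd) {
	case CMD_ZERO_RANGE:
		prealloc_type |= FLAG_CONVERT;	/* also convert existing extents */
		/* FALLTHRU */
	case CMD_RESVSP:
		alloc_file_space(prealloc_type);
		break;
	}
}

int main(void)
{
	change_file_space(CMD_RESVSP);	   /* flags 0x1 */
	change_file_space(CMD_ZERO_RANGE); /* flags 0x3 */
	return 0;
}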
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..f6702927eee4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
 		xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		cred_t *credp);
+		const char *target_path, mode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);